In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from PIL import Image
from IPython.display import display, Image
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Setup / Data-Reading

# Display tweaks for wide frames:
#   - display.max_columns=None -> show every column instead of eliding the middle ones with '...'
#   - display.width=1000       -> raise the line-width limit so wide rows are not wrapped onto new lines
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Load the raw CSV once and keep it untouched; all later work happens on the `amazon` copy.
file_path = 'amazon.csv'
df_original = pd.read_csv(file_path)
amazon = df_original.copy(deep=True)

In [None]:
# Checking which products have images
#
# Each 'img_link' is opened in a Selenium-driven Chrome session. A page containing
# the text 'Bad Request' is treated as "no image"; any other page counts as a valid
# image. The result is stored in a new boolean 'has_image' column and the frame is
# written to 'amazon_updated.csv'.

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

start_time = time.time()  # Start timing

total_entries = len(amazon)

# Collect one flag per row and assign the column once afterwards. This avoids
# per-row .loc writes and does not depend on the frame having a 0..n-1 index.
has_image_flags = []

try:
    for data_uri in amazon['img_link']:
        # A missing (NaN) or blank link cannot point to an image, and passing a
        # non-string to driver.get would fail, so mark it False without loading.
        if not isinstance(data_uri, str) or not data_uri.strip():
            has_image_flags.append(False)
            continue

        driver.get(data_uri)
        try:
            # Check for 'Bad Request' message in the page
            driver.find_element(By.XPATH, "//*[contains(text(), 'Bad Request')]")
            has_image_flags.append(False)
        except NoSuchElementException:
            # No 'Bad Request' found, so the link is considered valid
            has_image_flags.append(True)
finally:
    # Always close the browser, even if a page load raises part-way through,
    # so no Chrome/chromedriver process is left running.
    driver.quit()

amazon['has_image'] = has_image_flags

end_time = time.time()  # End timing
elapsed_time = end_time - start_time  # Calculate elapsed time
print(f"Time taken to check images: {elapsed_time:.2f} seconds")

# Guard against an empty dataset to avoid a division by zero.
valid_images_percentage_total = (
    (amazon['has_image'].sum() / total_entries) * 100 if total_entries else 0.0
)
print(f"Percentage of image links without 'Bad Request': {valid_images_percentage_total:.2f}%")

# Recorded from earlier runs:
# Time: 294.00 seconds, 314.13, 416.26
# Outcome: Roughly 59.52% of products have images. 59.59

amazon.to_csv('amazon_updated.csv', index=False)
