In [1]:
import pandas as pd 
import numpy as np


In [2]:
# Both raw CSV exports live in the same folder; name it once.
DATA_DIR = 'Data'
products = pd.read_csv(f'{DATA_DIR}/amazon_products.csv')
categories = pd.read_csv(f'{DATA_DIR}/amazon_categories.csv')

In [3]:
products.head()

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,0.0,104,False,2000
1,B07GDLCQXV,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,https://www.amazon.com/dp/B07GDLCQXV,4.5,0,169.99,209.99,104,False,1000
2,B07XSCCZYG,Platinum Elite Softside Expandable Checked Lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,https://www.amazon.com/dp/B07XSCCZYG,4.6,0,365.49,429.99,104,False,300
3,B08MVFKGJM,Freeform Hardside Expandable with Double Spinn...,https://m.media-amazon.com/images/I/91k6NYLQyI...,https://www.amazon.com/dp/B08MVFKGJM,4.6,0,291.59,354.37,104,False,400
4,B01DJLKZBA,Winfield 2 Hardside Expandable Luggage with Sp...,https://m.media-amazon.com/images/I/61NJoaZcP9...,https://www.amazon.com/dp/B01DJLKZBA,4.5,0,174.99,309.99,104,False,400


In [4]:
# Drop identifier/link columns that are not used later in this notebook.
# `columns=` already selects the axis, so `axis=1` was redundant; `errors='ignore'`
# keeps the cell re-runnable once the columns are gone.
products = products.drop(columns=['productURL', 'asin'], errors='ignore')

In [5]:
products.head()

Unnamed: 0,title,imgUrl,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,4.5,0,139.99,0.0,104,False,2000
1,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,4.5,0,169.99,209.99,104,False,1000
2,Platinum Elite Softside Expandable Checked Lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,4.6,0,365.49,429.99,104,False,300
3,Freeform Hardside Expandable with Double Spinn...,https://m.media-amazon.com/images/I/91k6NYLQyI...,4.6,0,291.59,354.37,104,False,400
4,Winfield 2 Hardside Expandable Luggage with Sp...,https://m.media-amazon.com/images/I/61NJoaZcP9...,4.5,0,174.99,309.99,104,False,400


In [6]:
products.shape

(1426337, 9)

In [7]:
categories.head()

Unnamed: 0,id,category_name
0,1,Beading & Jewelry Making
1,2,Fabric Decorating
2,3,Knitting & Crochet Supplies
3,4,Printmaking Supplies
4,5,Scrapbooking & Stamping Supplies


In [8]:
# Give the category key the same name it has in `products` so the two frames can be joined on it.
categories = categories.rename(columns={'id': 'category_id'})
categories.head()

Unnamed: 0,category_id,category_name
0,1,Beading & Jewelry Making
1,2,Fabric Decorating
2,3,Knitting & Crochet Supplies
3,4,Printmaking Supplies
4,5,Scrapbooking & Stamping Supplies


In [9]:
categories.shape

(248, 2)

In [10]:
# Attach category_name to every product; an inner join keeps only products
# whose category_id is present in `categories`.
merged_df = pd.merge(products, categories, how='inner', on='category_id')
merged_df.head()

Unnamed: 0,title,imgUrl,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth,category_name
0,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,4.5,0,139.99,0.0,104,False,2000,Suitcases
1,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,4.5,0,169.99,209.99,104,False,1000,Suitcases
2,Platinum Elite Softside Expandable Checked Lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,4.6,0,365.49,429.99,104,False,300,Suitcases
3,Freeform Hardside Expandable with Double Spinn...,https://m.media-amazon.com/images/I/91k6NYLQyI...,4.6,0,291.59,354.37,104,False,400,Suitcases
4,Winfield 2 Hardside Expandable Luggage with Sp...,https://m.media-amazon.com/images/I/61NJoaZcP9...,4.5,0,174.99,309.99,104,False,400,Suitcases


STRATIFIED SAMPLING

In [11]:
from sklearn.model_selection import train_test_split

# Take the stratification labels from the same frame that is being split, so the
# labels are guaranteed to be row-aligned with the data (previously they came
# from `products`, which only matches if the merge kept every row in order).
# NOTE: the next cell reassigns `sampled_df`, so this draw is superseded there.
sampled_df, _ = train_test_split(
    merged_df,
    stratify=merged_df['category_id'],
    train_size=50000,
    random_state=1,
)


In [12]:
# Cap on the rows drawn from any single category. Despite the name this is an
# upper bound: categories with fewer rows contribute all of their rows.
min_samples_per_category = 500

# Ceiling on the size of the combined sample.
max_total_samples = 100000

# Draw up to the cap from each category (fixed seed for reproducibility).
sampled_dfs = [
    group.sample(n=min(len(group), min_samples_per_category), random_state=1)
    for _, group in merged_df.groupby('category_id')
]

# Combine sampled data
sampled_df = pd.concat(sampled_dfs)

# If the combined sample exceeds the ceiling, trim it with a uniform random draw.
# This second draw is not per-category, so category counts end up below the cap.
if len(sampled_df) > max_total_samples:
    sampled_df = sampled_df.sample(n=max_total_samples, random_state=1)


In [13]:
print(sampled_df['category_id'].value_counts())

category_id
222    446
154    446
176    445
172    445
42     445
      ... 
193     61
186     46
185     37
102     32
194     18
Name: count, Length: 248, dtype: int64


In [14]:
sampled_df.head()

Unnamed: 0,title,imgUrl,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth,category_name
1088710,"RV Designer H305, Drawer Slide Socket Set, C -...",https://m.media-amazon.com/images/I/61lbNJfAGn...,4.6,937,9.99,0.0,211,False,100,Hardware
1173796,Drawn To Life: Spongebob Squarepants,https://m.media-amazon.com/images/I/91mS3Uzm3u...,4.4,0,0.0,0.0,242,False,0,"Nintendo DS Games, Consoles & Accessories"
1243483,"Premium Brass 24.8"" Winterize Sprinkler System...",https://m.media-amazon.com/images/I/61DNnTT1Uv...,4.7,0,19.99,0.0,27,False,0,RV Parts & Accessories
1313090,"7.9"" LCD Screen Replacement for iPad Mini 3 A1...",https://m.media-amazon.com/images/I/61l3RW-HTx...,0.0,0,59.9,0.0,58,False,0,Tablet Replacement Parts
1015776,"Listerine Total Care Anticavity Mouthwash, Fre...",https://m.media-amazon.com/images/I/712Gsk3WPv...,5.0,0,0.0,0.0,45,False,0,Beauty & Personal Care


In [15]:
sampled_df.shape

(100000, 10)

In [16]:
sampled_df.columns

Index(['title', 'imgUrl', 'stars', 'reviews', 'price', 'listPrice',
       'category_id', 'isBestSeller', 'boughtInLastMonth', 'category_name'],
      dtype='object')

In [17]:
# Expose the row label carried over from `merged_df` as an explicit 'index' column;
# later cells use it to name the downloaded image files.
# The guard keeps the cell re-runnable: a second reset_index would fail because
# an 'index' column already exists. (The former rename of 'index' to 'index' was a no-op.)
if 'index' not in sampled_df.columns:
    sampled_df = sampled_df.reset_index(drop=False)
sampled_df.head()

Unnamed: 0,index,title,imgUrl,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth,category_name
0,1088710,"RV Designer H305, Drawer Slide Socket Set, C -...",https://m.media-amazon.com/images/I/61lbNJfAGn...,4.6,937,9.99,0.0,211,False,100,Hardware
1,1173796,Drawn To Life: Spongebob Squarepants,https://m.media-amazon.com/images/I/91mS3Uzm3u...,4.4,0,0.0,0.0,242,False,0,"Nintendo DS Games, Consoles & Accessories"
2,1243483,"Premium Brass 24.8"" Winterize Sprinkler System...",https://m.media-amazon.com/images/I/61DNnTT1Uv...,4.7,0,19.99,0.0,27,False,0,RV Parts & Accessories
3,1313090,"7.9"" LCD Screen Replacement for iPad Mini 3 A1...",https://m.media-amazon.com/images/I/61l3RW-HTx...,0.0,0,59.9,0.0,58,False,0,Tablet Replacement Parts
4,1015776,"Listerine Total Care Anticavity Mouthwash, Fre...",https://m.media-amazon.com/images/I/712Gsk3WPv...,5.0,0,0.0,0.0,45,False,0,Beauty & Personal Care


Rather than training on the full 1.4-million-row dataset, I'll work with a smaller, more manageable sample.

In [None]:
import os
from tqdm import tqdm
import requests

# Folder that receives the downloaded product images (one .jpg per sampled row).
image_dir = 'Product_images_sampled'

# exist_ok=True makes this a no-op when the folder already exists, so the cell can be re-run.
os.makedirs(image_dir, exist_ok=True)

def download_image(url, save_path):
    """Download one image and store it at ``save_path``.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    save_path : str
        Destination file path; it is only created once the full body has
        been received and written.

    Returns
    -------
    bool
        True if the image was saved, False on any failure (non-200 status,
        empty body, network or file error). Failures are reported with
        ``print`` and never raised, so a bulk download loop keeps going.
    """
    try:
        # The body is read in one go via `.content`, so `stream=True` brought no
        # benefit and left the connection unreleased on non-200 responses.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Error downloading image from {url}: HTTP {response.status_code}")
            return False

        payload = response.content
        if not payload:
            # A zero-byte file would later pass an os.path.exists() check.
            print(f"Error downloading image from {url}: empty response body")
            return False

        # Write to a temporary name and rename afterwards, so an interrupted
        # write never leaves a truncated file under the final name.
        tmp_path = f"{save_path}.part"
        with open(tmp_path, 'wb') as file:
            file.write(payload)
        os.replace(tmp_path, save_path)
        return True
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")
        return False

# Fetch the image of every sampled product, naming each file after the row's 'index' value.
image_jobs = zip(sampled_df['index'], sampled_df['imgUrl'])
for img_id, url in tqdm(image_jobs, total=len(sampled_df)):
    save_path = os.path.join(image_dir, f"{img_id}.jpg")
    if os.path.exists(save_path):
        # Already on disk from an earlier run; avoid re-downloading.
        continue
    download_image(url, save_path)

In [None]:
image_folder = "Product_images_sampled"

# Expected location of each row's image: <image_folder>/<index>.jpg
sampled_df['image_path'] = [
    os.path.join(image_folder, f"{idx}.jpg") for idx in sampled_df['index']
]

# Rows whose expected file is not on disk
on_disk = sampled_df['image_path'].map(os.path.exists)
missing_images_df = sampled_df[~on_disk]

print(f"Number of missing images: {len(missing_images_df)}")

In [None]:
def retry_download_image(img_url, save_path, retries=3, timeout=10, backoff=1.0):
    """Download an image, retrying on transient failures.

    Parameters
    ----------
    img_url : str
        Address of the image to fetch.
    save_path : str
        Destination file path; only created after a complete, successful download.
    retries : int
        Maximum number of attempts. Zero or less means no attempt is made.
    timeout : float
        Per-request timeout in seconds, passed to ``requests.get``.
    backoff : float
        Base pause in seconds between attempts; it doubles after each failed
        attempt. Use 0 to retry immediately.

    Returns
    -------
    bool
        True once the image has been saved, False if every attempt failed.
        Failures are printed, never raised.
    """
    import time

    for attempt in range(retries):
        try:
            response = requests.get(img_url, timeout=timeout)
            response.raise_for_status()
            # Write under a temporary name, then rename, so a failed write
            # cannot leave a truncated file that later looks like a success.
            tmp_path = f"{save_path}.part"
            with open(tmp_path, 'wb') as f:
                f.write(response.content)
            os.replace(tmp_path, save_path)
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for {img_url}: {e}")
            # HTTP errors raised by raise_for_status() carry the response; a 4xx
            # status (other than timeout / rate limit) will not change on retry.
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            if status is not None and 400 <= status < 500 and status not in (408, 429):
                break
            # Pause before the next attempt instead of hitting the server again at once.
            if attempt + 1 < retries and backoff > 0:
                time.sleep(backoff * (2 ** attempt))
    return False

# Second pass: try again for every row whose image file was not found on disk.
retry_jobs = zip(
    missing_images_df['index'],
    missing_images_df['imgUrl'],
    missing_images_df['image_path'],
)
for row_id, img_url, save_path in tqdm(retry_jobs, total=len(missing_images_df)):
    if retry_download_image(img_url, save_path):
        print(f"Successfully downloaded image for index {row_id}.")
    else:
        print(f"Failed to download image for index {row_id} after retries.")

In [None]:
# Of the rows that were missing before the retry pass, keep those whose file is still absent.
file_present = missing_images_df['image_path'].apply(os.path.exists)
remaining_missing_images_df = missing_images_df[~file_present]

if remaining_missing_images_df.empty:
    print("All missing images have been successfully downloaded.")
else:
    print(f"{len(remaining_missing_images_df)} images are still missing after retries.")


In [None]:
# Keep only the rows whose image file is actually on disk.
# Filtering on the file check itself, rather than on index labels captured in an
# earlier cell, keeps this cell correct when re-run: after reset_index(drop=True)
# the old labels no longer point at the same rows.
has_image = sampled_df['image_path'].map(os.path.exists).astype(bool)
sampled_df = sampled_df[has_image].reset_index(drop=True)

print(f"Dataset shape after removing rows with missing images: {sampled_df.shape}")


In [None]:
# Persist the cleaned sample; index=False keeps the positional row index out of the file.
output_csv = "larger_cleaned_sampled_df.csv"
sampled_df.to_csv(output_csv, index=False)
print(f"Cleaned dataset saved as '{output_csv}'.")