In [1]:
from google.colab import files

# Upload the Kaggle API key file
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [3]:
!kaggle datasets download -d lokeshparab/amazon-products-dataset

Dataset URL: https://www.kaggle.com/datasets/lokeshparab/amazon-products-dataset
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading amazon-products-dataset.zip to /content
 98% 78.0M/79.7M [00:00<00:00, 114MB/s]
100% 79.7M/79.7M [00:00<00:00, 105MB/s]


In [4]:
!unzip amazon-products-dataset.zip

Archive:  amazon-products-dataset.zip
  inflating: Air Conditioners.csv    
  inflating: All Appliances.csv      
  inflating: All Books.csv           
  inflating: All Car and Motorbike Products.csv  
  inflating: All Electronics.csv     
  inflating: All English.csv         
  inflating: All Exercise and Fitness.csv  
  inflating: All Grocery and Gourmet Foods.csv  
  inflating: All Hindi.csv           
  inflating: All Home and Kitchen.csv  
  inflating: All Movies and TV Shows.csv  
  inflating: All Music.csv           
  inflating: All Pet Supplies.csv    
  inflating: All Sports Fitness and Outdoors.csv  
  inflating: All Video Games.csv     
  inflating: Amazon Fashion.csv      
  inflating: Amazon Pharmacy.csv     
  inflating: Amazon-Products.csv     
  inflating: Baby Bath Skin and Grooming.csv  
  inflating: Baby Fashion.csv        
  inflating: Baby Products.csv       
  inflating: Backpacks.csv           
  inflating: Badminton.csv           
  inflating: Bags and Luggage.

In [5]:
import pandas as pd
import numpy as np
import requests
from io import BytesIO
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, GlobalAveragePooling2D, concatenate, Dropout
from sklearn.model_selection import train_test_split
import pickle
import time

In [6]:
def is_image_successful(url):
    try:
        response = requests.get(url, timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False


In [7]:
def get_top_20_successful_images(dataset):
    successful_images = []
    for index, row in dataset.iterrows():
        if len(successful_images) >= 20:
            break
        if is_image_successful(row['image']):
            successful_images.append(row)
    return pd.DataFrame(successful_images)

In [8]:
def load_and_filter_datasets(dataset_paths):
    combined_data = []
    for path in dataset_paths:
        dataset = pd.read_csv(path)
        filtered_data = get_top_20_successful_images(dataset)
        combined_data.append(filtered_data)
    return pd.concat(combined_data, ignore_index=True)

In [9]:
dataset_paths = [
    # Sports Equipment Datasets
    '/content/Football.csv',
    '/content/Badminton.csv',
    '/content/Cycling.csv',
    '/content/Cricket.csv',
    '/content/Yoga.csv',
    '/content/Strength Training.csv',
    '/content/Running.csv',
    '/content/Fitness Accessories.csv',
    '/content/Cardio Equipment.csv',
    '/content/Sports Shoes.csv',
    '/content/Sportswear.csv',
    '/content/Sports Collectibles.csv',

    # Electronics Datasets
    '/content/Air Conditioners.csv',
    '/content/Cameras.csv',
    '/content/Headphones.csv',
    '/content/Televisions.csv',
    '/content/Car Electronics.csv',
    '/content/Security Cameras.csv',
    '/content/Home Audio and Theater.csv',
    '/content/Personal Care Appliances.csv',
    '/content/Heating and Cooling Appliances.csv',
    '/content/Refrigerators.csv',
    '/content/Washing Machines.csv',

    # Fashion Datasets
    '/content/Mens Fashion.csv',
    '/content/Womens Fashion.csv',
    '/content/Kids Fashion.csv',
    '/content/Shoes.csv',
    '/content/Casual Shoes.csv',
    '/content/Formal Shoes.csv',
    '/content/Ethnic Wear.csv',
    '/content/Innerwear.csv',
    '/content/Ballerinas.csv',
    '/content/Fashion and Silver Jewellery.csv',
    '/content/Gold and Diamond Jewellery.csv',
    '/content/Handbags and Clutches.csv',
    '/content/Jeans.csv',
    '/content/Lingerie and Nightwear.csv',
    '/content/T-shirts and Polos.csv',
    '/content/Western Wear.csv',

    # Books Datasets
    '/content/All Books.csv',
    '/content/Fiction Books.csv',
    '/content/Childrens Books.csv',
    '/content/Exam Central.csv',
    '/content/School Textbooks.csv',
    '/content/Textbooks.csv',
    '/content/Kindle eBooks.csv',
    '/content/Indian Language Books.csv',
    '/content/All English.csv',
    '/content/All Hindi.csv',

    # Home and Kitchen Datasets
    '/content/All Home and Kitchen.csv',
    '/content/Kitchen and Dining.csv',
    '/content/Furniture.csv',
    '/content/Home Furnishing.csv',
    '/content/Home Storage.csv',
    '/content/Home Dcor.csv',
    '/content/Bedroom Linen.csv',
    '/content/Kitchen Storage and Containers.csv',
    '/content/Heating and Cooling Appliances.csv',
    '/content/Home Entertainment Systems.csv',
    '/content/Home Improvement.csv',
    '/content/Garden and Outdoors.csv',

    # Grocery Datasets
    '/content/All Grocery and Gourmet Foods.csv',
    '/content/Coffee Tea and Beverages.csv',
    '/content/Diet and Nutrition.csv',
    '/content/Household Supplies.csv',
    '/content/Snack Foods.csv',
    '/content/Pantry.csv',
    '/content/Value Bazaar.csv',

    # Pharmacy Datasets
    '/content/Amazon Pharmacy.csv',
    '/content/Health and Personal Care.csv',
    '/content/Diet and Nutrition.csv',

    # Baby Products Datasets
    '/content/Baby Bath Skin and Grooming.csv',
    '/content/Baby Fashion.csv',
    '/content/Baby Products.csv',
    '/content/Diapers.csv',
    '/content/Nursing and Feeding.csv',
    '/content/Strollers and Prams.csv',

    # Cars and Motorbikes Datasets
    '/content/All Car and Motorbike Products.csv',
    '/content/Car Accessories.csv',
    '/content/Car and Bike Care.csv',
    '/content/Car Parts.csv',
    '/content/Motorbike Accessories and Parts.csv',

    # Toys and Games Datasets
    '/content/All Video Games.csv',
    '/content/Toys and Games.csv',
    '/content/STEM Toys Store.csv',
    '/content/Toys Gifting Store.csv',
    '/content/Gaming Consoles.csv',
    '/content/PC Games.csv',
    '/content/Gaming Accessories.csv',
    '/content/Video Games Deals.csv',

    # Luggage Datasets
    '/content/Backpacks.csv',
    '/content/Bags and Luggage.csv',
    '/content/Handbags and Clutches.csv',
    '/content/Rucksacks.csv',
    '/content/School Bags.csv',
    '/content/Suitcases and Trolley Bags.csv',
    '/content/Travel Accessories.csv',
    '/content/Travel Duffles.csv',

    # Watches and Jewellery Datasets
    '/content/Watches.csv',
    '/content/Jewellery.csv',
    '/content/Fashion and Silver Jewellery.csv',
    '/content/Gold and Diamond Jewellery.csv',

    # Pet Supplies Datasets
    '/content/Dog supplies.csv',
    '/content/All Pet Supplies.csv',

    # Musical Instruments Datasets
    '/content/Musical Instruments and Professional Audio.csv',
    '/content/Indian Classical.csv',

    # Movies and TV Datasets
    '/content/All Movies and TV Shows.csv',
    '/content/Blu-ray.csv',

    # Collectibles Datasets
    '/content/Entertainment Collectibles.csv',
    '/content/Sports Collectibles.csv',

    # Outdoor and Adventure Datasets
    '/content/Camping and Hiking.csv',
    '/content/Garden and Outdoors.csv',

    # Health and Personal Care Datasets
    '/content/Health and Personal Care.csv',
    '/content/Personal Care Appliances.csv',

    # Kitchen Storage Datasets
    '/content/Kitchen Storage and Containers.csv',

    # Bedding Datasets
    '/content/Bedroom Linen.csv',
]

In [10]:
combined_data = load_and_filter_datasets(dataset_paths)

In [11]:
combined_data['name'] = combined_data['name'].astype(str)
combined_data['image'] = combined_data['image'].astype(str)

In [12]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(combined_data['name'].values)
sequences = tokenizer.texts_to_sequences(combined_data['name'].values)
max_len = 100
X_text = pad_sequences(sequences, maxlen=max_len)

In [13]:
def preprocess_image_from_url(image_url):
    try:
        response = requests.get(image_url)
        response.raise_for_status()
        if response.status_code == 502:
            raise ValueError("Bad Gateway")
        img = Image.open(BytesIO(response.content))
        img = img.resize((224, 224))
        img = np.array(img)
        img = preprocess_input(img)
        return img
    except requests.RequestException as e:
        print(f"Skipping image due to error: {e}")
        return None
    except Exception as e:
        print(f"Skipping image due to processing error: {e}")
        return None

In [14]:
gpus = tf.config.experimental.list_physical_devices('GPU')

In [15]:
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [16]:
tf.config.experimental.set_memory_growth(gpus[0],True)

In [17]:
def process_images_with_progress(urls):
    num_urls = len(urls)
    processed_images = []
    valid_urls = []
    start_time = time.time()

    for i, url in enumerate(urls):
        img = preprocess_image_from_url(url)
        if img is not None:
            processed_images.append(img)
            valid_urls.append(url)  # Keep track of valid image URLs

            # Print progress every 100 images
            if (i + 1) % 100 == 0:
                elapsed_time = time.time() - start_time
                processed_count = i + 1
                remaining_count = num_urls - processed_count
                estimated_time_remaining = (elapsed_time / processed_count) * remaining_count
                print(f"Processed {processed_count}/{num_urls} images ({(processed_count / num_urls) * 100:.2f}%). Estimated time remaining: {estimated_time_remaining / 60:.2f} minutes")

    return np.array(processed_images), valid_urls

In [18]:
X_images, valid_image_urls = process_images_with_progress(combined_data['image'].values)

Processed 100/1840 images (5.43%). Estimated time remaining: 1.91 minutes
Processed 200/1840 images (10.87%). Estimated time remaining: 2.34 minutes
Processed 300/1840 images (16.30%). Estimated time remaining: 2.71 minutes
Processed 400/1840 images (21.74%). Estimated time remaining: 2.70 minutes
Processed 500/1840 images (27.17%). Estimated time remaining: 2.44 minutes
Processed 600/1840 images (32.61%). Estimated time remaining: 2.09 minutes
Processed 700/1840 images (38.04%). Estimated time remaining: 1.75 minutes
Processed 800/1840 images (43.48%). Estimated time remaining: 1.50 minutes
Processed 900/1840 images (48.91%). Estimated time remaining: 1.27 minutes
Processed 1000/1840 images (54.35%). Estimated time remaining: 1.07 minutes
Processed 1100/1840 images (59.78%). Estimated time remaining: 0.90 minutes
Processed 1200/1840 images (65.22%). Estimated time remaining: 0.74 minutes
Processed 1300/1840 images (70.65%). Estimated time remaining: 0.61 minutes
Processed 1400/1840 im

In [19]:
valid_indices = combined_data['image'].isin(valid_image_urls)
combined_data = combined_data[valid_indices]

In [20]:

X_text = pad_sequences(tokenizer.texts_to_sequences(combined_data['name'].values), maxlen=max_len)
X_images, valid_image_urls = process_images_with_progress(combined_data['image'].values)


Processed 100/1840 images (5.43%). Estimated time remaining: 0.97 minutes
Processed 200/1840 images (10.87%). Estimated time remaining: 0.88 minutes
Processed 300/1840 images (16.30%). Estimated time remaining: 0.81 minutes
Processed 400/1840 images (21.74%). Estimated time remaining: 0.75 minutes
Processed 500/1840 images (27.17%). Estimated time remaining: 0.71 minutes
Processed 600/1840 images (32.61%). Estimated time remaining: 0.66 minutes
Processed 700/1840 images (38.04%). Estimated time remaining: 0.61 minutes
Processed 800/1840 images (43.48%). Estimated time remaining: 0.55 minutes
Processed 900/1840 images (48.91%). Estimated time remaining: 0.50 minutes
Processed 1000/1840 images (54.35%). Estimated time remaining: 0.45 minutes
Processed 1100/1840 images (59.78%). Estimated time remaining: 0.40 minutes
Processed 1200/1840 images (65.22%). Estimated time remaining: 0.35 minutes
Processed 1300/1840 images (70.65%). Estimated time remaining: 0.29 minutes
Processed 1400/1840 im

In [21]:
combined_data.head()

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price
0,Puma Unisex-Child Tacto Ii Fg/Ag Junior Footba...,sports & fitness,Football,https://m.media-amazon.com/images/I/51SgGEKmqx...,https://www.amazon.in/Puma-Unisex-Kid-Parisian...,4.0,14,"₹1,447","₹2,999"
1,Shrishti Creations Women's Ankle Length Velvet...,sports & fitness,Football,https://m.media-amazon.com/images/I/41SfdngTkF...,https://www.amazon.in/Winter-Thermal-Colour-Le...,4.1,704,₹299,₹999
2,PLUMBURY Women's No-show Net Socks (Pack of 5)...,sports & fitness,Football,https://m.media-amazon.com/images/I/41nv2qt+vH...,https://www.amazon.in/PLUMBURY-Womens-Girls-An...,3.6,110,₹499,₹799
3,Nivia 1021OR Blade Machine Stitched Football,sports & fitness,Football,https://m.media-amazon.com/images/I/71a0pxsDfm...,https://www.amazon.in/NIVIA-Machine-Stitched-F...,3.5,158,₹474,₹569
4,Amazon Brand - Symactive Men's Regular Track P...,sports & fitness,Football,https://m.media-amazon.com/images/I/71oZLTcPR4...,https://www.amazon.in/Amazon-Brand-Symbol-Trac...,3.9,524,₹509,"₹1,799"


In [22]:
combined_data['new_index'] = None
for i in range(len(combined_data)):
    combined_data.at[i, 'new_index'] = f'Index_{i+1}'
combined_data.set_index('new_index', inplace=True)

In [23]:
combined_data.head()

Unnamed: 0_level_0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Index_1,Puma Unisex-Child Tacto Ii Fg/Ag Junior Footba...,sports & fitness,Football,https://m.media-amazon.com/images/I/51SgGEKmqx...,https://www.amazon.in/Puma-Unisex-Kid-Parisian...,4.0,14,"₹1,447","₹2,999"
Index_2,Shrishti Creations Women's Ankle Length Velvet...,sports & fitness,Football,https://m.media-amazon.com/images/I/41SfdngTkF...,https://www.amazon.in/Winter-Thermal-Colour-Le...,4.1,704,₹299,₹999
Index_3,PLUMBURY Women's No-show Net Socks (Pack of 5)...,sports & fitness,Football,https://m.media-amazon.com/images/I/41nv2qt+vH...,https://www.amazon.in/PLUMBURY-Womens-Girls-An...,3.6,110,₹499,₹799
Index_4,Nivia 1021OR Blade Machine Stitched Football,sports & fitness,Football,https://m.media-amazon.com/images/I/71a0pxsDfm...,https://www.amazon.in/NIVIA-Machine-Stitched-F...,3.5,158,₹474,₹569
Index_5,Amazon Brand - Symactive Men's Regular Track P...,sports & fitness,Football,https://m.media-amazon.com/images/I/71oZLTcPR4...,https://www.amazon.in/Amazon-Brand-Symbol-Trac...,3.9,524,₹509,"₹1,799"


In [24]:
y = combined_data.index.values

In [25]:
X_train_images, X_test_images, X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_images, X_text, y, test_size=0.2, random_state=42)

In [26]:
image_input = Input(shape=(224, 224, 3))
base_model = ResNet50(weights='imagenet', include_top=False)(image_input)
x_image = GlobalAveragePooling2D()(base_model)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [27]:
text_input = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=10000, output_dim=128, input_length=max_len)(text_input)
x_text = LSTM(128)(embedding_layer)



In [28]:
combined = concatenate([x_image, x_text])
x = Dense(256, activation='relu')(combined)
x = Dropout(0.3)(x)
output = Dense(1, activation='sigmoid')(x)

In [29]:
model = Model(inputs=[image_input, text_input], outputs=output)

In [30]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [31]:
y_train = np.where(np.char.startswith(y_train.astype(str), 'Index_'), 1, 0)
y_test = np.where(np.char.startswith(y_test.astype(str), 'Index_'), 1, 0)
model.fit([X_train_images, X_train_text], y_train, validation_data=([X_test_images, X_test_text], y_test), epochs=10, batch_size=32)

Epoch 1/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 592ms/step - accuracy: 0.9675 - loss: 0.0497 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 523ms/step - accuracy: 1.0000 - loss: 1.7889e-08 - val_accuracy: 1.0000 - val_loss: 2.4613e-34
Epoch 3/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 555ms/step - accuracy: 1.0000 - loss: 2.8818e-08 - val_accuracy: 1.0000 - val_loss: 3.0457e-14
Epoch 4/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 540ms/step - accuracy: 1.0000 - loss: 1.4124e-08 - val_accuracy: 1.0000 - val_loss: 1.0652e-10
Epoch 5/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 540ms/step - accuracy: 1.0000 - loss: 3.3813e-08 - val_accuracy: 1.0000 - val_loss: 1.0007e-09
Epoch 6/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 539ms/step - accuracy: 1.0000 - loss: 1.4796e-08 - val_accuracy: 1.0000 -

<keras.src.callbacks.history.History at 0x7abf40078d60>

In [32]:
model.save('/content/saved_models/product_recommendation_model.h5')



In [34]:
import os
import pickle
os.makedirs('/content/models', exist_ok=True)

with open('/content/models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from google.colab import files

# Download the saved .h5 model file
files.download('/content/product_recommendation_model.h5')
