In [1]:
import pandas as pd
import numpy as np


In [2]:

# Load the dataset
df = pd.read_csv("amazon.csv")


In [16]:
df.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

In [17]:
df.head()

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,399.0,1099.0,64.0,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AG3D6O4STAQKAY2UVGEUV46KN35Q,"Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
1,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,399.0,1099.0,64.0,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AHMY5CWJMMK5BJRBBSNLYT3ONILA,"Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
2,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,399.0,1099.0,64.0,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AHCTC6ULH4XB6YHDY6PCH2R772LQ,"Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
3,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,399.0,1099.0,64.0,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AGYHHIERNXKA6P5T7CZLXKVPT7IQ,"Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
4,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,399.0,1099.0,64.0,4.2,24269.0,High Compatibility : Compatible With iPhone 12...,AG4OGOFWXJZTQ2HKYIOCOY3KXF2Q,"Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...


In [3]:

# ========== 1. Clean price, discount and rating columns ==========
def clean_price(value):
    return float(value.replace('₹', '').replace(',', '').strip()) if isinstance(value, str) else np.nan

def clean_discount(value):
    return float(value.replace('%', '').strip()) if isinstance(value, str) else np.nan

def clean_rating_count(value):
    return int(value.replace(',', '').strip()) if isinstance(value, str) and value.replace(',', '').isdigit() else np.nan


In [4]:

# Apply cleaning functions
df['discounted_price'] = df['discounted_price'].apply(clean_price)
df['actual_price'] = df['actual_price'].apply(clean_price)
df['discount_percentage'] = df['discount_percentage'].apply(clean_discount)
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df['rating_count'] = df['rating_count'].apply(clean_rating_count)


In [5]:

# ========== 2. Explode user_id (and other review fields if needed) ==========
# Convert comma-separated user_ids to list
df['user_id'] = df['user_id'].apply(lambda x: str(x).split(','))


In [6]:

# Explode the user_id column (each row becomes one user-product pair)
df = df.explode('user_id')


In [7]:

# Remove whitespaces
df['user_id'] = df['user_id'].str.strip()



In [8]:
# ========== 3. Drop nulls and duplicates ==========
df.dropna(subset=['user_id', 'product_id', 'rating'], inplace=True)
df.drop_duplicates(subset=['user_id', 'product_id'], inplace=True)


In [9]:

# Preview cleaned dataset
print(df[['user_id', 'product_id', 'rating']].head())




                        user_id  product_id  rating
0  AG3D6O4STAQKAY2UVGEUV46KN35Q  B07JW9H4J1     4.2
0  AHMY5CWJMMK5BJRBBSNLYT3ONILA  B07JW9H4J1     4.2
0  AHCTC6ULH4XB6YHDY6PCH2R772LQ  B07JW9H4J1     4.2
0  AGYHHIERNXKA6P5T7CZLXKVPT7IQ  B07JW9H4J1     4.2
0  AG4OGOFWXJZTQ2HKYIOCOY3KXF2Q  B07JW9H4J1     4.2


In [10]:
# (Optional) Save cleaned dataset
df.to_csv("cleaned_amazon_data.csv", index=False)

In [11]:
# Load the cleaned dataset
df = pd.read_csv("cleaned_amazon_data.csv")

# Safe version without warning
interaction_df = df[['user_id', 'product_id', 'rating']].copy()

# Now it's safe to modify
interaction_df.drop_duplicates(subset=['user_id', 'product_id'], inplace=True)
interaction_df.reset_index(drop=True, inplace=True)

# Preview the interaction matrix
print(interaction_df.head())

# (Optional) Save for next step
interaction_df.to_csv("user_product_interaction.csv", index=False)

                        user_id  product_id  rating
0  AG3D6O4STAQKAY2UVGEUV46KN35Q  B07JW9H4J1     4.2
1  AHMY5CWJMMK5BJRBBSNLYT3ONILA  B07JW9H4J1     4.2
2  AHCTC6ULH4XB6YHDY6PCH2R772LQ  B07JW9H4J1     4.2
3  AGYHHIERNXKA6P5T7CZLXKVPT7IQ  B07JW9H4J1     4.2
4  AG4OGOFWXJZTQ2HKYIOCOY3KXF2Q  B07JW9H4J1     4.2


In [12]:
# STEP 1: Group by product_id to get unique products
grouped_df = df.groupby('product_id').agg({
    'product_name': 'first',
    'about_product': lambda x: ' '.join(x.dropna().astype(str)),
    'category': 'first',
    'review_content': lambda x: ' '.join(x.dropna().astype(str))
}).reset_index()

# STEP 2: Create combined feature
grouped_df['combined_features'] = grouped_df['about_product'] + ' ' + \
                                  grouped_df['category'] + ' ' + \
                                  grouped_df['review_content']

# STEP 3: TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(grouped_df['combined_features'])

# STEP 4: Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# STEP 5: Product ID to Index Mapping
product_indices = pd.Series(grouped_df.index, index=grouped_df['product_id'])

# STEP 6: Recommendation Function
def recommend_similar_products(product_id, top_n=5):
    if product_id not in product_indices:
        return f"❌ Product ID '{product_id}' not found."

    idx = int(product_indices[product_id])
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    product_indices_top = [i[0] for i in sim_scores]

    return grouped_df.iloc[product_indices_top][['product_id', 'product_name']]


In [13]:
print(recommend_similar_products("B07JW9H4J1", top_n=5))


     product_id                                       product_name
339  B07JH1CBGW  Wayona Nylon Braided Usb Syncing And Charging ...
345  B07JW1Y6XV  Wayona Nylon Braided 3A Lightning to USB A Syn...
338  B07JH1C41D  Wayona Nylon Braided (2 Pack) Lightning Fast U...
369  B07LGT55SJ  Wayona Usb Nylon Braided Data Sync And Chargin...
337  B07JGDB5M1  Wayona Nylon Braided 2M / 6Ft Fast Charge Usb ...


In [14]:
print(df['img_link'].dropna().unique()[:5])

['https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/51UsScvHQNL._SX300_SY300_QL70_FMwebp_.jpg'
 'https://m.media-amazon.com/images/W/WEBP_402378-T2/images/I/31zOsqQOAOL._SY445_SX342_QL70_FMwebp_.jpg'
 'https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/31IvNJZnmdL._SY445_SX342_QL70_FMwebp_.jpg'
 'https://m.media-amazon.com/images/I/41V5FtEWPkL._SX300_SY300_QL70_FMwebp_.jpg'
 'https://m.media-amazon.com/images/W/WEBP_402378-T2/images/I/31VzNhhqifL._SX300_SY300_QL70_FMwebp_.jpg']


In [15]:
df['product_id'][10]

'B098NS6PVG'