# Importing Data

In [None]:
import datasets
print(datasets.__version__)


2.17.0


In [None]:
!pip install -U "datasets==2.17.0"


Collecting datasets==2.17.0
  Downloading datasets-2.17.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.17.0)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.17.0)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow_hotfix-0.7-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstal

In [None]:
from datasets import load_dataset

# 1️⃣ Load metadata (Movies & TV)
meta_dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Movies_and_TV",
    split="full",  # must use "full"
    streaming=True,
    trust_remote_code=True
)

print("✅ Metadata sample:")
for i, example in enumerate(meta_dataset):
    print(example)
    if i >= 2:
        break

# 2️⃣ Load reviews (All Beauty)
reviews_dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    split="full",  # fixed here
    streaming=True,  # safer for large files
    trust_remote_code=True
)

print("\n✅ Reviews sample:")
for i, review in enumerate(reviews_dataset):
    print(review)
    if i >= 2:
        break


✅ Metadata sample:
{'main_category': 'Prime Video', 'title': 'Glee', 'average_rating': 4.7, 'rating_number': 2004, 'features': ['IMDb 6.8', '2013', '22 episodes', 'X-Ray', 'TV-14'], 'description': ['Entering its fourth season, this year the members of New Directions compete amongst themselves to be the "new Rachel" and hold auditions to find new students. Meanwhile, the graduating class leaves the comforts of McKinley where Rachel struggles to please her demanding NYADA teacher (Kate Hudson) and Kurt second-guesses his decision to stay in Lima. Four newcomers also join the musical comedy.'], 'price': '22.39', 'images': {'hi_res': [None], 'large': [None], 'thumb': [None], 'variant': ['MAIN']}, 'videos': {'title': [], 'url': [], 'user_id': []}, 'store': None, 'categories': ['Comedy', 'Drama', 'Arts, Entertainment, and Culture', 'Music Videos and Concerts'], 'details': '{"Content advisory": ["Violence", "substance use", "alcohol use", "smoking", "foul language", "sexual content"], "Audio 

In [None]:
import pandas as pd

# Load a few samples into lists first (since these are streaming datasets)
review_samples = []
meta_samples = []

for i, review in enumerate(reviews_dataset):
    review_samples.append(review)
    if i >= 500:
        break

for i, meta in enumerate(meta_dataset):
    meta_samples.append(meta)
    if i >= 500:
        break

# Convert to DataFrames
df_reviews = pd.DataFrame(review_samples)
df_meta = pd.DataFrame(meta_samples)

# Merge on 'parent_asin'
df_merged = pd.merge(df_reviews, df_meta, on="parent_asin", how="left")

# Preview
df_merged.head()


Unnamed: 0,rating,title_x,text,images_x,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,description,price,images_y,videos,store,categories,details,bought_together,subtitle,author
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True,...,,,,,,,,,,
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True,...,,,,,,,,,,
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True,...,,,,,,,,,,
3,1.0,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1643393630220,0,True,...,,,,,,,,,,
4,5.0,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1609322563534,0,True,...,,,,,,,,,,


In [None]:
df_merged.shape

(501, 25)

In [None]:

# ✅ Imports
from datasets import load_dataset
import pandas as pd
import json
from tqdm import tqdm


In [None]:
# ✅ Define the category
category = "All_Beauty"  # You can change to other categories too

# ✅ Output file
reviews_file = f"reviews_{category}.jsonl"

# ✅ Load & stream reviews
reviews_dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    f"raw_review_{category}",
    split="full",
    streaming=True,
    trust_remote_code=True
)

# ✅ Save to JSONL
with open(reviews_file, "w", encoding="utf-8") as f_out:
    for review in tqdm(reviews_dataset, desc="Saving Reviews"):
        f_out.write(json.dumps(review) + "\n")

print(f"✅ Saved reviews to {reviews_file}")


Saving Reviews: 701528it [05:03, 2308.71it/s]

✅ Saved reviews to reviews_All_Beauty.jsonl





In [None]:
# ✅ Output file
meta_file = f"meta_{category}.jsonl"

# ✅ Load & stream metadata
meta_dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    f"raw_meta_{category}",
    split="full",
    streaming=True,
    trust_remote_code=True
)

# ✅ Save to JSONL
with open(meta_file, "w", encoding="utf-8") as f_out:
    for meta in tqdm(meta_dataset, desc="Saving Metadata"):
        f_out.write(json.dumps(meta) + "\n")

print(f"✅ Saved metadata to {meta_file}")


Saving Metadata: 112590it [01:38, 1141.45it/s]

✅ Saved metadata to meta_All_Beauty.jsonl





In [None]:
# ✅ Load into pandas
reviews_df = pd.read_json(reviews_file, lines=True)
meta_df = pd.read_json(meta_file, lines=True)


# ✅ Merge on 'parent_asin'
merged_df = pd.merge(reviews_df, meta_df, how="inner", on="parent_asin")

# ✅ Save to CSV
merged_file = f"merged_{category}.csv"
merged_df.to_csv(merged_file, index=False)

print(f"✅ Merged file saved to {merged_file} — Shape: {merged_df.shape}")


✅ Merged file saved to merged_All_Beauty.csv — Shape: (701528, 25)


In [None]:
merged_df.head()

Unnamed: 0,rating,title_x,text,images_x,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,description,price,images_y,videos,store,categories,details,bought_together,subtitle,author
0,5,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,True,...,"[If given the choice, weÕd leave most telltale...",,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['Best Hair Product For Summer!', 'O...",HERBIVORE,[],"{""Hair Type"": ""Wavy"", ""Material Type Free"": ""D...",,,
1,4,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,True,...,[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['Easy to apply!'], 'url': ['https:/...",Two Goats Apothecary,[],"{""Brand"": ""Two Goats Apothecary"", ""Item Form"":...",,,
2,5,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,True,...,[New Road Beauty Paraffin Wax is recommended f...,21.98,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Opening the Creamsicle assortment ...,New Road Beauty,[],"{""Package Dimensions"": ""10.5 x 6.4 x 1.6 inche...",,,
3,1,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,True,...,[Hair Material: Brazilian Virgin Human Hair Bu...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",muaowig,[],"{""Brand"": ""muaowig"", ""Material"": ""Human Hair"",...",,,
4,5,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,True,...,[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Yinhua,[],"{""Package Dimensions"": ""8.5 x 3.82 x 2.24 inch...",,,


In [None]:
merged_df.columns

Index(['rating', 'title_x', 'text', 'images_x', 'asin', 'parent_asin',
       'user_id', 'timestamp', 'helpful_vote', 'verified_purchase',
       'main_category', 'title_y', 'average_rating', 'rating_number',
       'features', 'description', 'price', 'images_y', 'videos', 'store',
       'categories', 'details', 'bought_together', 'subtitle', 'author'],
      dtype='object')

In [None]:
# Select multiple columns using a list of column names
merged_df[['images_x','images_y','title_x','title_y','rating', 'average_rating','videos', 'store','text','description']].head()

Unnamed: 0,images_x,images_y,title_x,title_y,rating,average_rating,videos,store,text,description
0,[],{'hi_res': ['https://m.media-amazon.com/images...,Such a lovely scent but not overpowering.,Herbivore - Natural Sea Mist Texturizing Salt ...,5,4.3,"{'title': ['Best Hair Product For Summer!', 'O...",HERBIVORE,This spray is really nice. It smells really go...,"[If given the choice, weÕd leave most telltale..."
1,[],{'hi_res': ['https://m.media-amazon.com/images...,Works great but smells a little weird.,All Natural Vegan Dry Shampoo Powder - Eco Fri...,4,4.0,"{'title': ['Easy to apply!'], 'url': ['https:/...",Two Goats Apothecary,"This product does what I need it to do, I just...",[]
2,[],{'hi_res': ['https://m.media-amazon.com/images...,Yes!,New Road Beauty - Creamsicle - Variety 3 Pack ...,5,4.4,{'title': ['Opening the Creamsicle assortment ...,New Road Beauty,"Smells good, feels great!",[New Road Beauty Paraffin Wax is recommended f...
3,[],{'hi_res': ['https://m.media-amazon.com/images...,Synthetic feeling,muaowig Ombre Body Wave Bundles 1B Grey Human ...,1,1.0,"{'title': [], 'url': [], 'user_id': []}",muaowig,Felt synthetic,[Hair Material: Brazilian Virgin Human Hair Bu...
4,[],{'hi_res': ['https://m.media-amazon.com/images...,A+,Yinhua Electric Nail Drill Kit Portable Profes...,5,3.5,"{'title': [], 'url': [], 'user_id': []}",Yinhua,Love it,[]


In [None]:
# ✅ Columns to drop
cols_to_drop = ['images_x', 'images_y', 'videos', 'bought_together',
                'subtitle', 'author', 'categories']

# ✅ Drop unwanted columns
merged_df_cleaned = merged_df.drop(columns=cols_to_drop)

# ✅ Rename columns
merged_df_cleaned = merged_df_cleaned.rename(columns={
    'title_x': 'review_title',
    'text': 'review_description',
    'title_y': 'product_title'
})

# ✅ Remove rows where any column has null or empty list `[]`
merged_df_cleaned = merged_df_cleaned[
    merged_df_cleaned.applymap(lambda x: x not in [None, [], ""]).all(axis=1)
]

# ✅ Check result
print("✅ Cleaned DataFrame shape:", merged_df_cleaned.shape)
merged_df_cleaned.head()


  merged_df_cleaned.applymap(lambda x: x not in [None, [], ""]).all(axis=1)


✅ Cleaned DataFrame shape: (136480, 18)


Unnamed: 0,rating,review_title,review_description,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,product_title,average_rating,rating_number,features,description,price,store,details
2,5,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,True,All Beauty,New Road Beauty - Creamsicle - Variety 3 Pack ...,4.4,699,"[Same Great Product, NEW PACKAGING., MOISTURIZ...",[New Road Beauty Paraffin Wax is recommended f...,21.98,New Road Beauty,"{""Package Dimensions"": ""10.5 x 6.4 x 1.6 inche..."
3,1,Synthetic feeling,Felt synthetic,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,True,All Beauty,muaowig Ombre Body Wave Bundles 1B Grey Human ...,1.0,1,[?Hair Bundle Material?:Brazilian Virgin Human...,[Hair Material: Brazilian Virgin Human Hair Bu...,,muaowig,"{""Brand"": ""muaowig"", ""Material"": ""Human Hair"",..."
5,4,Pretty Color,The polish was quiet thick and did not apply s...,B00R8DXL44,B00R8DXL44,AGMJ3EMDVL6OWBJF7CA5RGJLXN5A,2020-08-27 22:30:08.138,0,True,All Beauty,"China Glaze Nail Polish, Wanderlust 1381",3.8,32,[Light lavender pink nail color with golden sh...,"[China Glaze Nail Polish, Wanderlust, 1381, .5...",7.1,China Glaze,"{""Brand"": ""China Glaze"", ""Item Form"": ""Liquid""..."
21,5,Great combo pack. Wish I had been using this y...,"I love this combo package, particularly the fl...",B01M7UMAUG,B01M7UMAUG,AFSKPY37N3C43SOI5IEXEK5JSIYA,2017-10-23 14:57:04.887,0,False,All Beauty,Philips Sonicare Essence+ Gum Health & Airflos...,4.5,235,[The complete oral care solution for healthier...,[Improve your oral health with the new Philips...,,Philips Sonicare,"{""Brand"": ""Philips Sonicare"", ""Power Source"": ..."
22,3,I just don't get it,I don't see the fuss with this toothbrush. As ...,B00JMDPK8S,B00JMDPK8S,AFSKPY37N3C43SOI5IEXEK5JSIYA,2014-07-03 15:47:43.000,0,False,All Beauty,Panasonic EW-DL82 Sonic Vibration Rechargeable...,3.4,17,[Soft Start Function: Toothbrush powers on wit...,[Sonic vibration rechargeable toothbrush. Clin...,,Panasonic,"{""Brand"": ""Panasonic"", ""Age Range (Description..."


In [None]:
# Clean 'price' column: remove rows where price is None, "None", "", or nan
merged_df_cleaned = merged_df_cleaned[
    ~merged_df_cleaned['price'].isin([None, "None", "", [], float('nan')])
]

# Also drop rows with any other missing/null/empty list values in remaining columns
merged_df_cleaned = merged_df_cleaned[
    merged_df_cleaned.applymap(lambda x: x not in [None, "None", "", [], float('nan')]).all(axis=1)
]

# Reset index (optional)
merged_df_cleaned.reset_index(drop=True, inplace=True)

# Check result
print("✅ Final cleaned shape:", merged_df_cleaned.shape)
merged_df_cleaned.head()


  merged_df_cleaned.applymap(lambda x: x not in [None, "None", "", [], float('nan')]).all(axis=1)


✅ Final cleaned shape: (68435, 18)


Unnamed: 0,rating,review_title,review_description,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,product_title,average_rating,rating_number,features,description,price,store,details
0,5,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,True,All Beauty,New Road Beauty - Creamsicle - Variety 3 Pack ...,4.4,699,"[Same Great Product, NEW PACKAGING., MOISTURIZ...",[New Road Beauty Paraffin Wax is recommended f...,21.98,New Road Beauty,"{""Package Dimensions"": ""10.5 x 6.4 x 1.6 inche..."
1,4,Pretty Color,The polish was quiet thick and did not apply s...,B00R8DXL44,B00R8DXL44,AGMJ3EMDVL6OWBJF7CA5RGJLXN5A,2020-08-27 22:30:08.138,0,True,All Beauty,"China Glaze Nail Polish, Wanderlust 1381",3.8,32,[Light lavender pink nail color with golden sh...,"[China Glaze Nail Polish, Wanderlust, 1381, .5...",7.1,China Glaze,"{""Brand"": ""China Glaze"", ""Item Form"": ""Liquid""..."
2,5,They smell good. They are just the right size,I'm a BIG wet nap fan. Always have been. The...,B0020MKBNW,B0020MKBNW,AFZUK3MTBIBEDQOPAK3OATUOUKLA,2014-07-22 23:53:19.000,4,True,All Beauty,Wet-Nap Moist Towelette (case of 1000),3.4,25,[1 travel size moist towelette in individually...,[1 travel size moist towelette in individually...,57.81,Wet-Nap,"{""Brand"": ""Wet-Nap"", ""Unit Count"": ""1000 Count..."
3,5,QUICK RAIN PROTECTION,Rrain hats are really handy when<br />one does...,B00023J4AW,B00023J4AW,AFKNVFEXRGUGJAGMENCOWLVDYVCQ,2020-12-25 22:40:24.878,0,True,All Beauty,Premium Life Rain Hat With Full Visor,4.5,614,[Rain Hat With Full Visor is designed for bouf...,[Rain Hat With Full Visor is designed for bouf...,4.8,Soft 'N Style,"{""Is Discontinued By Manufacturer"": ""No"", ""Pac..."
4,4,Good Product,"Worked out fine. Just a bit tight, but o.k.",B005IYYF5E,B005IYYF5E,AFKNVFEXRGUGJAGMENCOWLVDYVCQ,2017-03-18 23:34:20.000,0,True,All Beauty,"Shower Cap - Blue Dot Pattern, Vinyl material,...",4.3,1190,"[X-Large Size/fits over any hairstyle, Strong ...",[NaRaMax Shower Caps X-Large],4.08,Ameliana,"{""Brand"": ""Ameliana"", ""Color"": ""Poka Dot"", ""Ma..."


In [None]:

# Define unwanted values
unwanted_vals = [None, "None", "", [], "#N/A", "N/A", "nan", float('nan')]

# Filter out rows with any unwanted values
merged_df_cleaned = merged_df_cleaned[
    merged_df_cleaned.applymap(lambda x: str(x).strip() not in unwanted_vals).all(axis=1)
]

# Reset index
merged_df_cleaned.reset_index(drop=True, inplace=True)

# Final check
print("✅ Cleaned shape:", merged_df_cleaned.shape)
merged_df_cleaned.head()

  merged_df_cleaned.applymap(lambda x: str(x).strip() not in unwanted_vals).all(axis=1)


✅ Cleaned shape: (68353, 18)


Unnamed: 0,rating,review_title,review_description,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,product_title,average_rating,rating_number,features,description,price,store,details
0,5,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,True,All Beauty,New Road Beauty - Creamsicle - Variety 3 Pack ...,4.4,699,"[Same Great Product, NEW PACKAGING., MOISTURIZ...",[New Road Beauty Paraffin Wax is recommended f...,21.98,New Road Beauty,"{""Package Dimensions"": ""10.5 x 6.4 x 1.6 inche..."
1,4,Pretty Color,The polish was quiet thick and did not apply s...,B00R8DXL44,B00R8DXL44,AGMJ3EMDVL6OWBJF7CA5RGJLXN5A,2020-08-27 22:30:08.138,0,True,All Beauty,"China Glaze Nail Polish, Wanderlust 1381",3.8,32,[Light lavender pink nail color with golden sh...,"[China Glaze Nail Polish, Wanderlust, 1381, .5...",7.1,China Glaze,"{""Brand"": ""China Glaze"", ""Item Form"": ""Liquid""..."
2,5,They smell good. They are just the right size,I'm a BIG wet nap fan. Always have been. The...,B0020MKBNW,B0020MKBNW,AFZUK3MTBIBEDQOPAK3OATUOUKLA,2014-07-22 23:53:19.000,4,True,All Beauty,Wet-Nap Moist Towelette (case of 1000),3.4,25,[1 travel size moist towelette in individually...,[1 travel size moist towelette in individually...,57.81,Wet-Nap,"{""Brand"": ""Wet-Nap"", ""Unit Count"": ""1000 Count..."
3,5,QUICK RAIN PROTECTION,Rrain hats are really handy when<br />one does...,B00023J4AW,B00023J4AW,AFKNVFEXRGUGJAGMENCOWLVDYVCQ,2020-12-25 22:40:24.878,0,True,All Beauty,Premium Life Rain Hat With Full Visor,4.5,614,[Rain Hat With Full Visor is designed for bouf...,[Rain Hat With Full Visor is designed for bouf...,4.8,Soft 'N Style,"{""Is Discontinued By Manufacturer"": ""No"", ""Pac..."
4,4,Good Product,"Worked out fine. Just a bit tight, but o.k.",B005IYYF5E,B005IYYF5E,AFKNVFEXRGUGJAGMENCOWLVDYVCQ,2017-03-18 23:34:20.000,0,True,All Beauty,"Shower Cap - Blue Dot Pattern, Vinyl material,...",4.3,1190,"[X-Large Size/fits over any hairstyle, Strong ...",[NaRaMax Shower Caps X-Large],4.08,Ameliana,"{""Brand"": ""Ameliana"", ""Color"": ""Poka Dot"", ""Ma..."


In [None]:
merged_df_cleaned.to_csv("cleaned_amazon_reviews.csv", index=False)


# *** RUN CODE FROM HERE***

In [1]:
import pandas as pd

In [2]:
merged_df_cleaned=pd.read_csv("cleaned_amazon_reviews.csv")

In [None]:
merged_df_cleaned.columns

Index(['rating', 'review_title', 'review_description', 'asin', 'parent_asin',
       'user_id', 'timestamp', 'helpful_vote', 'verified_purchase',
       'main_category', 'product_title', 'average_rating', 'rating_number',
       'features', 'description', 'price', 'store', 'details'],
      dtype='object')

In [None]:
merged_df_cleaned.head()

Unnamed: 0,rating,review_title,review_description,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,product_title,average_rating,rating_number,features,description,price,store,details
0,5,Yes!,"Smells good, feels great!",B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,True,All Beauty,New Road Beauty - Creamsicle - Variety 3 Pack ...,4.4,699,"['Same Great Product, NEW PACKAGING.', 'MOISTU...",['New Road Beauty Paraffin Wax is recommended ...,21.98,New Road Beauty,"{""Package Dimensions"": ""10.5 x 6.4 x 1.6 inche..."
1,4,Pretty Color,The polish was quiet thick and did not apply s...,B00R8DXL44,B00R8DXL44,AGMJ3EMDVL6OWBJF7CA5RGJLXN5A,2020-08-27 22:30:08.138,0,True,All Beauty,"China Glaze Nail Polish, Wanderlust 1381",3.8,32,['Light lavender pink nail color with golden s...,"['China Glaze Nail Polish, Wanderlust, 1381, ....",7.1,China Glaze,"{""Brand"": ""China Glaze"", ""Item Form"": ""Liquid""..."
2,5,They smell good. They are just the right size,I'm a BIG wet nap fan. Always have been. The...,B0020MKBNW,B0020MKBNW,AFZUK3MTBIBEDQOPAK3OATUOUKLA,2014-07-22 23:53:19.000,4,True,All Beauty,Wet-Nap Moist Towelette (case of 1000),3.4,25,['1 travel size moist towelette in individuall...,['1 travel size moist towelette in individuall...,57.81,Wet-Nap,"{""Brand"": ""Wet-Nap"", ""Unit Count"": ""1000 Count..."
3,5,QUICK RAIN PROTECTION,Rrain hats are really handy when<br />one does...,B00023J4AW,B00023J4AW,AFKNVFEXRGUGJAGMENCOWLVDYVCQ,2020-12-25 22:40:24.878,0,True,All Beauty,Premium Life Rain Hat With Full Visor,4.5,614,['Rain Hat With Full Visor is designed for bou...,['Rain Hat With Full Visor is designed for bou...,4.8,Soft 'N Style,"{""Is Discontinued By Manufacturer"": ""No"", ""Pac..."
4,4,Good Product,"Worked out fine. Just a bit tight, but o.k.",B005IYYF5E,B005IYYF5E,AFKNVFEXRGUGJAGMENCOWLVDYVCQ,2017-03-18 23:34:20.000,0,True,All Beauty,"Shower Cap - Blue Dot Pattern, Vinyl material,...",4.3,1190,"['X-Large Size/fits over any hairstyle', 'Stro...",['NaRaMax Shower Caps X-Large'],4.08,Ameliana,"{""Brand"": ""Ameliana"", ""Color"": ""Poka Dot"", ""Ma..."


In [3]:
# Number of rows
print(f"Number of rows: {merged_df_cleaned.shape[0]}")

# Number of unique users
print(f"Number of unique users: {merged_df_cleaned['user_id'].nunique()}")

# Number of unique products
print(f"Number of unique products: {merged_df_cleaned['asin'].nunique()}")

Number of rows: 68353
Number of unique users: 66778
Number of unique products: 5472


In [None]:
# # Ensure compatible versions of numpy, scipy, and scikit-learn
# !pip install --upgrade --force-reinstall numpy==1.23.5 scipy scikit-learn


# Popularity Baseline

In [None]:
# 1️⃣ Compute popularity score (mean rating *or* count of ratings — you can choose)
popularity_df = (
    rating_df
    .groupby('asin')
    .agg(avg_rating=('rating', 'mean'), rating_count=('rating', 'count'))
    .reset_index()
)

# Let’s use rating_count for popularity
popularity_df = popularity_df.sort_values(by='rating_count', ascending=False)

# 2️⃣ Popularity-based recommender function
def popularity_recommendations(top_n=6):
    return popularity_df['asin'].head(top_n).tolist()

def precision_at_k_popularity(user_id, k=6, rating_threshold=3.0):
    relevant_items = rating_df[
        (rating_df['user_id'] == user_id) &
        (rating_df['rating'] >= rating_threshold)
    ]['asin'].tolist()
    if len(relevant_items) == 0:
        return None

    recommended_items = popularity_recommendations(top_n=k)
    hits = len(set(recommended_items) & set(relevant_items))
    return hits / k

def recall_at_k_popularity(user_id, k=6, rating_threshold=3.0):
    relevant_items = rating_df[
        (rating_df['user_id'] == user_id) &
        (rating_df['rating'] >= rating_threshold)
    ]['asin'].tolist()
    if len(relevant_items) == 0:
        return None

    recommended_items = popularity_recommendations(top_n=k)
    hits = len(set(recommended_items) & set(relevant_items))
    return hits / len(relevant_items)

def f1_at_k_popularity(user_id, k=6, rating_threshold=3.0):
    prec = precision_at_k_popularity(user_id, k, rating_threshold)
    rec = recall_at_k_popularity(user_id, k, rating_threshold)
    if prec is None or rec is None or (prec + rec) == 0:
        return None
    return 2 * (prec * rec) / (prec + rec)

# Get unique user IDs from the rating_df
user_ids = rating_df['user_id'].unique()

precision_pop = []
recall_pop = []
f1_pop = []

for uid in user_ids[:100]: # Iterate over the first 100 unique user IDs
    prec = precision_at_k_popularity(uid, k=6)
    rec = recall_at_k_popularity(uid, k=6)
    f1 = f1_at_k_popularity(uid, k=6)

    if prec is not None:
        precision_pop.append(prec)
    if rec is not None:
        recall_pop.append(rec)
    if f1 is not None:
        f1_pop.append(f1)

# Report
print(f"Popularity Precision@10: {np.mean(precision_pop):.4f}")
print(f"Popularity Recall@10: {np.mean(recall_pop):.4f}")
print(f"Popularity F1@10: {np.mean(f1_pop):.4f}")

Popularity Precision@10: 0.0131
Popularity Recall@10: 0.0787
Popularity F1@10: 0.2857


# Hybrid Recommender system

In [21]:
# 1️⃣ Imports
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# 2️⃣ Prepare Data
df = merged_df_cleaned.dropna(subset=['rating', 'asin', 'user_id']).copy()
df = df.drop_duplicates(subset=['asin']).reset_index(drop=True)

user_encoder = LabelEncoder()
product_encoder = LabelEncoder()
df['product_index'] = product_encoder.fit_transform(df['asin'])

rating_df = merged_df_cleaned[['user_id', 'asin', 'rating']].dropna()
rating_df = rating_df[rating_df['asin'].isin(df['asin'])]
rating_df['user_index'] = user_encoder.fit_transform(rating_df['user_id'])
rating_df['product_index'] = product_encoder.transform(rating_df['asin'])

n_users = rating_df['user_index'].nunique()
n_products = df['product_index'].nunique()

# 3️⃣ User-Item Rating Matrix
rating_matrix = np.zeros((n_users, n_products))
for row in rating_df.itertuples():
    rating_matrix[row.user_index, row.product_index] = row.rating

# 4️⃣ Collaborative Filtering SVD
svd_cf = TruncatedSVD(n_components=20, random_state=42)
user_features = svd_cf.fit_transform(rating_matrix)
predicted_ratings = np.dot(user_features, svd_cf.components_)

# 5️⃣ Enriched content + LSA + cosine similarity
def clean_text(x):
    if isinstance(x, list):
        return ' '.join(x).lower()
    if isinstance(x, dict):
        return ' '.join([f"{k} {v}" for k, v in x.items()]).lower()
    return str(x).lower()

df['combined_text'] = (
    df['product_title'].apply(clean_text) * 2 + ' ' +
    df['main_category'].apply(clean_text) * 2 + ' ' +
    df['description'].apply(clean_text) + ' ' +
    df['price'].apply(clean_text) + ' ' +
    df['store'].apply(clean_text) + ' ' +
    df['features'].apply(clean_text) + ' ' +
    df['details'].apply(clean_text)
)

tfidf = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1,2))
tfidf_matrix = tfidf.fit_transform(df['combined_text'])

lsa = TruncatedSVD(n_components=100, random_state=42)
lsa_matrix = lsa.fit_transform(tfidf_matrix)

product_cosine_sim = cosine_similarity(lsa_matrix)

# 6️⃣ Hybrid recommender
def hybrid_recommendations_weighted_score(user_id, asin, top_n=10, cf_weight=0.6, cb_weight=0.4):
    if user_id not in user_encoder.classes_ or asin not in product_encoder.classes_:
        return []

    user_idx = user_encoder.transform([user_id])[0]
    predicted_user_ratings = predicted_ratings[user_idx]

    cf_scores = (predicted_user_ratings - predicted_user_ratings.min()) / (
        predicted_user_ratings.max() - predicted_user_ratings.min() + 1e-8)

    cb_idx = df[df['asin'] == asin].index[0]
    cb_scores = product_cosine_sim[cb_idx]
    cb_scores = (cb_scores - cb_scores.min()) / (cb_scores.max() - cb_scores.min() + 1e-8)

    hybrid_scores = cf_weight * cf_scores + cb_weight * cb_scores
    top_indices = np.argsort(hybrid_scores)[::-1]
    return df.iloc[top_indices]['asin'].tolist()[:top_n]

# 7️⃣ Evaluation functions
def precision_at_k(user_id, k=10, rating_threshold=2.5):
    relevant_items = rating_df[
        (rating_df['user_id'] == user_id) &
        (rating_df['rating'] >= rating_threshold)
    ]['asin'].tolist()
    if len(relevant_items) == 0:
        return None

    asin = rating_df[rating_df['user_id'] == user_id]['asin'].iloc[0]
    recommended_items = hybrid_recommendations_weighted_score(user_id, asin, top_n=k)
    if not recommended_items:
        return None

    hits = len(set(recommended_items) & set(relevant_items))
    return hits / k

def recall_at_k(user_id, k=10, rating_threshold=2.5):
    relevant_items = rating_df[
        (rating_df['user_id'] == user_id) &
        (rating_df['rating'] >= rating_threshold)
    ]['asin'].tolist()
    if len(relevant_items) == 0:
        return None

    asin = rating_df[rating_df['user_id'] == user_id]['asin'].iloc[0]
    recommended_items = hybrid_recommendations_weighted_score(user_id, asin, top_n=k)
    if not recommended_items:
        return None

    hits = len(set(recommended_items) & set(relevant_items))
    return hits / len(relevant_items)

def f1_at_k(user_id, k=10, rating_threshold=2.5):
    prec = precision_at_k(user_id, k, rating_threshold)
    rec = recall_at_k(user_id, k, rating_threshold)
    if prec is None or rec is None or (prec + rec) == 0:
        return None
    return 2 * (prec * rec) / (prec + rec)

# 8️⃣ Filter users with at least 3 relevant items
eligible_users = rating_df.groupby('user_id') \
    .apply(lambda x: (x['rating'] >= 2.5).sum()) \
    .reset_index(name='relevant_count')
eligible_users = eligible_users[eligible_users['relevant_count'] >= 4]['user_id'].tolist()

# 9️⃣ Run evaluation
precision_scores = []
recall_scores = []
f1_scores = []

for uid in eligible_users[:100]:
    prec = precision_at_k(uid, k=10, rating_threshold=2.5)
    rec = recall_at_k(uid, k=10, rating_threshold=2.5)
    f1 = f1_at_k(uid, k=10, rating_threshold=2.5)

    if prec is not None:
        precision_scores.append(prec)
    if rec is not None:
        recall_scores.append(rec)
    if f1 is not None:
        f1_scores.append(f1)

# 🔟 Report
print("\n✅ FINAL RESULTS (k=10, rating_threshold=2.5)")
print(f"   → Precision@10: {np.mean(precision_scores):.4f}")
print(f"   → Recall@10:    {np.mean(recall_scores):.4f}")
print(f"   → F1@10:        {np.mean(f1_scores):.4f}")


  .apply(lambda x: (x['rating'] >= 2.5).sum()) \



✅ FINAL RESULTS (k=10, rating_threshold=2.5)
   → Precision@10: 0.1323
   → Recall@10:    0.3003
   → F1@10:        0.1823


In [22]:
# Group by user_id and count their ratings
user_rating_counts = rating_df.groupby('user_id').size()

# Filter users with at least 3 ratings
active_users = user_rating_counts[user_rating_counts >= 3]

# Number of such users
num_active_users = active_users.shape[0]

print(f"Number of users who rated at least 3 products: {num_active_users}")


Number of users who rated at least 3 products: 117


In [8]:
print(rating_df.groupby('user_id').apply(lambda x: (x['rating'] >= 3.0).sum()).describe())


count    66778.000000
mean         0.843436
std          0.421791
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         10.000000
dtype: float64


  print(rating_df.groupby('user_id').apply(lambda x: (x['rating'] >= 3.0).sum()).describe())


#lightFM

In [24]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=831123 sha256=0f06bb8c9f831c4f411205ee760a45af106663bf8b25d9096da8132a5543afd2
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [27]:
# 1️⃣ Imports
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

# 2️⃣ Prepare Data
df = merged_df_cleaned[['user_id', 'asin', 'rating', 'main_category', 'store']].dropna()

# Binarize ratings: 1 if >= 2.5 (match your threshold)
df['rating_bin'] = df['rating'].apply(lambda x: 1 if x >= 2.5 else 0)

# Filter active users / popular products
user_counts = df['user_id'].value_counts()
product_counts = df['asin'].value_counts()
df = df[df['user_id'].isin(user_counts[user_counts >= 3].index)]
df = df[df['asin'].isin(product_counts[product_counts >= 3].index)]

# 3️⃣ Create LightFM Dataset
dataset = Dataset()
dataset.fit(
    users=df['user_id'],
    items=df['asin'],
    user_features=df['user_id'].unique(),
    item_features=list(df['main_category'].unique()) + list(df['store'].unique())
)

(interactions, _) = dataset.build_interactions(
    [(row['user_id'], row['asin'], row['rating_bin']) for _, row in df.iterrows()]
)

user_features = dataset.build_user_features(
    ((u, [u]) for u in df['user_id'].unique())
)
item_features = dataset.build_item_features(
    ((row['asin'], [row['main_category'], row['store']]) for _, row in df.iterrows())
)

# 4️⃣ Build + Train model (use WARP for ranking)
model = LightFM(loss='warp', no_components=64, learning_rate=0.05, random_state=42)
model.fit(
    interactions,
    user_features=user_features,
    item_features=item_features,
    epochs=50,
    num_threads=4
)

# 5️⃣ Evaluate
precision = precision_at_k(model, interactions, k=10, user_features=user_features, item_features=item_features).mean()
recall = recall_at_k(model, interactions, k=10, user_features=user_features, item_features=item_features).mean()
auc = auc_score(model, interactions, user_features=user_features, item_features=item_features).mean()

# 6️⃣ Report
print("\n✅ LightFM Results (k=10, rating_threshold=2.5)")
print(f"Precision@10: {precision:.4f}")
print(f"Recall@10:    {recall:.4f}")
print(f"AUC:          {auc:.4f}")



✅ LightFM Results (k=10, rating_threshold=2.5)
Precision@10: 0.1704
Recall@10:    0.5530
AUC:          0.9987


In [4]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k

# Prepare your dataset (same as before)
df = merged_df_cleaned[['user_id', 'asin', 'rating', 'main_category', 'store']].dropna()
df['rating_bin'] = df['rating'].apply(lambda x: 1 if x >= 2.5 else 0)

user_counts = df['user_id'].value_counts()
product_counts = df['asin'].value_counts()
df = df[df['user_id'].isin(user_counts[user_counts >= 3].index)]
df = df[df['asin'].isin(product_counts[product_counts >= 3].index)]

dataset = Dataset()
dataset.fit(
    users=df['user_id'],
    items=df['asin'],
    user_features=df['user_id'].unique(),
    item_features=list(df['main_category'].unique()) + list(df['store'].unique())
)

(interactions, _) = dataset.build_interactions(
    [(row['user_id'], row['asin'], row['rating_bin']) for _, row in df.iterrows()]
)

user_features = dataset.build_user_features(
    ((u, [u]) for u in df['user_id'].unique())
)
item_features = dataset.build_item_features(
    ((row['asin'], [row['main_category'], row['store']]) for _, row in df.iterrows())
)

# Hyperparameter grid
losses = ['warp', 'bpr']
components = [32, 64, 128]
learning_rates = [0.01, 0.05, 0.1]

# Grid search loop
results = []
for loss in losses:
    for no_components in components:
        for lr in learning_rates:
            print(f"\n🔍 Training LightFM (loss={loss}, components={no_components}, lr={lr})")
            model = LightFM(loss=loss, no_components=no_components, learning_rate=lr, random_state=42)
            model.fit(
                interactions,
                user_features=user_features,
                item_features=item_features,
                epochs=30,
                num_threads=4
            )
            prec = precision_at_k(model, interactions, k=10, user_features=user_features, item_features=item_features).mean()
            rec = recall_at_k(model, interactions, k=10, user_features=user_features, item_features=item_features).mean()
            results.append({
                'loss': loss,
                'no_components': no_components,
                'learning_rate': lr,
                'precision@10': prec,
                'recall@10': rec
            })
            print(f"→ Precision@10: {prec:.4f} | Recall@10: {rec:.4f}")

# Display best configs
results_df = pd.DataFrame(results)
print("\n🏆 Top 5 configurations by Precision@10:")
print(results_df.sort_values(by='precision@10', ascending=False).head())

print("\n🏆 Top 5 configurations by Recall@10:")
print(results_df.sort_values(by='recall@10', ascending=False).head())



🔍 Training LightFM (loss=warp, components=32, lr=0.01)
→ Precision@10: 0.0548 | Recall@10: 0.1552

🔍 Training LightFM (loss=warp, components=32, lr=0.05)
→ Precision@10: 0.1687 | Recall@10: 0.5357

🔍 Training LightFM (loss=warp, components=32, lr=0.1)
→ Precision@10: 0.1713 | Recall@10: 0.5559

🔍 Training LightFM (loss=warp, components=64, lr=0.01)
→ Precision@10: 0.0487 | Recall@10: 0.1357

🔍 Training LightFM (loss=warp, components=64, lr=0.05)
→ Precision@10: 0.1678 | Recall@10: 0.5397

🔍 Training LightFM (loss=warp, components=64, lr=0.1)
→ Precision@10: 0.1704 | Recall@10: 0.5530

🔍 Training LightFM (loss=warp, components=128, lr=0.01)
→ Precision@10: 0.0470 | Recall@10: 0.1313

🔍 Training LightFM (loss=warp, components=128, lr=0.05)
→ Precision@10: 0.1696 | Recall@10: 0.5501

🔍 Training LightFM (loss=warp, components=128, lr=0.1)
→ Precision@10: 0.1713 | Recall@10: 0.5559

🔍 Training LightFM (loss=bpr, components=32, lr=0.01)
→ Precision@10: 0.0200 | Recall@10: 0.0646

🔍 Training