In [2]:
# Install HuggingFace datasets
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [3]:
import os
import pandas as pd
import random
from datasets import load_dataset
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Connecting the Google Drive to Colab to store the datasets



In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Output paths
base_dir = "/content/drive/MyDrive/amazon_data_2023"
os.makedirs(base_dir, exist_ok=True)

review_csv = os.path.join(base_dir, "amazon_data_review.csv")
meta_csv = os.path.join(base_dir, "amazon_data_meta.csv")

# 1. Data Aquisition
# a) Obtain the Entire Dataset

# Sampling Amazon Metadata

This part of the code is used to collect a sample of Amazon product metadata from all categories. Since the full dataset is too large to process all at once, we take a smaller, manageable sample from each category.

1. The function **sample_meta()** loads meta from the Hugging Face dataset McAuley-Lab/Amazon-Reviews-2023. For each category, it randomly selects 2000 metadata and stops reading after 5 million entries to keep things efficient.

2. The parent_asin is saved into a set called collected_asins. This is a suprise tool that will help us later! (Join review data with item meta)

3. Then, each category's sample is saved to a CSV file (meta_csv)and the categories that have already been processed is being tracked using a text file called completed_categories.txt (if the script crashes or needs to be rerun)

4. To be safe and avoid hitting any limits, short pauses (time.sleep) was added between processing each category (A LOT OF CLIENT/REQUEST ERROR)

The output should be:

*   A combined CSV with thousands of metadata across multiple categories.
*   A list of unique parent_asin values to use when merging with reviews.
*   A progress file that tells us which categories are done.

Reference:
https://colab.research.google.com/drive/1sEQvZK94lk_YD4dc_g9m9RhtrFcut6VU?usp=sharing


In [6]:
import time


meta_categories = [
    "raw_meta_All_Beauty", "raw_meta_Amazon_Fashion", "raw_meta_Appliances",
    "raw_meta_Arts_Crafts_and_Sewing", "raw_meta_Automotive", "raw_meta_Baby_Products",
    "raw_meta_Beauty_and_Personal_Care", "raw_meta_Books", "raw_meta_CDs_and_Vinyl", "raw_meta_Cell_Phones_and_Accessories",
    "raw_meta_Clothing_Shoes_and_Jewelry", "raw_meta_Digital_Music", "raw_meta_Electronics",
    "raw_meta_Gift_Cards", "raw_meta_Grocery_and_Gourmet_Food", "raw_meta_Handmade_Products",
    "raw_meta_Health_and_Household", "raw_meta_Health_and_Personal_Care", "raw_meta_Home_and_Kitchen",
    "raw_meta_Industrial_and_Scientific", "raw_meta_Kindle_Store", "raw_meta_Magazine_Subscriptions", "raw_meta_Movies_and_TV",
    "raw_meta_Musical_Instruments", "raw_meta_Office_Products", "raw_meta_Patio_Lawn_and_Garden", "raw_meta_Pet_Supplies",
    "raw_meta_Software", "raw_meta_Sports_and_Outdoors", "raw_meta_Subscription_Boxes", "raw_meta_Tools_and_Home_Improvement",
    "raw_meta_Toys_and_Games", "raw_meta_Video_Games", "raw_meta_Unknown"
]

collected_asins = set() # store unique ASINs here

# Function to randomly sample metadata and collect ASINs
def sample_meta(meta_cat, sample_size=25000, seed=42):
    print(f"Sampling metadata from: {meta_cat}")
    random.seed(seed)

    # a) Obtain the Entire* Dataset
    dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", name=meta_cat, split="full", streaming=True, trust_remote_code=True)

    sample = []
    # *only a sample size of 25000 for each cat is retrieved
    for idx, example in enumerate(dataset):
        if idx < sample_size:
            sample.append(example)
        else:
            r = random.randint(0, idx)
            if r < sample_size:
                sample[r] = example
        if idx > 5_000_000:
            break

    df = pd.DataFrame(sample)

    # collect ASINs
    if 'parent_asin' in df.columns:
        collected_asins.update(df['parent_asin'].dropna().tolist())


    df.to_csv(meta_csv, mode='a', header=not os.path.exists(meta_csv), index=False)
    print(f"Saved {len(df)} meta from {meta_cat}")
    time.sleep(2) # I am not a robot
    return True


completed_categories = set() # to track the cat that was already processed
if os.path.exists("completed_categories.txt"):
    with open("completed_categories.txt", "r") as f:
        completed_categories = set(line.strip() for line in f)


for meta_cat in tqdm(meta_categories, desc="Sampling meta one-by-one"):
    if meta_cat in completed_categories:
        print(f"Skipping {meta_cat}, already processed.")
        continue
    try:
        sample_meta(meta_cat)
        with open("completed_categories.txt", "a") as f:
            f.write(meta_cat + "\n")
        time.sleep(5) # Let me in pleasee
    except Exception as e:
        print(f"Error while processing {meta_cat}: {e}")


print(f"\nCollected {len(collected_asins)} unique ASINs from meta")

Sampling meta one-by-one:   0%|          | 0/34 [00:00<?, ?it/s]

Sampling metadata from: raw_meta_All_Beauty


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Saved 25000 meta from raw_meta_All_Beauty
Sampling metadata from: raw_meta_Amazon_Fashion
Saved 25000 meta from raw_meta_Amazon_Fashion
Sampling metadata from: raw_meta_Appliances
Saved 25000 meta from raw_meta_Appliances
Sampling metadata from: raw_meta_Arts_Crafts_and_Sewing
Saved 25000 meta from raw_meta_Arts_Crafts_and_Sewing
Sampling metadata from: raw_meta_Automotive
Saved 25000 meta from raw_meta_Automotive
Sampling metadata from: raw_meta_Baby_Products
Saved 25000 meta from raw_meta_Baby_Products
Sampling metadata from: raw_meta_Beauty_and_Personal_Care
Saved 25000 meta from raw_meta_Beauty_and_Personal_Care
Sampling metadata from: raw_meta_Books
Saved 25000 meta from raw_meta_Books
Sampling metadata from: raw_meta_CDs_and_Vinyl
Saved 25000 meta from raw_meta_CDs_and_Vinyl
Sampling metadata from: raw_meta_Cell_Phones_and_Accessories
Saved 25000 meta from raw_meta_Cell_Phones_and_Accessories
Sampling metadata from: raw_meta_Clothing_Shoes_and_Jewelry
Saved 25000 meta from raw_me

In [7]:
meta_df = pd.read_csv(meta_csv)

print("Shape:", meta_df.shape)

  meta_df = pd.read_csv(meta_csv)


Shape: (780169, 16)


After sampling the metadata, we can focus on collecting the matching user review based on the parent_asin (Product ID).

1. Similar to the above code, we keep track of which review categories we’ve already processed using a file called completed_review.txt. This helps avoid downloading the same data twice if we rerun the code.

2. The function fetch_matching_reviews() loads reviews for specific categories. It streams through each category and checks if its parent_asin matches one of the ASINs we collected from metadata. Save to the list if so.

3. While running the code, it had request errors and it was noted that it would reload and scan the categories again. To prevent this, once all matching metadata is found for a particular category, it’s saved to its own temporary CSV file (e.g. temp_metadata_Electronics.csv). This helps keep each category separate until we’re ready to merge everything.

4. After collecting metadata from all categories, all the temporary files are loaded and combined into one big DataFrame. Then remove duplicates based on the parent_asin and save the final cleaned and deduplicated metadata to meta_csv. Bonus: All temporary files are deleted for space management.

Output:
* A clean reviews file

In [12]:
completed_cat = set() #to track the cat

if os.path.exists("completed_reviews.txt"):
    with open("completed_reviews.txt", "r") as f:
        completed_cat = set(line.strip() for line in f)

# Function to load reviews based on the matching ASINs of reviews
def fetch_matching_reviews(rev_cat):
    print(f"Scanning reviews: {rev_cat}")
    matches = []
    try:
        dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", name=rev_cat, split="full", streaming=True, trust_remote_code=True)
        for item in dataset:
            asin = item.get("parent_asin") or item.get("asin") # ASINs
            if asin in collected_asins:
              matches.append(item)

              if len(matches) >= 100000:
                break

        if matches: # if it matches, store it in its temporary cat file
            df = pd.DataFrame(matches)
            temp_filename = f"temp_reviews_{rev_cat}.csv"
            df.to_csv(temp_filename, index=False)
            print(f"Saved {len(matches)} matches to {temp_filename}")

            # Log completed category
            with open("completed_reviews.txt", "a") as f:
                f.write(rev_cat + "\n")
        else:
            print(f"No matches found in {rev_cat}")
        return True

    except Exception as e:
        print(f"Error in {rev_cat}: {e}")
        return False

    finally:
        time.sleep(5) # Let me in pleasee

categories = [
    "raw_review_All_Beauty", "raw_review_Amazon_Fashion", "raw_review_Appliances",
    "raw_review_Arts_Crafts_and_Sewing", "raw_review_Automotive", "raw_review_Baby_Products",
    "raw_review_Beauty_and_Personal_Care", "raw_review_Books","raw_review_CDs_and_Vinyl","raw_review_Cell_Phones_and_Accessories",
    "raw_review_Clothing_Shoes_and_Jewelry", "raw_review_Digital_Music", "raw_review_Electronics",
    "raw_review_Gift_Cards", "raw_review_Grocery_and_Gourmet_Food", "raw_review_Handmade_Products",
    "raw_review_Health_and_Household", "raw_review_Health_and_Personal_Care", "raw_review_Home_and_Kitchen",
    "raw_review_Industrial_and_Scientific", "raw_review_Kindle_Store", "raw_review_Magazine_Subscriptions", "raw_review_Movies_and_TV",
    "raw_review_Musical_Instruments", "raw_review_Office_Products", "raw_review_Patio_Lawn_and_Garden", "raw_review_Pet_Supplies",
    "raw_review_Software", "raw_review_Sports_and_Outdoors", "raw_review_Subscription_Boxes", "raw_review_Tools_and_Home_Improvement",
    "raw_review_Toys_and_Games", "raw_review_Video_Games", "raw_review_Unknown"
]

for category in tqdm(categories, desc="Fetching reviews"):
    if category in completed_cat:
        print(f"Skipping {category} (already completed)")
        continue
    try:
        fetch_matching_reviews(category)
        time.sleep(3)
    except Exception as e:
        print(f"Error in {category}: {e}")

print("\n Merging all review files...")
temp_files = [f for f in os.listdir() if f.startswith("temp_reviews") and f.endswith(".csv")]

# deduplication
all_dfs = []
for f in temp_files:
    df = pd.read_csv(f)
    all_dfs.append(df)

if all_dfs:
    combined_df = pd.concat(all_dfs, ignore_index=True)
    combined_df.drop_duplicates(subset='parent_asin', inplace=True)
    combined_df.to_csv(review_csv, index=False) # save to the meta csv
    print(f" Final deduplicated reviews saved to: {review_csv}")
else:
    print(" No review files found to merge.")

# delete temporary files for storage
for f in temp_files:
    os.remove(f)

print("All review files are saved in review_csv!")

Fetching reviews:   0%|          | 0/34 [00:00<?, ?it/s]

Skipping raw_review_All_Beauty (already completed)
Skipping raw_review_Amazon_Fashion (already completed)
Skipping raw_review_Appliances (already completed)
Skipping raw_review_Arts_Crafts_and_Sewing (already completed)
Skipping raw_review_Automotive (already completed)
Skipping raw_review_Baby_Products (already completed)
Skipping raw_review_Beauty_and_Personal_Care (already completed)
Scanning reviews: raw_review_Books
Saved 100000 matches to temp_reviews_raw_review_Books.csv
Scanning reviews: raw_review_CDs_and_Vinyl
Saved 100000 matches to temp_reviews_raw_review_CDs_and_Vinyl.csv
Skipping raw_review_Cell_Phones_and_Accessories (already completed)
Skipping raw_review_Clothing_Shoes_and_Jewelry (already completed)
Skipping raw_review_Digital_Music (already completed)
Skipping raw_review_Electronics (already completed)
Skipping raw_review_Gift_Cards (already completed)
Skipping raw_review_Grocery_and_Gourmet_Food (already completed)
Skipping raw_review_Handmade_Products (already comp

# What does the dataset look like?

In [None]:
combined_df.head()

# Review Dataset Information

In [14]:
review_df = pd.read_csv(review_csv)

print("Shape:", review_df.shape)

print("Number of rows:", len(review_df))

print("Columns:", review_df.columns.tolist())

review_df.head()

Shape: (477060, 10)
Number of rows: 477060
Columns: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True
3,3.0,Just ok,I try to get Keratin treatments every 3 months...,[],B07SLFWZKN,B07SLFWZKN,AFSKPY37N3C43SOI5IEXEK5JSIYA,1619737501209,0,False
4,5.0,Great refreshing skin care routine!,I had never tried anything for my skin consist...,[],B08GLG6W8T,B08GLG6W8T,AFSKPY37N3C43SOI5IEXEK5JSIYA,1613319236253,0,False


# Meta Dataset Information

In [15]:
meta_df = pd.read_csv(meta_csv)

print("Shape:", meta_df.shape)

print("Number of rows:", len(meta_df))

print("Columns:", meta_df.columns.tolist())

meta_df.head()

  meta_df = pd.read_csv(meta_csv)


Shape: (780169, 16)
Number of rows: 780169
Columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author']


Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Beauty,JIMIRE Russian Strip Eyelashes D Curl Wispy La...,3.2,40.0,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",JIMIRE,[],"{""Color"": ""Black"", ""Brand"": ""JIMIRE"", ""Materia...",B09TKGV7VH,,,
1,All Beauty,Yes to Tomatoes Detoxifying Charcoal Cleanser ...,4.5,3.0,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Yes To,[],"{""Item Form"": ""Powder"", ""Skin Type"": ""Acne Pro...",B076WQZGPM,,,
2,All Beauty,Pack of 2 e.l.f. Aqua Beauty Molten Liquid Eye...,3.8,18.0,['Pack of 2'],['Pack of 2 e.l.f. Aqua Beauty Molten Liquid E...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",e.l.f.,[],"{""Brand"": ""e.l.f."", ""Item Form"": ""liquid"", ""Fi...",B07Z6QD5T3,,,
3,All Beauty,BT21 Official Merchandise by Line Friends - SH...,4.9,17.0,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['BT21 How it Began'], 'url': ['http...",BT21,[],"{""Package Dimensions"": ""8.5 x 6 x 0.6 inches; ...",B07YSYZL8H,,,
4,All Beauty,Precision Plunger Bars for Cartridge Grips – 9...,4.3,7.0,"['Material: 304 Stainless Steel; Brass tip', '...",['The Precision Plunger Bars are designed to w...,,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Precision,[],"{""UPC"": ""644287689178""}",B07NGFDN6G,,,


# 2. Data Cleaning & Preprocessing
# a) Merge on parent_asin

In [16]:
merged_df = pd.merge(review_df, meta_df, on="parent_asin", how="inner")

print("Merged shape:", merged_df.shape)

print("Columns:", merged_df.columns.tolist())

merged_df.head()

Merged shape: (477060, 25)
Columns: ['rating', 'title_x', 'text', 'images_x', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase', 'main_category', 'title_y', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images_y', 'videos', 'store', 'categories', 'details', 'bought_together', 'subtitle', 'author']


Unnamed: 0,rating,title_x,text,images_x,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,description,price,images_y,videos,store,categories,details,bought_together,subtitle,author
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True,...,"['If given the choice, weÕd leave most telltal...",,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['Best Hair Product For Summer!', 'O...",HERBIVORE,[],"{""Hair Type"": ""Wavy"", ""Material Type Free"": ""D...",,,
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True,...,[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['Easy to apply!'], 'url': ['https:/...",Two Goats Apothecary,[],"{""Brand"": ""Two Goats Apothecary"", ""Item Form"":...",,,
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True,...,['New Road Beauty Paraffin Wax is recommended ...,21.98,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Opening the Creamsicle assortment ...,New Road Beauty,[],"{""Package Dimensions"": ""10.5 x 6.4 x 1.6 inche...",,,
3,3.0,Just ok,I try to get Keratin treatments every 3 months...,[],B07SLFWZKN,B07SLFWZKN,AFSKPY37N3C43SOI5IEXEK5JSIYA,1619737501209,0,False,...,[],49.95,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Keratin Secrets DIY Treatment Syst...,Keratin Secrets,[],"{""Package Dimensions"": ""8.27 x 4.21 x 3.9 inch...",,,
4,5.0,Great refreshing skin care routine!,I had never tried anything for my skin consist...,[],B08GLG6W8T,B08GLG6W8T,AFSKPY37N3C43SOI5IEXEK5JSIYA,1613319236253,0,False,...,[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",HANHOO,[],"{""Package Dimensions"": ""13.07 x 9.49 x 2.95 in...",,,


#  b) Handle Invalid / Missing Values

In [17]:
# Drop rows where star rating is missing...
merged_df = merged_df.dropna(subset=['rating'])

# or not in [1–5].
merged_df = merged_df[merged_df['rating'].between(1, 5)]

In [18]:
# Drop rows if text (the review body) is empty.
merged_df = merged_df.dropna(subset=['text'])

merged_df = merged_df[merged_df['text'].str.strip() != '']

In [19]:
# If brand cannot be found in the metadata (e.g., missing in details or store), set brand = “Unknown”.

def extract_brand(details, store):
    if isinstance(details, dict) and 'brand' in details and details['brand']:
        return details['brand']
    elif pd.notna(store) and store.strip() != "":
        return store
    else:
        return "Unknown"

# Add a new column -> brand
merged_df['brand'] = merged_df.apply(lambda row: extract_brand(row.get('details', {}), row.get('store', '')), axis=1)

In [21]:
print(merged_df['brand'].value_counts().head(10))
# Checking the values in brand column

brand
Unknown                                          19483
Generic                                           7562
Format: Audio CD                                  4601
Whirlpool                                         1093
uxcell                                             792
GE                                                 663
Various Artists  (Artist)    Format: Audio CD      654
Amazon Renewed                                     609
Format: DVD                                        514
Nintendo                                           475
Name: count, dtype: int64


# c) Remove Duplicates

In [22]:
merged_df.drop_duplicates(subset=['user_id', 'asin', 'text'], keep='first', inplace=True)

# d) Derived Columns

In [23]:
import re

# Review Length
merged_df['review_length'] = merged_df['text'].apply(lambda x: len(re.findall(r'\b\w+\b', str(x))))

In [24]:
# Year
merged_df['year'] = pd.to_datetime(merged_df['timestamp'], unit='ms', errors='coerce').dt.year

In [25]:
print(merged_df.isna().sum()) # check Null values

rating                    0
title_x                  54
text                      0
images_x                  0
asin                      0
parent_asin               0
user_id                   0
timestamp                 0
helpful_vote              0
verified_purchase         0
main_category         31838
title_y                5134
average_rating            1
rating_number           312
features               5099
description            5099
price                265834
images_y                  0
videos                    0
store                 19018
categories             5099
details                   0
bought_together      476684
subtitle             447560
author               452452
brand                     0
review_length             0
year                      0
dtype: int64


In [26]:
print("Categories:", merged_df['main_category'].nunique(), "\n", merged_df['main_category'].unique()) # check if all categories are present

Categories: 45 
 ['All Beauty' 'Premium Beauty' 'Handmade' 'Health & Personal Care'
 'Office Products' 'Amazon Home' 'Pet Supplies' nan 'Buy a Kindle' 'Books'
 'Audible Audiobooks' 'Toys & Games' 'Arts, Crafts & Sewing'
 'Digital Music' 'Movies & TV' 'Tools & Home Improvement'
 'Musical Instruments' 'Software' 'All Electronics' 'Video Games'
 'Cell Phones & Accessories' 'Industrial & Scientific' 'Grocery'
 'SUBSCRIPTION BOXES' 'Sports & Outdoors' 'AMAZON FASHION' 'Baby'
 'Automotive' 'Computers' 'Camera & Photo' 'Home Audio & Theater'
 'Appliances' 'Portable Audio & Accessories' 'Gift Cards'
 'Collectible Coins' 'Magazine Subscriptions' 'Appstore for Android'
 'Car Electronics' 'Amazon Devices' 'GPS & Navigation'
 'Collectibles & Fine Art' 'Apple Products' 'Amazon Fire TV'
 'Sports Collectibles' 'Entertainment' 'Prime Video']


# e) Unified Output

In [27]:
cleaned_csv = os.path.join(base_dir, "cleaned_amazon_data.csv")
merged_df.to_csv(cleaned_csv, index=False)
print(f"Unified dataset saved to: {cleaned_csv}")

Unified dataset saved to: /content/drive/MyDrive/amazon_data_2023/cleaned_amazon_data.csv


# What does the cleaned dataset look like?

In [28]:
# Check cleaned dataset
clean_df = pd.read_csv(cleaned_csv)

print("Shape:", clean_df.shape)

clean_df.head()

  clean_df = pd.read_csv(cleaned_csv)


Shape: (476684, 28)


Unnamed: 0,rating,title_x,text,images_x,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,videos,store,categories,details,bought_together,subtitle,author,brand,review_length,year
0,5.0,Such a lovely scent but not overpowering.,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True,...,"{'title': ['Best Hair Product For Summer!', 'O...",HERBIVORE,[],"{""Hair Type"": ""Wavy"", ""Material Type Free"": ""D...",,,,HERBIVORE,62,2020
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True,...,"{'title': ['Easy to apply!'], 'url': ['https:/...",Two Goats Apothecary,[],"{""Brand"": ""Two Goats Apothecary"", ""Item Form"":...",,,,Two Goats Apothecary,47,2020
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True,...,{'title': ['Opening the Creamsicle assortment ...,New Road Beauty,[],"{""Package Dimensions"": ""10.5 x 6.4 x 1.6 inche...",,,,New Road Beauty,4,2020
3,3.0,Just ok,I try to get Keratin treatments every 3 months...,[],B07SLFWZKN,B07SLFWZKN,AFSKPY37N3C43SOI5IEXEK5JSIYA,1619737501209,0,False,...,{'title': ['Keratin Secrets DIY Treatment Syst...,Keratin Secrets,[],"{""Package Dimensions"": ""8.27 x 4.21 x 3.9 inch...",,,,Keratin Secrets,152,2021
4,5.0,Great refreshing skin care routine!,I had never tried anything for my skin consist...,[],B08GLG6W8T,B08GLG6W8T,AFSKPY37N3C43SOI5IEXEK5JSIYA,1613319236253,0,False,...,"{'title': [], 'url': [], 'user_id': []}",HANHOO,[],"{""Package Dimensions"": ""13.07 x 9.49 x 2.95 in...",,,,HANHOO,131,2021
