<a href="https://colab.research.google.com/github/BTT-Cadence-Design-Systems-2A/AI-Studio-Project/blob/main/Cadence_2A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **Install libraries**

In [None]:
# Environment setup: install the Hugging Face client libraries, upgrading any
# already-installed copies (-U). NOTE(review): huggingface_hub presumably provides
# the `hf://` filesystem that fsspec opens in the loader cell below - confirm.
!pip install -U datasets huggingface_hub



**Imports & config**

In [None]:
import json
import fsspec
from itertools import islice
import pandas as pd

# Hugging Face dataset repository that the `hf://` URLs are built from.
REPO = "McAuley-Lab/Amazon-Reviews-2023"

# Category names; each one is substituted into the review and meta file paths.
# Short list for quick experiments:
# CATEGORIES = ["Software", "Video_Games", "All_Beauty"]
CATEGORIES = [
    "All_Beauty",
    "Amazon_Fashion",
    "Appliances",
    "Arts_Crafts_and_Sewing",
    "Automotive",
    "Baby_Products",
    "Beauty_and_Personal_Care",
    "Books",
    "CDs_and_Vinyl",
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Digital_Music",
    "Electronics",
    "Gift_Cards",
    "Grocery_and_Gourmet_Food",
    "Handmade_Products",
    "Health_and_Household",
    "Health_and_Personal_Care",
    "Home_and_Kitchen",
    "Industrial_and_Scientific",
    "Kindle_Store",
    "Magazine_Subscriptions",
    "Movies_and_TV",
    "Musical_Instruments",
    "Office_Products",
    "Patio_Lawn_and_Garden",
    "Pet_Supplies",
    "Software",
    "Sports_and_Outdoors",
    "Subscription_Boxes",
    "Tools_and_Home_Improvement",
    "Toys_and_Games",
    "Video_Games",
    "Unknown",
]

# Row caps per category: reviews and product-metadata records respectively.
N_PER_CAT = 10_000
N_META = 60_000

# Show up to 200 characters of each cell when a DataFrame is displayed.
pd.set_option("display.max_colwidth", 200)

**Helpers: stream JSONL from Hugging Face, ensure an `asin` column, and load one category**

In [None]:
def stream_jsonl(url: str, limit: int | None = None):
    """
    Stream a JSONL file line-by-line from Hugging Face
    Normalizes mixed-type fields like 'price'
    """
    with fsspec.open(url, "rt") as f:
        for idx, line in enumerate(f):
            if limit is not None and idx >= limit:
                break
            obj = json.loads(line)


            if "price" in obj and obj["price"] is not None:
                obj["price"] = str(obj["price"])

            return_obj = obj
            yield return_obj


def ensure_asin(df: pd.DataFrame) -> pd.DataFrame:
    """
    Make sure the frame exposes its product key under the name 'asin'.

    The first of several known ID column names found in ``df`` is used. If that
    column is not already 'asin', it is copied into a new 'asin' column on
    ``df`` itself (the frame is modified in place). When no known ID column
    exists, a sample row is printed for non-empty frames and ``df`` is returned
    unchanged.
    """
    known_keys = ("asin", "parent_asin", "product_id", "item_id", "Parent_ASIN", "ParentAsin")
    found = next((name for name in known_keys if name in df.columns), None)

    if found is None:
        if len(df) > 0:
            print("No recognizable ASIN-like key found. Example row:\n", df.head(1).to_dict("records")[0])
        return df

    # 'asin' is checked first, so any other match means 'asin' is absent.
    if found != "asin":
        df["asin"] = df[found]
    return df


def load_category(category: str, n_reviews: int, n_meta: int):
    """
    Build the reviews and metadata DataFrames for a single category.

    At most ``n_reviews`` review records and ``n_meta`` metadata records are
    taken from the start of the category's JSONL files, and every row is tagged
    with a ``category`` column. Returns ``(reviews_df, meta_df)``.
    """
    def _first_rows(url, cap):
        # Take only the first `cap` records from the stream before building the frame.
        records = islice(stream_jsonl(url), cap)
        return pd.DataFrame(records).assign(category=category)

    reviews_frame = _first_rows(
        f"hf://datasets/{REPO}/raw/review_categories/{category}.jsonl",
        n_reviews,
    )
    meta_frame = _first_rows(
        f"hf://datasets/{REPO}/raw/meta_categories/meta_{category}.jsonl",
        n_meta,
    )
    return reviews_frame, meta_frame

**Load & sample each category (streaming), concatenate, then inspect schemas and key columns**

In [None]:
# Load every category (capped at N_PER_CAT reviews and N_META metadata rows
# each) and collect the per-category frames before concatenating once.
all_reviews, all_meta = [], []

for cat in CATEGORIES:
    r_df, m_df = load_category(cat, n_reviews=N_PER_CAT, n_meta=N_META)
    all_reviews.append(r_df)
    all_meta.append(m_df)

reviews_df = pd.concat(all_reviews, ignore_index=True)
meta_df    = pd.concat(all_meta,    ignore_index=True)

# Give both frames an 'asin' column where a known ID column exists.
reviews_df = ensure_asin(reviews_df)
meta_df    = ensure_asin(meta_df)

# Rows without a product key cannot be joined later, so drop them.
if "asin" in reviews_df:
    reviews_df = reviews_df[reviews_df["asin"].notna()]
if "asin" in meta_df:
    meta_df = meta_df[meta_df["asin"].notna()]

print(f"Loaded rows -> reviews: {len(reviews_df):,} | meta: {len(meta_df):,}")
display(reviews_df.head(2))
display(meta_df.head(2))

# ensure_asin may leave a frame without 'asin'; the filters above tolerate
# that, so the summary must too instead of raising KeyError.
for label, frame in (("reviews", reviews_df), ("meta", meta_df)):
    if "asin" in frame:
        print(f"Unique products in {label}: {frame['asin'].nunique():,}")
    else:
        print(f"Unique products in {label}: n/a (no 'asin' column)")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded rows -> reviews: 340,000 | meta: 1,865,169


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,category
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True,All_Beauty
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True,All_Beauty


Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,category,subtitle,author,asin
0,All Beauty,"Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)",4.8,10.0,[],[],,"[{'thumb': 'https://m.media-amazon.com/images/I/41qfjSfqNyL._SS40_.jpg', 'large': 'https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg', 'variant': 'MAIN', 'hi_res': None}, {'thumb': 'https://m.me...",[],Howard Products,[],"{'Package Dimensions': '7.1 x 5.5 x 3 inches; 2.38 Pounds', 'UPC': '617390882781'}",B01CUPMQZE,,All_Beauty,,,B01CUPMQZE
1,All Beauty,"Yes to Tomatoes Detoxifying Charcoal Cleanser (Pack of 2) with Charcoal Powder, Tomato Fruit Extract, and Gingko Biloba Leaf Extract, 5 fl. oz.",4.5,3.0,[],[],,"[{'thumb': 'https://m.media-amazon.com/images/I/41b+11d5igL._SS40_.jpg', 'large': 'https://m.media-amazon.com/images/I/41b+11d5igL.jpg', 'variant': 'MAIN', 'hi_res': 'https://m.media-amazon.com/im...",[],Yes To,[],"{'Item Form': 'Powder', 'Skin Type': 'Acne Prone', 'Brand': 'Yes To', 'Age Range (Description)': 'Adult', 'Unit Count': '10 Fl Oz', 'Is Discontinued By Manufacturer': 'No', 'Item model number': 'S...",B076WQZGPM,,All_Beauty,,,B076WQZGPM


Unique products in reviews: 265,889
Unique products in meta: 1,865,169


In [None]:
# print(reviews_df.columns)
# print(meta_df.columns)
# merged = reviews_df.merge(meta_df, on="parent_asin", how="left", suffixes=("_review", "_meta"))
# print(merged)
# print(merged.columns)
# merged.shape

**Merge reviews with product metadata (join on `asin`, fall back to `parent_asin`)**

In [None]:
# Attach product metadata to every review: join on the review's own `asin`
# first, then fill whatever is still missing from a second join on the
# review's `parent_asin`.
meta_keys = {"asin", "parent_asin", "category"}
meta_keep = ["asin", "parent_asin"] + [c for c in meta_df.columns if c not in meta_keys]

# One metadata row per asin: both left joins below must return exactly one row
# per review, in the same order, because the fallback combines them row by row.
meta_lookup = meta_df[meta_keep].drop_duplicates(subset="asin")

# Use the same suffixes in both joins so a metadata column that clashes with a
# review column (e.g. title, images) gets the same name in m1 and m2.
merge_suffixes = ("_review", "_meta")

m1 = reviews_df.merge(meta_lookup, on="asin", how="left", suffixes=merge_suffixes)


m2 = reviews_df.merge(
    meta_lookup.rename(columns={"asin": "asin_meta2", "parent_asin": "parent_asin_meta2"}),
    left_on="parent_asin",
    right_on="asin_meta2",
    how="left",
    suffixes=merge_suffixes,
)

assert len(m1) == len(m2) == len(reviews_df), "left joins must keep one row per review"


merged = m1.copy()
for col in meta_keep:
    if col in {"asin", "parent_asin"}:
        continue
    # merge() only suffixes a metadata column when reviews_df has a column of
    # the same name; otherwise the column keeps its plain name.
    target = col if col in m2.columns else f"{col}_meta"
    if target not in merged.columns or target not in m2.columns:
        continue
    # Keep the asin-join value; where it is missing take the parent_asin-join value.
    merged[target] = merged[target].where(merged[target].notna(), m2[target])


# Record which metadata key the parent_asin join matched (NaN when it matched nothing).
if "asin_meta2" in m2.columns:
    merged["asin_meta_fallback"] = m2["asin_meta2"]

print("Merged shape:", merged.shape)


# Share of reviews that ended up with at least one non-null metadata field.
meta_signal = [c for c in merged.columns if c.endswith("_meta") or c in ["average_rating", "rating_number", "price", "store", "categories", "details", "title", "images", "videos", "main_category"]]
coverage = merged[meta_signal].notna().any(axis=1).mean() if meta_signal else 0.0
print(f"Rows with ANY meta fields: {coverage:.2%}")

display(merged.head(5))

Merged shape: (340000, 28)
Rows with ANY meta fields: 36.34%


Unnamed: 0,rating,title_review,text,images_review,asin,parent_asin_review,user_id,timestamp,helpful_vote,verified_purchase,...,price,images_meta,videos,store,categories,details,bought_together,subtitle,author,asin_meta_fallback
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588687728923,0,True,...,,"[{'thumb': 'https://m.media-amazon.com/images/I/41VQKmI9uKL._SS40_.jpg', 'large': 'https://m.media-amazon.com/images/I/41VQKmI9uKL.jpg', 'variant': 'MAIN', 'hi_res': 'https://m.media-amazon.com/im...","[{'title': 'Best Hair Product For Summer!', 'url': 'https://www.amazon.com/vdp/04c64593e097481890293339ed4a9481?ref=dp_vse_rvc_0', 'user_id': '/shop/influencer-865f4c65'}, {'title': 'Ouai Texturiz...",HERBIVORE,[],"{'Hair Type': 'Wavy', 'Material Type Free': 'Dairy Free', 'Scent': 'Coconut', 'Liquid Volume': '8 Fluid Ounces', 'Item Form': 'Spray', 'Is Discontinued By Manufacturer': 'No', 'Package Dimensions'...",,,,B00YQ6X8EO
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,1588615855070,1,True,...,,"[{'thumb': 'https://m.media-amazon.com/images/I/41Jq6qGaBBL._SS40_.jpg', 'large': 'https://m.media-amazon.com/images/I/41Jq6qGaBBL.jpg', 'variant': 'MAIN', 'hi_res': 'https://m.media-amazon.com/im...","[{'title': 'Easy to apply!', 'url': 'https://www.amazon.com/vdp/c3ae2c6ce7834850bac583d65d9ddd2e?ref=dp_vse_rvc_0', 'user_id': 'AG33GUSUEUCH54H7UYL2NW4JCELA'}]",Two Goats Apothecary,[],"{'Brand': 'Two Goats Apothecary', 'Item Form': 'Powder', 'Age Range (Description)': 'Adult', 'Unit Count': '2.0 Ounce', 'Package Dimensions': '6.6 x 4.2 x 1.5 inches; 5.61 Ounces'}",,,,B081TJ8YS3
2,5.0,Yes!,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,1589665266052,2,True,...,21.98,,"[{'title': 'Opening the Creamsicle assortment ', 'url': 'https://www.amazon.com/vdp/f4e84c6e3ed8497db3ad7c84a09f7a31?ref=dp_vse_rvc_0', 'user_id': 'AFOK3DPGYFDWNZFIDBEI3S2JVQKQ'}]",New Road Beauty,[],"{'Package Dimensions': '10.5 x 6.4 x 1.6 inches; 2.6 Pounds', 'UPC': '695924647044', 'Manufacturer': 'New Road Beauty'}",,,,B097R46CSY
3,1.0,Synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1643393630220,0,True,...,,"[{'thumb': 'https://m.media-amazon.com/images/I/51fTyNsyylL._SS40_.jpg', 'large': 'https://m.media-amazon.com/images/I/51fTyNsyylL.jpg', 'variant': 'MAIN', 'hi_res': 'https://m.media-amazon.com/im...",[],muaowig,[],"{'Brand': 'muaowig', 'Material': 'Human Hair', 'Extension Length': '12 Inches', 'Hair Type': 'Wavy', 'Material Feature': 'Natural', 'Package Dimensions': '13.94 x 10.43 x 2.32 inches; 13.09 Ounces...",,,,B09JS339BZ
4,5.0,A+,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1609322563534,0,True,...,,"[{'thumb': 'https://m.media-amazon.com/images/I/41cMyE4DTmL._SS40_.jpg', 'large': 'https://m.media-amazon.com/images/I/41cMyE4DTmL.jpg', 'variant': 'MAIN', 'hi_res': 'https://m.media-amazon.com/im...",[],Yinhua,[],{'Package Dimensions': '8.5 x 3.82 x 2.24 inches; 9.14 Ounces'},,,,B08BZ63GMJ
