<a href="https://colab.research.google.com/github/BTT-Cadence-Design-Systems-2A/AI-Studio-Project/blob/main/Cadence_2A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **Install libraries**

In [1]:
!pip install -U datasets huggingface_hub



**Imports & config**

In [2]:
import json
import fsspec
from itertools import islice
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt_tab")

REPO = "McAuley-Lab/Amazon-Reviews-2023"


CATEGORIES = ["Software", "Video_Games", "All_Beauty"]
ALL_CATEGORIES = ["All_Beauty", "Amazon_Fashion", "Appliances", "Arts_Crafts_and_Sewing", "Automotive", "Baby_Products", "Beauty_and_Personal_Care", "Books",
              "CDs_and_Vinyl", "Cell_Phones_and_Accessories", "Clothing_Shoes_and_Jewelry", "Digital_Music", "Electronics", "Gift_Cards", "Grocery_and_Gourmet_Food",
              "Handmade_Products", "Health_and_Household", "Health_and_Personal_Care", "Home_and_Kitchen", "Industrial_and_Scientific",
              "Kindle_Store", "Magazine_Subscriptions", "Movies_and_TV", "Musical_Instruments", "Office_Products", "Patio_Lawn_and_Garden", "Pet_Supplies",
              "Software", "Sports_and_Outdoors", "Subscription_Boxes", "Tools_and_Home_Improvement", "Toys_and_Games", "Video_Games",
              "Unknown"]


N_PER_CAT = 10_000
N_META    = 60_000

pd.set_option("display.max_colwidth", 200)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


**Load & sample each category (streaming) and concatenate**

In [3]:
def stream_jsonl(url: str, limit: int | None = None):
    """
    Stream a JSONL file line-by-line from Hugging Face
    Normalizes mixed-type fields like 'price'
    """
    with fsspec.open(url, "rt") as f:
        for idx, line in enumerate(f):
            if limit is not None and idx >= limit:
                break
            obj = json.loads(line)


            if "price" in obj and obj["price"] is not None:
                obj["price"] = str(obj["price"])

            return_obj = obj
            yield return_obj


def ensure_asin(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure there is an 'asin' column
    """
    for cand in ["asin", "parent_asin", "product_id", "item_id", "Parent_ASIN", "ParentAsin"]:
        if cand in df.columns:
            if "asin" not in df.columns:
                df["asin"] = df[cand]
            return df
    if len(df) > 0:
        print("No recognizable ASIN-like key found. Example row:\n", df.head(1).to_dict("records")[0])
    return df


def load_category(category: str, n_reviews: int, n_meta: int):
    """
    Load one category's reviews and meta as DataFrames
    """
    reviews_url = f"hf://datasets/{REPO}/raw/review_categories/{category}.jsonl"
    meta_url    = f"hf://datasets/{REPO}/raw/meta_categories/meta_{category}.jsonl"

    reviews_df = pd.DataFrame(islice(stream_jsonl(reviews_url), n_reviews)).assign(category=category)
    meta_df    = pd.DataFrame(islice(stream_jsonl(meta_url),    n_meta)).assign(category=category)
    return reviews_df, meta_df

**Inspect schemas and key columns**

In [4]:
all_reviews, all_meta = [], []

for cat in CATEGORIES:
    r_df, m_df = load_category(cat, n_reviews=N_PER_CAT, n_meta=N_META)
    all_reviews.append(r_df)
    all_meta.append(m_df)

reviews_df = pd.concat(all_reviews, ignore_index=True)
meta_df    = pd.concat(all_meta,    ignore_index=True)

reviews_df = ensure_asin(reviews_df)
meta_df    = ensure_asin(meta_df)


if "asin" in reviews_df:
    reviews_df = reviews_df[reviews_df["asin"].notna()]
if "asin" in meta_df:
    meta_df = meta_df[meta_df["asin"].notna()]

print(f"Loaded rows -> reviews: {len(reviews_df):,} | meta: {len(meta_df):,}")
display(reviews_df.head(2))
display(meta_df.head(2))

print(f"Unique products in reviews: {reviews_df['asin'].nunique():,}")
print(f"Unique products in meta: {meta_df['asin'].nunique():,}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


KeyboardInterrupt: 

In [None]:
# print(reviews_df.columns)
# print(meta_df.columns)
# merged = reviews_df.merge(meta_df, on="parent_asin", how="left", suffixes=("_review", "_meta"))
# print(merged)
# print(merged.columns)
# merged.shape

**Helper: ensure_asin + normalize IDs**

In [None]:
meta_keys = {"asin", "parent_asin", "category"}
meta_keep = ["asin", "parent_asin"] + [c for c in meta_df.columns if c not in meta_keys]


m1 = reviews_df.merge(meta_df[meta_keep], on="asin", how="left", suffixes=("_review", "_meta"))


m2 = reviews_df.merge(
    meta_df[meta_keep].rename(columns={"asin": "asin_meta2", "parent_asin": "parent_asin_meta2"}),
    left_on="parent_asin",
    right_on="asin_meta2",
    how="left",
)


merged = m1.copy()
for col in meta_keep:
    if col in {"asin", "parent_asin"}:
        continue
    col_m1 = col
    col_m2 = col + "_m2"
    if col in m2.columns:
        merged[col_m2] = m2[col]
        merged[col] = merged[col].where(merged[col].notna(), merged[col_m2])
        merged.drop(columns=[col_m2], inplace=True)


if "asin_meta2" in m2.columns:
    merged["asin_meta_fallback"] = m2["asin_meta2"]

print("Merged shape:", merged.shape)


meta_signal = [c for c in merged.columns if c.endswith("_meta") or c in ["average_rating", "rating_number", "price", "store", "categories", "details", "title", "images", "videos", "main_category"]]
coverage = merged[meta_signal].notna().any(axis=1).mean() if meta_signal else 0.0
print(f"Rows with ANY meta fields: {coverage:.2%}")

display(merged.head(5))

Merged shape: (30000, 28)
Rows with ANY meta fields: 65.98%


Unnamed: 0,rating,title_review,text,images_review,asin,parent_asin_review,user_id,timestamp,helpful_vote,verified_purchase,...,price,images_meta,videos,store,categories,details,bought_together,subtitle,author,asin_meta_fallback
0,1.0,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1562182632076,0,False,...,34.99,,"[{'title': 'McAfee REAL Support', 'url': 'https://www.amazon.com/vdp/1d78f93b842f4ad2b7a5784562785995?ref=dp_vse_rvc_0', 'user_id': 'AHL62TTXAOHG7TW7I42NKJJYMWXQ'}, {'title': 'How to Activate and ...",McAfee,"[Software, Antivirus & Security, Internet Security Suites]","{'Product Dimensions': '7.5 x 5.5 x 0.5 inches; 0.49 Ounces', 'Item model number': 'MTP00EAMXRAAS', 'Date First Available': 'September 26, 2018', 'Manufacturer': 'McAfee', 'Country of Origin': 'USA'}",,,,B0BQSK9QCF
1,5.0,Lots of Fun,"I love playing tapped out because it is fun to watch the town grow by earning money and buying buildings. I love helping my neighbors, too.",[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1424120336000,0,True,...,0.0,"[{'large': 'https://m.media-amazon.com/images/I/A1oXfoxcSJL.png', 'variant': 'MAIN'}, {'large': 'https://m.media-amazon.com/images/I/A1REwvZmyCL.png', 'variant': 'PT01'}, {'large': 'https://m.medi...","[{'title': '', 'url': '', 'user_id': ''}]",Electronic Arts,[],"{'Release Date': '2013', 'Date first listed on Amazon': 'June 24, 2013', 'Developed By': 'Electronic Arts', 'Size': '73.2MB', 'Version': '4.62.0', 'Application Permissions': ['ACCESS_DOWNLOAD_MANA...",,,,B00CTQ6SIG
2,5.0,Light Up The Dark,"I love this flashlight app! It really illuminates the dark, very cool! Get this app, you will love it, really!",[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1362399267000,0,True,...,,,,,,,,,,
3,4.0,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,1561061428662,0,True,...,0.0,"[{'large': 'https://m.media-amazon.com/images/I/A1ZIEO4ZTEL.jpg', 'variant': 'MAIN'}, {'large': 'https://m.media-amazon.com/images/I/B1uBRtRYlVL.png', 'variant': 'PT01'}, {'large': 'https://m.medi...","[{'title': '', 'url': 'https://images-na.ssl-images-amazon.com/images/I/D1uRaN4cXyS.mp4', 'user_id': ''}]",SG Interactive,[],"{'Release Date': '2014', 'Date first listed on Amazon': 'May 22, 2014', 'Developed By': 'SG Interactive', 'Size': '93.3MB', 'Version': '39.0.0', 'Application Permissions': ['Access information abo...",,,,B00KCYMAWK
4,4.0,I am not that good at it but my kids are,Cute game. I am not that good at it but my kids are. We love Nik Wallenda!,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,1418257196000,0,True,...,0.99,"[{'large': 'https://m.media-amazon.com/images/I/51kWAmsxozL.png', 'variant': 'MAIN'}, {'large': 'https://m.media-amazon.com/images/I/51KW52oWAUL.png', 'variant': 'PT01'}, {'large': 'https://m.medi...","[{'title': '', 'url': '', 'user_id': ''}]",Tapinator,[],"{'Release Date': '2014', 'Date first listed on Amazon': 'November 2, 2014', 'Developed By': 'Tapinator', 'Size': '26.4MB', 'Version': '1.0', 'Application Permissions': ['Access information about n...",,,,B00P1RK566


# **Milestone #1: Sentiment Analysis of a Singular Review**


Goal: Take the reviews dataframe, only maintain the rating, title, category, and text columns, and then train a model that predicts the rating given a review text


In [7]:
def load_category_into_review(category: str, n_reviews: int):
    """
    Load one category's reviews as DataFrames
    """
    reviews_url = f"hf://datasets/{REPO}/raw/review_categories/{category}.jsonl"

    data = (
        {k: row.get(k) for k in ["rating", "title", "text"]}
        for row in islice(stream_jsonl(reviews_url), n_reviews)
    )

    reviews_df = pd.DataFrame(data).assign(category=category)
    return reviews_df

In [9]:
sentiment_reviews =  []

for cat in ALL_CATEGORIES:
    r_df = load_category_into_review(cat, n_reviews=N_PER_CAT)
    sentiment_reviews.append(r_df)

reviews_df_milestone1 = pd.concat(sentiment_reviews, ignore_index=True)


print(f"Loaded rows -> reviews: {len(reviews_df_milestone1):,}")
display(reviews_df_milestone1.head(2))

Loaded rows -> reviews: 340,000


Unnamed: 0,rating,title,text,category
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",All_Beauty
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",All_Beauty


In [10]:
reviews_df_milestone1.info()
reviews_df_milestone1['rating'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340000 entries, 0 to 339999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   rating    340000 non-null  float64
 1   title     340000 non-null  object 
 2   text      340000 non-null  object 
 3   category  340000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 10.4+ MB


Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5.0,224659
4.0,53930
3.0,26023
1.0,21621
2.0,13767


## Milestone #1: Data Cleaning

In [11]:
reviews_df_milestone1.isna().sum()

Unnamed: 0,0
rating,0
title,0
text,0
category,0


### Text Normalization (removing punctuation)

In [12]:
import string


def remove_punctuation(text: str) -> str:
    """
    Function removes all punctuation from a string
    """
    if not isinstance(text, str):
        return ""
    return text.translate(str.maketrans("", "", string.punctuation))

In [13]:
"""
   Creates clean_review and clean_title and clean_review. These two columns will be used during model training.
"""
reviews_df_milestone1['clean_review'] = (
    reviews_df_milestone1['text']
    .str.lower()
    .apply(remove_punctuation)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

reviews_df_milestone1['clean_title'] = (
    reviews_df_milestone1['title']
    .str.lower()
    .apply(remove_punctuation)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

### Lemmitization of Reviews

In [14]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmas)

In [15]:
reviews_df_milestone1['lemmatized_review'] = reviews_df_milestone1['clean_review'].apply(lemmatize_text)
reviews_df_milestone1['lemmatized_title'] = reviews_df_milestone1['clean_title'].apply(lemmatize_text)

### Creating Sentiment Labels


In [16]:
def create_sentiment_label(rating: int) -> str:
  if rating >= 4:
    return 'positive'
  elif rating <= 2:
    return 'negative'
  else:
    return 'neutral'

In [17]:
reviews_df_milestone1['sentiment_labels'] = (
    reviews_df_milestone1['rating']
    .apply(create_sentiment_label)
)

In [None]:
reviews_df_milestone1.head()

Unnamed: 0,rating,title,text,category,clean_review,clean_title,lemmatized_review,lemmatized_title,sentiment_labels
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",All_Beauty,this spray is really nice it smells really good goes on really fine and does the trick i will say it feels like you need a lot of it though to get the texture i want i have a lot of hair medium th...,such a lovely scent but not overpowering,this spray is really nice it smell really good go on really fine and doe the trick i will say it feel like you need a lot of it though to get the texture i want i have a lot of hair medium thickne...,such a lovely scent but not overpowering,positive
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",All_Beauty,this product does what i need it to do i just wish it was odorless or had a soft coconut smell having my head smell like an orange coffee is offputting granted i did know the smell was described b...,works great but smells a little weird,this product doe what i need it to do i just wish it wa odorless or had a soft coconut smell having my head smell like an orange coffee is offputting granted i did know the smell wa described but ...,work great but smell a little weird,positive
2,5.0,Yes!,"Smells good, feels great!",All_Beauty,smells good feels great,yes,smell good feel great,yes,positive
3,1.0,Synthetic feeling,Felt synthetic,All_Beauty,felt synthetic,synthetic feeling,felt synthetic,synthetic feeling,negative
4,5.0,A+,Love it,All_Beauty,love it,a,love it,a,positive


### Tokenization of Reviews


In [None]:
# documents = reviews_df_milestone1['clean_review'].tolist()

In [None]:
# vectorizer = TfidfVectorizer(
#     stop_words="english",   # remove english stopwords like this, a, the, etc
#     # max_features=5000,      # keep top 5000 words (tune this)
# )
# X = vectorizer.fit_transform(documents)

In [None]:
# print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# df_tfidf.head()

In [23]:
from nltk.tokenize import word_tokenize
reviews_df_milestone1['tokenized_review'] = reviews_df_milestone1['clean_review'].apply(word_tokenize)

In [24]:
reviews_df_milestone1.head(5)

Unnamed: 0,rating,title,text,category,clean_review,clean_title,lemmatized_review,lemmatized_title,sentiment_labels,tokenized_review
0,5.0,Such a lovely scent but not overpowering.,"This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, med...",All_Beauty,this spray is really nice it smells really good goes on really fine and does the trick i will say it feels like you need a lot of it though to get the texture i want i have a lot of hair medium th...,such a lovely scent but not overpowering,this spray is really nice it smell really good go on really fine and doe the trick i will say it feel like you need a lot of it though to get the texture i want i have a lot of hair medium thickne...,such a lovely scent but not overpowering,positive,"[this, spray, is, really, nice, it, smells, really, good, goes, on, really, fine, and, does, the, trick, i, will, say, it, feels, like, you, need, a, lot, of, it, though, to, get, the, texture, i,..."
1,4.0,Works great but smells a little weird.,"This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was descri...",All_Beauty,this product does what i need it to do i just wish it was odorless or had a soft coconut smell having my head smell like an orange coffee is offputting granted i did know the smell was described b...,works great but smells a little weird,this product doe what i need it to do i just wish it wa odorless or had a soft coconut smell having my head smell like an orange coffee is offputting granted i did know the smell wa described but ...,work great but smell a little weird,positive,"[this, product, does, what, i, need, it, to, do, i, just, wish, it, was, odorless, or, had, a, soft, coconut, smell, having, my, head, smell, like, an, orange, coffee, is, offputting, granted, i, ..."
2,5.0,Yes!,"Smells good, feels great!",All_Beauty,smells good feels great,yes,smell good feel great,yes,positive,"[smells, good, feels, great]"
3,1.0,Synthetic feeling,Felt synthetic,All_Beauty,felt synthetic,synthetic feeling,felt synthetic,synthetic feeling,negative,"[felt, synthetic]"
4,5.0,A+,Love it,All_Beauty,love it,a,love it,a,positive,"[love, it]"


##