In [None]:
# Imports 
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import torch
import umap
import hdbscan
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

EDA Will Later be Moved to a seperate file. 

In [None]:
# Using the clean data and embedding model to create BERTopic model

# Loading the clean df and gettign EDA viz  
df_path = '../cleaned_data/amazon_clean/amazon_reviewsAll_Beauty.jsonl'

#Reading to get the first rows
records = []
with open(df_path, 'r') as f:
    for i, line in enumerate(f):
        if i < 5:  # Read first 5 records
            records.append(json.loads(line))
        else:
            break

#Structure of the df 
print("Sample records:")
for i, record in enumerate(records):
    print(f"\nRecord {i+1}:")
    print(json.dumps(record, indent=2))

# Load all data into DataFrame
df_reviews = pd.read_json(df_path, lines=True)
print(f"\n\nDataset shape: {df_reviews.shape}")
print(f"\nColumn names: {df_reviews.columns.tolist()}")
print(f"\nFirst few rows:")
print(df_reviews.head())
print(f"\nData types:\n{df_reviews.dtypes}")


Sample records:

Record 1:
{
  "rating": 5.0,
  "title": "such a lovely scent but not overpowering",
  "text": "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!",
  "images": [],
  "asin": "B00YQ6X8EO",
  "parent_asin": "B00YQ6X8EO",
  "user_id": "AGKHLEW2SOWHNMFQIJGBECAF7INQ",
  "timestamp": 1588687728923,
  "helpful_vote": 0,
  "verified_purchase": true
}

Record 2:
{
  "rating": 4.0,
  "title": "works great but smells a little weird",
  "text": "This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was described but I was hoping it would be light)",
  "images": [],
  "asin": "B081TJ8YS3",
  "parent_asin": 

In [9]:
# Inspecting the embeddings  


emb_path = "../cleaned_data/amazon_vectors/embeddings/embeddings.npy"
embeddings = np.load(emb_path)

print("Embedding shape:", embeddings.shape)
print("First vector:", embeddings[0][:10])
print("Number of reviews:", len(df_reviews))




Embedding shape: (5000, 384)
First vector: [-0.02544238 -0.05447146  0.06444004 -0.07990921 -0.01119874  0.04295129
  0.06123012  0.05920129 -0.00847748 -0.06535   ]
Number of reviews: 701528


In [10]:
# Matching the embedings with the reviews since the count was  off 

# Getting the first 5k reviews for testing
df_5k = df_reviews.iloc[:5000].copy()
documents = df_5k["text"].astype(str).tolist()

#Checking if shapes match 
print("Subset shape:", df_5k.shape)
print("Docs:", len(documents))
print("Embeddings:", embeddings.shape[0])


Subset shape: (5000, 10)
Docs: 5000
Embeddings: 5000


In [None]:
# Building the BERT Model 


topic_model = BERTopic(
    embedding_model=None,        # Set to none -> we already have embeddings (Looking at the first 5000)
    calculate_probabilities=True,
    verbose=True,
)


In [12]:
#Fitting it to get the following 

# UMAP -> To reduce the dimensionality
#HDBSCAN → clustering

#c-TF-IDF → topic extraction // Will allow us to get the UMAP progress and DBSCAN cluster count and the topic reduction logs

topics, probs = topic_model.fit_transform(documents, embeddings)


2025-12-05 06:33:01,848 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-05 06:33:39,238 - BERTopic - Dimensionality - Completed ✓
2025-12-05 06:33:39,239 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-05 06:33:40,394 - BERTopic - Cluster - Completed ✓
2025-12-05 06:33:40,403 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-05 06:33:40,747 - BERTopic - Representation - Completed ✓


In [None]:
# Inspecting the topics that were generated

topic_info = topic_model.get_topic_info()
topic_info.head(15)
#topic_info.shape -> 78 Topics were identified. 

# Adding the topics back to the df 

df_5k["topic"] = topics
df_5k.head()

# Saving this df with topics
output_path = "../cleaned_data/amazon_clean/



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1430,-1_it_and_skin_the,"[it, and, skin, the, to, my, is, this, for, of]",[I really like this OGANA CELL Peptide Concent...
1,0,338,0_nail_nails_polish_gel,"[nail, nails, polish, gel, the, to, coat, and,...",[This RSTYLE UV Gel Nail Polish Set is beautif...
2,1,303,1_brush_brushes_bristles_the,"[brush, brushes, bristles, the, teeth, to, is,...",[We have been through several Clarisonic brush...
3,2,188,2_hair_shampoo_conditioner_it,"[hair, shampoo, conditioner, it, my, and, this...",[It works! Comes down to preference. I use dry...
4,3,173,3_lashes_mascara_eyelashes_eyeliner,"[lashes, mascara, eyelashes, eyeliner, liner, ...",[NOTE: I received a free sample of this produ...
5,4,116,4_cleanser_scrub_face_it,"[cleanser, scrub, face, it, skin, body, wash, ...",[This is a fantastic cleanser by Higher Educat...
6,5,107,5_headbands_headband_head_are,"[headbands, headband, head, are, they, these, ...",[If I have a choice between a plain headband a...
7,6,99,6_bottles_spray_bottle_sprayer,"[bottles, spray, bottle, sprayer, are, these, ...",[These little spray bottles are awesome. We f...
8,7,96,7_mask_masks_sheet_face,"[mask, masks, sheet, face, skin, you, serum, t...","[In this pack you get 3 masks for $18, so this..."
9,8,91,8_serum_skin_retinol_this,"[serum, skin, retinol, this, moisturizer, is, ...",[This is a nice Retinol cream that is a little...
