In [1]:
# Imports 
import json
import os

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import torch
import umap
import hdbscan
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Inspecting cleaned dataset & embeddings 

In [3]:
# Load the cleaned review data and take a first look at it before building
# the BERTopic model.

df_path = '../cleaned_data/amazon_clean/amazon_reviewsAll_Beauty.jsonl'

# Peek at the first few raw JSON lines to see the record structure.
# The encoding is given explicitly: open() otherwise falls back to the
# platform default, while pd.read_json below decodes as UTF-8 by default, so
# the two reads of the same file could disagree on non-UTF-8 locales.
n_preview = 5
with open(df_path, 'r', encoding='utf-8') as f:
    # zip() stops as soon as range() is exhausted, so only n_preview lines
    # are read from the file.
    records = [json.loads(line) for _, line in zip(range(n_preview), f)]

# Structure of the raw records
print("Sample records:")
for i, record in enumerate(records):
    print(f"\nRecord {i+1}:")
    print(json.dumps(record, indent=2))

# Load the full JSON-lines file into a DataFrame
df_reviews = pd.read_json(df_path, lines=True)
print(f"\n\nDataset shape: {df_reviews.shape}")
print(f"\nColumn names: {df_reviews.columns.tolist()}")
print("\nFirst few rows:")
print(df_reviews.head())
print(f"\nData types:\n{df_reviews.dtypes}")


Sample records:

Record 1:
{
  "rating": 5.0,
  "title": "such a lovely scent but not overpowering",
  "text": "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!",
  "images": [],
  "asin": "B00YQ6X8EO",
  "parent_asin": "B00YQ6X8EO",
  "user_id": "AGKHLEW2SOWHNMFQIJGBECAF7INQ",
  "timestamp": 1588687728923,
  "helpful_vote": 0,
  "verified_purchase": true
}

Record 2:
{
  "rating": 4.0,
  "title": "works great but smells a little weird",
  "text": "This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was described but I was hoping it would be light)",
  "images": [],
  "asin": "B081TJ8YS3",
  "parent_asin": 

In [4]:
# Load the precomputed embedding matrix and compare its size with the number
# of reviews loaded above.

emb_path = "../cleaned_data/amazon_vectors/embeddings/embeddings.npy"
embeddings = np.load(emb_path)

print("Embedding shape:", embeddings.shape)
# First ten components of the first vector, as a quick look at the values
print("First vector:", embeddings[0, :10])
# Row count of the full review table, for comparison with the vector count
print("Number of reviews:", df_reviews.shape[0])




Embedding shape: (5000, 384)
First vector: [-0.02544238 -0.05447146  0.06444004 -0.07990921 -0.01119874  0.04295129
  0.06123012  0.05920129 -0.00847748 -0.06535   ]
Number of reviews: 701528


In [5]:
# Align the reviews with the embeddings, since the two counts were off: keep
# only as many reviews as there are embedding vectors.
# NOTE(review): this assumes the vectors were computed for the first rows of
# the cleaned file, in file order — confirm against the embedding script.

# Derive the subset size from the embedding matrix instead of hard-coding it,
# so the two cannot drift apart if the embedding file is regenerated.
n_embedded = embeddings.shape[0]
df_5k = df_reviews.iloc[:n_embedded].copy()
documents = df_5k["text"].astype(str).tolist()

# Check that the shapes match
print("Subset shape:", df_5k.shape)
print("Docs:", len(documents))
print("Embeddings:", embeddings.shape[0])

# Fail fast here rather than inside the model fit if there are fewer reviews
# than vectors.
assert len(documents) == n_embedded, (
    f"{len(documents)} documents but {n_embedded} embedding vectors"
)


Subset shape: (5000, 10)
Docs: 5000
Embeddings: 5000


# Building and Fitting the BERTopic Model

In [6]:
# Build the BERTopic model.

# UMAP is stochastic, so without a fixed seed every fit yields different
# clusters and topic ids, and the saved results cannot be reproduced.
# The other parameters are meant to mirror BERTopic's documented default UMAP
# configuration — NOTE(review): confirm against the installed version.
# Expect topic ids and counts to differ from runs made without a seed.
SEED = 42
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    low_memory=False,
    random_state=SEED,
)

topic_model = BERTopic(
    embedding_model=None,        # None -> precomputed embeddings are passed at fit time
    umap_model=umap_model,
    calculate_probabilities=True,
    verbose=True,
)


In [7]:
# Fit the model on the documents together with their precomputed embeddings.
#
# Stages, as described in the log output below (verbose=True):
#   UMAP     -> dimensionality reduction
#   HDBSCAN  -> clustering
#   c-TF-IDF -> topic extraction
#
# Returns one topic id per document plus the document-topic probabilities.

topics, probs = topic_model.fit_transform(documents, embeddings=embeddings)


2025-12-05 08:12:44,212 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2025-12-05 08:13:32,322 - BERTopic - Dimensionality - Completed ✓
2025-12-05 08:13:32,325 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-05 08:13:34,046 - BERTopic - Cluster - Completed ✓
2025-12-05 08:13:34,064 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-05 08:13:34,543 - BERTopic - Representation - Completed ✓


In [8]:
# Inspect the topics that were generated.

topic_info = topic_model.get_topic_info()
# A notebook only renders a cell's last expression, so the bare
# `topic_info.head(15)` that used to sit here was never shown.
# `display` is provided by the Jupyter/IPython kernel.
display(topic_info.head(15))
# topic_info.shape -> 78 topics were identified (per the original run's note).

# Attach each review's topic id to the subset.
# NOTE(review): topic -1 appears in the output; presumably the outlier
# bucket — confirm.
df_5k["topic"] = topics
df_5k.head()



Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,topic
0,5,such a lovely scent but not overpowering,This spray is really nice. It smells really go...,[],B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,True,2
1,4,works great but smells a little weird,"This product does what I need it to do, I just...",[],B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,True,-1
2,5,yes,"Smells good, feels great!",[],B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,True,4
3,1,synthetic feeling,Felt synthetic,[],B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,True,47
4,5,a,Love it,[],B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,True,50


## Saving results for reproducibility

In [10]:
# Save all data and the model so the steps above do not have to be redone.

results_dir = "../data/bert_results"
# Create the output folder up front: none of the writers below creates it,
# so on a fresh checkout the first save would otherwise fail.
os.makedirs(results_dir, exist_ok=True)

# Reviews with their assigned topic
df_5k.to_csv(f"{results_dir}/bertopic_5k_reviews_with_topics.csv", index=False)

# Topic assignments, one id per line in document order
with open(f"{results_dir}/bertopic_topic_ids.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{t}\n" for t in topics)

# Topic representations: the words of each topic with their weights.
# UTF-8 is set explicitly so non-ASCII topic words cannot fail on platforms
# whose default text encoding is narrower.
with open(f"{results_dir}/bertopic_topic_words.txt", "w", encoding="utf-8") as f:
    for topic_id, words in topic_model.get_topics().items():
        f.write(f"Topic {topic_id}:\n")
        for word, weight in words:
            f.write(f"   {word}: {weight}\n")
        f.write("\n")

# Document-topic probability matrix
np.save(f"{results_dir}/bertopic_topic_probabilities.npy", probs)

# Reduced embeddings (UMAP output) taken from the fitted reducer
umap_embeddings = topic_model.umap_model.embedding_
np.save(f"{results_dir}/bertopic_umap_embeddings.npy", umap_embeddings)

# The fitted model itself
topic_model.save(f"{results_dir}/bertopic_model")



# Visuals of the Topic Modeling

In [None]:
# Topic Overview (UMAP SCATTER)

# Reload the saved model so this section can run without refitting.
topic_model = BERTopic.load("../data/bert_results/bertopic_model")

# Make sure the visuals folder exists before any figure is written.
# NOTE(review): write_html is not expected to create missing folders.
visuals_dir = "../data/bert_results/visuals"
os.makedirs(visuals_dir, exist_ok=True)

# Limit the plot to the top 10 topics
fig = topic_model.visualize_topics(top_n_topics=10)
fig.write_html(f"{visuals_dir}/vis_topics_overview.html")

# PNG export is disabled: fig.write_image(...) kept raising errors, so the
# image was saved manually from the rendered figure.
# NOTE(review): static export may need an extra image-export dependency.
# fig.write_image("../data/bert_results/visuals/vis_topics_overview.png")


In [None]:
# Overview of all topics (no top-n limit).
fig2 = topic_model.visualize_topics()
# Write fig2: this used to write `fig`, which saved the top-10 figure from
# the previous cell under the all-topics filename.
fig2.write_html("../data/bert_results/visuals/vis_topics_overview_60.html")




In [6]:
# Topic hierarchy, limited to the top 10 topics.

hierarchy_html = "../data/bert_results/visuals/vis_topic_hierarchy.html"
fig = topic_model.visualize_hierarchy(top_n_topics=10)
fig.write_html(hierarchy_html)

# PNG export is left disabled:
# fig.write_image("../data/bert_results/visuals/vis_topic_hierarchy.png")

In [7]:
# Topic similarity heatmap, limited to the top 10 topics.

heatmap_html = "../data/bert_results/visuals/vis_topics_heatmap.html"
fig_heatmap = topic_model.visualize_heatmap(top_n_topics=10)
fig_heatmap.write_html(heatmap_html)

# PNG export is left disabled:
# fig_heatmap.write_image("../data/bert_results/visuals/vis_topics_heatmap.png")