<a href="https://colab.research.google.com/github/AnkitaSK/Proj3_RoboReviews/blob/main/ClusterProductCategoryOnReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
import polars as pl

In [74]:
# Load large CSV file
df = pl.read_csv("Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv")

In [75]:
#print(df.head())  # Preview the first 5 rows
#print(df.shape)

In [76]:
reviews_text = df["reviews.text"]
reviews_text_list = [review for review in reviews_text if review is not None]
subset = reviews_text_list
#subset = reviews_text_list[2100:3200]
#print(subset)

In [77]:
#!pip install bertopic

In [78]:
from sklearn.feature_extraction.text import CountVectorizer

# remove stopwords
vectorizer_model = CountVectorizer(ngram_range=(1,2), stop_words='english')

In [79]:
from sentence_transformers import SentenceTransformer

# custom embeddings
embedding_model = SentenceTransformer('all-MiniLM-l6-v2')

In [80]:
#!pip install umap-learn

In [81]:
from umap import UMAP

umap_model = UMAP(
    n_neighbors=3,
    n_components=3,
    min_dist=0.05
    )

In [82]:
#!pip install hdbscan

In [83]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(
    min_cluster_size=80,
    min_samples=40,
    prediction_data=True,
    gen_min_span_tree=True
    )

In [84]:
#!pip install bertopic

In [85]:
from bertopic import BERTopic

In [86]:
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    language='english',
    calculate_probabilities=True,
    verbose=True
    )
topics, probabilities = topic_model.fit_transform(subset)

2024-12-07 11:45:00,660 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/886 [00:00<?, ?it/s]

2024-12-07 11:45:08,620 - BERTopic - Embedding - Completed ✓
2024-12-07 11:45:08,621 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-07 11:45:26,210 - BERTopic - Dimensionality - Completed ✓
2024-12-07 11:45:26,212 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-07 11:45:33,318 - BERTopic - Cluster - Completed ✓
2024-12-07 11:45:33,328 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-07 11:45:34,503 - BERTopic - Representation - Completed ✓


In [87]:
#assert all(doc in topics for doc in reviews_text_list), "Subset must be part of the original dataset!"

In [88]:
# Group similar topics
#topic_model.reduce_topics(subset, nr_topics=2)

In [89]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,13831,-1_tablet_great_use_kids,"[tablet, great, use, kids, love, bought, good,...","[Great, these are great, great]"
1,0,4194,0_batteries_brand_long_price,"[batteries, brand, long, price, battery, work,...","[Great price for batteries and they work!, Gre..."
2,1,1055,1_kindle_read_reading_reader,"[kindle, read, reading, reader, light, books, ...",[I read a lot and have loved my kindle paperwh...
3,2,630,2_echo_tap_alexa_sound,"[echo, tap, alexa, sound, speaker, music, port...",[I bought the Amazon Tap when it was on sale. ...
4,3,524,3_tablet_apps_kids_games,"[tablet, apps, kids, games, great, storage, bo...",[This has the all speed and storage I needed f...
5,4,483,4_quality_price_good_good quality,"[quality, price, good, good quality, great, gr...","[good price. good quality, Good Price! Good qu..."
6,5,369,5_old_year old_year_games,"[old, year old, year, games, case, loves, kids...","[I bought this for my 6 year old daughter, it ..."
7,6,343,6_work_work great_far_great,"[work, work great, far, great, price, good, fa...","[They work great!, They work well and they are..."
8,7,333,7_tablet_great tablet_price_tablet price,"[tablet, great tablet, price, tablet price, gr...",[This is a great tablet. Great for the price. ...
9,8,312,8_long_48_lasting_long lasting,"[long, 48, lasting, long lasting, don long, pr...",[I love these batteries. I am so happy that I ...


In [90]:
topic_model.get_topic(0)

[('batteries', 0.043265806671289646),
 ('brand', 0.017479015879301146),
 ('long', 0.015191622403645058),
 ('price', 0.014189677219227221),
 ('battery', 0.013547394872205002),
 ('work', 0.012731002235219117),
 ('just', 0.012095115795158973),
 ('amazon', 0.012087891624878576),
 ('brands', 0.011337247693943285),
 ('good', 0.011136937264587268)]

In [91]:
topic_model.visualize_barchart(top_n_topics=10)

In [92]:
topic_model.visualize_topics()

In [93]:
topic_model.visualize_heatmap()

In [94]:
topic_model.visualize_hierarchy()

In [95]:
topic_model.merge_topics(subset, topics_to_merge=[3, 7])

In [96]:
print("After merging topics:")
print(topic_model.get_topic_info())

After merging topics:
    Topic  Count                                               Name  \
0      -1  13831                           -1_tablet_great_use_kids   
1       0   4194                       0_batteries_brand_long_price   
2       1   1055                       1_kindle_read_reading_reader   
3       2    857                  2_tablet_great_great tablet_price   
4       3    630                             3_echo_tap_alexa_sound   
5       4    483                  4_quality_price_good_good quality   
6       5    369                          5_old_year old_year_games   
7       6    343                        6_work_work great_far_great   
8       7    312                     7_long_48_lasting_long lasting   
9       8    311                8_tablet_product_great_great tablet   
10      9    304  9_duracell_batteries_duracell batteries_great ...   
11     10    287                     10_good good_gift_loves_bought   
12     11    278               11_energizer_xbox_durace

In [97]:
topic_model.visualize_barchart(top_n_topics=10)