<a href="https://colab.research.google.com/github/AnkitaSK/Proj3_RoboReviews/blob/main/ClusterProductCategoryOnReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import polars as pl

In [32]:
#!unzip fine_tuned_bert.zip

unzip:  cannot find or open ./fine_tuned_bert.zip, ./fine_tuned_bert.zip.zip or ./fine_tuned_bert.zip.ZIP.


In [33]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizer

# Restore the fine-tuned classifier and its matching tokenizer; both are read
# from the same local directory.
tokenizer = BertTokenizer.from_pretrained("./fine_tuned_bert")
model = BertForSequenceClassification.from_pretrained("./fine_tuned_bert")

In [34]:
# Load the reviews into a Polars DataFrame (path is relative to the working directory).
df = pl.read_csv("reviews_with_sentiments.csv")

In [36]:
# Keep only rows that have both a review text and a rating.
df = df.filter(
    pl.col("reviews.text").is_not_null() & pl.col("reviews.rating").is_not_null()
)

In [39]:
# Put the model in evaluation mode before inference; the call returns the
# module itself, so the cell output is the model's repr.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [42]:
from torch.utils.data import DataLoader
import torch

# Prepare DataLoader
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that serves items of a review sequence by position."""

    def __init__(self, reviews):
        """Store ``reviews`` as-is (no copy); it must support ``len`` and indexing."""
        self.reviews = reviews

    def __getitem__(self, idx):
        """Return the review stored at position ``idx``."""
        review = self.reviews[idx]
        return review

    def __len__(self):
        """Return the number of stored reviews."""
        return len(self.reviews)

# Serve the review texts in fixed-size batches. shuffle=False keeps the
# predictions in the same row order as `df`, which the column assignment at
# the end of this cell relies on.
dataset = Dataset(df["reviews.text"].to_list())
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

# Truncate to a length both the tokenizer and the model's position embeddings
# support. `truncation=True` alone relies on the tokenizer carrying a usable
# `model_max_length`; the explicit cap keeps over-long reviews from overflowing
# the position-embedding table.
max_length = min(tokenizer.model_max_length, model.config.max_position_embeddings)

# Predict sentiments in batches
sentiments = []
label_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}

# no_grad(): no autograd graph is recorded, so the forward pass uses less
# memory and runs faster.
with torch.no_grad():
    for batch in dataloader:
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
            padding=True,
        )

        # Move input tensors to the same device as the model
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Forward pass. Softmax is monotonic, so the arg-max of the raw logits
        # already identifies the predicted class.
        logits = model(**inputs).logits
        batch_labels = logits.argmax(dim=-1).tolist()

        # Map numerical labels to sentiment classes
        sentiments.extend(label_mapping[label] for label in batch_labels)

# One prediction per row is required before attaching the column.
assert len(sentiments) == df.height, "prediction count does not match number of rows"
df = df.with_columns(pl.Series(name="Sentiment", values=sentiments))

In [43]:
# Plain-text preview of the frame, now including the predicted "Sentiment" column.
print(df)

shape: (28_332, 26)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ id        ┆ dateAdded ┆ dateUpdat ┆ name      ┆ … ┆ reviews.u ┆ sourceURL ┆ reviews.r ┆ Sentimen │
│ ---       ┆ ---       ┆ ed        ┆ ---       ┆   ┆ sername   ┆ s         ┆ ating_upd ┆ t        │
│ str       ┆ str       ┆ ---       ┆ str       ┆   ┆ ---       ┆ ---       ┆ ate       ┆ ---      │
│           ┆           ┆ str       ┆           ┆   ┆ str       ┆ str       ┆ ---       ┆ str      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆ i64       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ AVpgNzjwL ┆ 2015-10-3 ┆ 2019-04-2 ┆ AmazonBas ┆ … ┆ Byger     ┆ https://w ┆ 1         ┆ Negative │
│ JeJML43Kp ┆ 0T08:59:3 ┆ 5T09:08:1 ┆ ics AAA   ┆   ┆ yang      ┆ ww.barcod ┆           ┆          │
│ xn        ┆ 2Z        ┆ 6Z        ┆ Performan ┆   ┆           ┆ able.

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

# Term counter for the topic representations: unigrams and bigrams, with
# English stop words removed.
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
)

In [47]:
from sentence_transformers import SentenceTransformer

# custom embeddings: sentence-transformers model used to embed the review texts
# (fetched by name on first use; see the download progress in the cell output).
embedding_model = SentenceTransformer('all-MiniLM-l6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [49]:
#!pip install umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [50]:
from umap import UMAP

# Dimensionality reduction applied to the sentence embeddings before clustering.
# UMAP is stochastic: without a fixed seed the reduced embeddings -- and with
# them the clusters and topic ids produced downstream -- differ on every run,
# which makes topic numbers written to disk or referenced by id (e.g. in
# merge_topics) meaningless after a re-run. Fixing `random_state` makes the
# pipeline reproducible (at the cost of UMAP's parallelism).
umap_model = UMAP(
    n_neighbors=3,
    n_components=3,
    min_dist=0.05,
    random_state=42,
)

In [51]:
#!pip install hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/4.2 MB[0m [31m20.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.2/4.2 MB[0m [31m74.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hdbscan
Successfully installed hdbscan-0.8.40


In [52]:
from hdbscan import HDBSCAN

# Density-based clusterer that groups the reduced embeddings into topics.
hdbscan_model = HDBSCAN(
    min_samples=40,
    min_cluster_size=80,
    gen_min_span_tree=True,
    prediction_data=True,
)

In [53]:
#!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.16.4


In [54]:
from bertopic import BERTopic

# Assemble the topic model from the custom components configured above:
# embed -> reduce (UMAP) -> cluster (HDBSCAN) -> describe (CountVectorizer).
topic_model = BERTopic(
    language="english",
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)

In [55]:
# Select the review texts of a single sentiment class for topic modelling.
sentiment_to_cluster = "Positive"  # Options: Positive, Neutral, Negative
filtered_reviews = (
    df.filter(pl.col("Sentiment") == sentiment_to_cluster)
    .get_column("reviews.text")
    .to_list()
)

In [59]:
# Fit BERTopic model on filtered reviews
# `topics` holds one topic id per input document (-1 is the outlier topic, see
# the table below); `probs` holds the topic probabilities requested via
# calculate_probabilities=True.
topics, probs = topic_model.fit_transform(filtered_reviews)

# Display the discovered topics
print(topic_model.get_topic_info())

2024-12-13 16:16:27,719 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/813 [00:00<?, ?it/s]

2024-12-13 16:16:34,689 - BERTopic - Embedding - Completed ✓
2024-12-13 16:16:34,690 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-13 16:16:45,081 - BERTopic - Dimensionality - Completed ✓
2024-12-13 16:16:45,083 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-13 16:16:50,738 - BERTopic - Cluster - Completed ✓
2024-12-13 16:16:50,747 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-13 16:16:51,740 - BERTopic - Representation - Completed ✓


    Topic  Count                                               Name  \
0      -1  14245                     -1_tablet_great_good_batteries   
1       0   1104                       0_kindle_reading_reader_read   
2       1    739                              1_loves_old_kids_year   
3       2    676                             2_tap_echo_alexa_sound   
4       3    489                    3_long_far_long lasting_lasting   
5       4    418                4_work_price_great price_price work   
6       5    331           5_tablet_tablet price_great tablet_price   
7       6    317         6_product_great product_good product_great   
8       7    276  7_duracell_batteries_duracell batteries_good d...   
9       8    269                   8_tablet_movies_great tablet_web   
10      9    267                          9_tablet_easy_prime_great   
11     10    260           10_batteries_love batteries_brand_brands   
12     11    244                      11_games_play_kids_play games   
13    

In [57]:
# Visualize the topics as bar charts of their top terms
topic_model.visualize_barchart()

In [58]:
#topic_model.merge_topics(filtered_reviews, topics_to_merge=[3, 6])

In [67]:
# Intertopic distance map: 2-D overview of how the topics relate to each other
topic_model.visualize_topics()

In [68]:
# Topic-to-topic similarity matrix rendered as a heatmap
topic_model.visualize_heatmap()

In [69]:
# Dendrogram showing how the topics group hierarchically
topic_model.visualize_hierarchy()

In [61]:
# Rows of the selected sentiment. This is the same filter that produced
# `filtered_reviews`, so row order matches the documents given to BERTopic.
df_filtered = df.filter(pl.col("Sentiment") == sentiment_to_cluster)


In [62]:
# Attach the per-document topic id as a new "Topic" column.
df_filtered = df_filtered.with_columns(
    pl.Series(name="Topic", values=topics)
)

In [63]:
# Plain-text preview of the filtered frame with its new "Topic" column.
print(df_filtered)

shape: (25_993, 27)
┌────────────┬────────────┬────────────┬───────────┬───┬───────────┬───────────┬───────────┬───────┐
│ id         ┆ dateAdded  ┆ dateUpdate ┆ name      ┆ … ┆ sourceURL ┆ reviews.r ┆ Sentiment ┆ Topic │
│ ---        ┆ ---        ┆ d          ┆ ---       ┆   ┆ s         ┆ ating_upd ┆ ---       ┆ ---   │
│ str        ┆ str        ┆ ---        ┆ str       ┆   ┆ ---       ┆ ate       ┆ str       ┆ i64   │
│            ┆            ┆ str        ┆           ┆   ┆ str       ┆ ---       ┆           ┆       │
│            ┆            ┆            ┆           ┆   ┆           ┆ i64       ┆           ┆       │
╞════════════╪════════════╪════════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════╡
│ AVpgNzjwLJ ┆ 2015-10-30 ┆ 2019-04-25 ┆ AmazonBas ┆ … ┆ https://w ┆ 2         ┆ Positive  ┆ 3     │
│ eJML43Kpxn ┆ T08:59:32Z ┆ T09:08:16Z ┆ ics AAA   ┆   ┆ ww.barcod ┆           ┆           ┆       │
│            ┆            ┆            ┆ Performan ┆   ┆ able.com/ ┆   

In [66]:
# Persist the topic-annotated reviews; the file name embeds the chosen
# sentiment (e.g. reviews_with_Positive_topics.csv).
df_filtered.write_csv(f"reviews_with_{sentiment_to_cluster}_topics.csv")