
# Product Review Clustering

In this notebook only done on a sample, to see if we can find right settings and have an idea of possible categories of full data 

# Downloads

In [1]:
# Downloads
!pip -q install sentence-transformers umap-learn

# Libraries

In [4]:
#imports
import os, json, math, gc, random
from pathlib import Path
from datetime import datetime as _dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import torch
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import umap

## Setup: mount Drive and configure paths

In [7]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

device = "cuda" if ("torch" in globals() and hasattr(__import__('torch'), "cuda") and __import__('torch').cuda.is_available()) else "cpu"
print("Device:", device)

# Base project folder on Drive
BASE = "/content/drive/MyDrive/Project_NLP"

PARQUET_FULL   = f"{BASE}/video_games_preprocessed.parquet"
PREPROCESS_CFG = f"{BASE}/preprocess_config.json"

# Output folder for this run
RUN_TAG = _dt.now().strftime("clustering_%Y%m%d_%H%M")
OUT = f"{BASE}/runs/{RUN_TAG}"
os.makedirs(OUT, exist_ok=True)

print("OUT:", OUT)
for name, p in [("PARQUET_FULL", PARQUET_FULL), ("PREPROCESS_CFG", PREPROCESS_CFG)]:
    print(f"{name:15} {'OK' if os.path.exists(p) else 'MISSING'} -> {p}")

# Load preprocessing config for column names
with open(PREPROCESS_CFG, "r") as f:
    cfg = json.load(f)
text_col  = cfg.get("text_col", "reviewText")
label_col = cfg.get("label_col", "sentiment")
print("text_col:", text_col, "| label_col:", label_col)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Device: cuda
OUT: /content/drive/MyDrive/Project_NLP/runs/clustering_20250828_1725
PARQUET_FULL    OK -> /content/drive/MyDrive/Project_NLP/video_games_preprocessed.parquet
PREPROCESS_CFG  OK -> /content/drive/MyDrive/Project_NLP/preprocess_config.json
text_col: clean_text | label_col: sentiment


## Load a manageable slice of reviews

In [None]:
# Loading only a small part of data set.
SAMPLE_N = 250_000 

use_cols = [text_col, label_col]
df = (
    pd.read_parquet(PARQUET_FULL, columns=use_cols)
      .dropna(subset=[text_col])
      .reset_index(drop=True)
)

if SAMPLE_N is not None and SAMPLE_N < len(df):
    df = df.sample(SAMPLE_N, random_state=42).reset_index(drop=True)

print("Data shape:", df.shape)
df[text_col] = df[text_col].astype(str)  # to ensure text is string
df.head(2)


Data shape: (250000, 2)


Unnamed: 0,clean_text,sentiment
0,The origional LOZ was and still is my favorite...,positive
1,2K has brought it back with this edition. I ha...,positive


## Build sentence embeddings (MiniLM)

In [9]:
MODEL_EMB = "sentence-transformers/all-MiniLM-L6-v2"
batch_size = 512

emb_model = SentenceTransformer(MODEL_EMB, device="cuda" if torch.cuda.is_available() else "cpu")

embeddings = emb_model.encode(
    df[text_col].tolist(),
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

print("Embeddings shape:", embeddings.shape)

# Save embeddings for reuse
np.save(f"{OUT}/embeddings.npy", embeddings)
df.to_parquet(f"{OUT}/corpus.parquet", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Embeddings shape: (250000, 384)


## Dimensionality reduction with PCA

In [10]:
emb = embeddings if 'embeddings' in globals() else np.load(f"{OUT}/embeddings.npy")

PCA_N = 50   # keep 50 principal components
pca = PCA(n_components=PCA_N, random_state=42)
emb_pca = pca.fit_transform(emb)

print("Original shape:", emb.shape,
      "| PCA shape:", emb_pca.shape,
      "| variance kept:", round(pca.explained_variance_ratio_.sum(), 4))

# Save PCA-reduced embeddings
np.save(f"{OUT}/embeddings_pca.npy", emb_pca)


Original shape: (250000, 384) | PCA shape: (250000, 50) | variance kept: 0.5921


## Clustering with Kmeans with k=6

In [11]:
K = 6   #setting to 6 categories first to try.

kmeans = MiniBatchKMeans(
    n_clusters=K,
    batch_size=8192,
    n_init="auto",
    random_state=42
)
kmeans.fit(emb_pca)
labels = kmeans.predict(emb_pca)

# Attach cluster labels to df
df["cluster"] = labels

print("Cluster sizes:")
print(df["cluster"].value_counts().sort_index())

# quick silhouette score
sil = silhouette_score(emb_pca, labels, metric="euclidean")
print(f"Silhouette score: {sil:.4f}")

Cluster sizes:
cluster
0    18868
1    72691
2    45908
3    48969
4    23823
5    39741
Name: count, dtype: int64
Silhouette score: 0.0984


## Label clusters with top terms (TF‑IDF)

In [12]:
vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),
    min_df=3
)
X_tfidf = vectorizer.fit_transform(df[text_col].astype(str).tolist())
vocab = np.array(vectorizer.get_feature_names_out())

def top_terms_for_cluster(c_id, top_n=12):
    idx = np.where(df["cluster"].values == c_id)[0]
    if len(idx) == 0:
        return []
    mean_tfidf = X_tfidf[idx].mean(axis=0).A1
    top_idx = np.argsort(mean_tfidf)[::-1][:top_n]
    return vocab[top_idx].tolist()

cluster_terms = {c: top_terms_for_cluster(c) for c in range(K)}

for c, terms in cluster_terms.items():
    print(f"Cluster {c}:", ", ".join(terms))


Cluster 0: the, headset, and, sound, is, to, it, for, headphones, mic, my, quality
Cluster 1: game, the, and, to, it, this, is, of, you, this game, for, the game
Cluster 2: it, the, works, and, to, great, for, not, this, was, work, is
Cluster 3: great, good, it, love, love it, excellent, product, perfect, works, awesome, very, loves
Cluster 4: the, mouse, it, and, keyboard, to, is, for, this, of, my, but
Cluster 5: the, controller, it, to, and, for, my, this, is, with, on, of


Seems that we have a category for headsets/sound accessories, games, mouse/keyboard, and controllers. and cluster 2 and 3 do not seem to have a specific category.

In [14]:
# inspecting clusters
centroids_n = normalize(kmeans.cluster_centers_)
emb_pca_n = normalize(emb_pca)

def exemplars_for_cluster(c_id, top_m=5):
    idx = np.where(df["cluster"].values == c_id)[0]
    if len(idx) == 0:
        return pd.DataFrame(columns=[text_col, "similarity"])
    sims = emb_pca_n[idx] @ centroids_n[c_id].reshape(-1,1)
    sims = sims.ravel()
    order = np.argsort(-sims)[:top_m]
    return df.iloc[idx[order]][[text_col]].assign(similarity=sims[order])

for c in range(K):
    print(f"\n=== Cluster {c} ===")
    display(exemplars_for_cluster(c, top_m=5))


=== Cluster 0 ===


Unnamed: 0,clean_text,similarity
181559,Honestly...These are probably some of the best...,0.9056
100723,This is a nice headset; I usually forgo using ...,0.903763
184244,So I got this head set for work and gaming it ...,0.899619
2984,I night this headset to use at work. I have ha...,0.897688
169120,I bought the headset to use in meetings when t...,0.897651



=== Cluster 1 ===


Unnamed: 0,clean_text,similarity
36990,"What a great game, I wish the 6th game was mor...",0.89396
115973,"When I ordered this game, I was expecting a fu...",0.882689
178376,I was excited when this game was released sinc...,0.873032
89743,I got this game because I had played it before...,0.866506
242515,Having now put in over 60 hours into this game...,0.865576



=== Cluster 2 ===


Unnamed: 0,clean_text,similarity
212620,It's worked as advertised. It just came in dam...,0.795555
135868,Piece of crap!!! Worked for about 2 days @ the...,0.771417
54276,"Didn't work, why would you sell something that...",0.759157
141023,This worked less than 4 weeks after purchase. ...,0.755014
74053,Terrible product. Possibly going to return if ...,0.751767



=== Cluster 3 ===


Unnamed: 0,clean_text,similarity
106690,wonderfull,0.838165
48002,WONDERFULL,0.838165
145938,fabulous,0.817928
109643,fabulous,0.817927
78735,Fabulous,0.817927



=== Cluster 4 ===


Unnamed: 0,clean_text,similarity
183957,Truly impressed with the quality of both the k...,0.900223
226680,This mechanical keyboard and mouse are awesome...,0.88974
247813,The Mechanical Keyboard and Mouse Combo is an ...,0.878787
116584,I actually use this for work. I have so many k...,0.878561
194511,"This is truly a great mouse, and Logitech is t...",0.877875



=== Cluster 5 ===


Unnamed: 0,clean_text,similarity
122758,"I bought 2 of these controllers, and one didn'...",0.8541
47885,The left joystick has a huge dead spot in it. ...,0.840303
10585,not compatible with next generation controller...,0.833773
243868,Console works great but the controller came wi...,0.826988
131276,I was a little bit worried if it would work wi...,0.821821


After inspection it seems that cluster 2 is about broken/defect product, negative sentiment

and cluster 3 is short positive reviews, positive sentiment.

Probably better to merge these in to the other categories, to keep it product focused and not sentiment focused

## Merging clusters

In [16]:
# Prepare normalized embeddings and centroids in PCA space
emb_pca_n = normalize(emb_pca)                          # (N, d)
centroids_n = normalize(kmeans.cluster_centers_)        # (K, d)

product_clusters = np.array([0, 1, 4, 5])               # keeping these as final product groups
to_merge = np.array([2, 3])                             # sentiment/noise clusters to be merged to nearest cluster

# Reassign every point currently in {2,3} to the *nearest* product centroid
labels_merged = labels.copy()
idx_noise = np.where(np.isin(labels, to_merge))[0]
if idx_noise.size > 0:
    sims = emb_pca_n[idx_noise] @ centroids_n[product_clusters].T    # (n_noise, 4)
    nearest = product_clusters[np.argmax(sims, axis=1)]
    labels_merged[idx_noise] = nearest

# Attach merged labels
df["cluster_merged"] = labels_merged

print("Counts BEFORE merge:")
print(pd.Series(labels).value_counts().sort_index(), "\n")
print("Counts AFTER merge:")
print(pd.Series(labels_merged).value_counts().sort_index())

# Optional: silhouette after merge
sil_merged = silhouette_score(emb_pca, labels_merged, metric="euclidean")
print(f"\nSilhouette (merged): {sil_merged:.4f}")

# 4) Recompute top TF-IDF terms for the merged clusters (uses X_tfidf from earlier TF-IDF cell)
vocab = np.array(vectorizer.get_feature_names_out())  # reuse the fitted vectorizer

def top_terms_for_cluster_merged(c_id, top_n=12):
    idx = np.where(df["cluster_merged"].values == c_id)[0]
    if len(idx) == 0:
        return []
    mean_tfidf = X_tfidf[idx].mean(axis=0).A1
    top_idx = np.argsort(mean_tfidf)[::-1][:top_n]
    return vocab[top_idx].tolist()

final_clusters = sorted(product_clusters.tolist())      # [0,1,4,5]
final_terms = {c: top_terms_for_cluster_merged(c) for c in final_clusters}

print("\n=== Top terms per FINAL product cluster ===")
for c in final_clusters:
    print(f"Cluster {c}: {', '.join(final_terms[c])}")

# 5) (Optional) Simple readable names and save final artifacts
def make_label(terms, n=4):
    return ", ".join(terms[:n])

final_names = {c: make_label(final_terms[c]) for c in final_clusters}
df["cluster_name_final"] = df["cluster_merged"].map(final_names)

# Save
OUT = Path(BASE) / "runs" / RUN_TAG
OUT.mkdir(parents=True, exist_ok=True)

print("Saving outputs to:", OUT)

(df[[text_col, "cluster_merged", "cluster_name_final"]]
 .to_csv(OUT / "cluster_assignments_final.csv", index=False))

import json
with open(OUT / "cluster_summary_final.json", "w") as f:
    json.dump({
        "k_final": len(final_clusters),
        "clusters": final_clusters,
        "counts": pd.Series(labels_merged).value_counts().sort_index().to_dict(),
        "names": final_names,
        "top_terms": final_terms,
    }, f, indent=2)

print("\nSaved:")
print(OUT / "cluster_assignments_final.csv")
print(OUT / "cluster_summary_final.json")

Counts BEFORE merge:
0    18868
1    72691
2    45908
3    48969
4    23823
5    39741
Name: count, dtype: int64 

Counts AFTER merge:
0    59401
1    88243
4    47935
5    54421
Name: count, dtype: int64

Silhouette (merged): 0.0751

=== Top terms per FINAL product cluster ===
Cluster 0: good, the, great, and, works, it, product, to, for, headset, excellent, is
Cluster 1: game, the, it, and, to, this, is, great, of, you, this game, for
Cluster 4: the, it, and, to, mouse, is, for, great, this, keyboard, my, perfect
Cluster 5: the, it, to, and, controller, for, my, this, works, is, with, on
Saving outputs to: /content/drive/MyDrive/Project_NLP/runs/clustering_20250828_1725

Saved:
/content/drive/MyDrive/Project_NLP/runs/clustering_20250828_1725/cluster_assignments_final.csv
/content/drive/MyDrive/Project_NLP/runs/clustering_20250828_1725/cluster_summary_final.json


In [17]:
# inspecting new clusters
centroids_final = []
for c in [0,1,4,5]:
    idx = np.where(df["cluster_merged"] == c)[0]
    if len(idx) > 0:
        centroids_final.append(emb_pca[idx].mean(axis=0))
centroids_final = normalize(np.vstack(centroids_final))
emb_pca_n = normalize(emb_pca)

def exemplars_for_final_cluster(c_id, top_m=5):
    idx = np.where(df["cluster_merged"].values == c_id)[0]
    if len(idx) == 0:
        return pd.DataFrame(columns=[text_col, "similarity"])
    sims = emb_pca_n[idx] @ centroids_final[[[0,1,2,3].index([0,1,4,5].index(c_id))]].T
    sims = sims.ravel()
    order = np.argsort(-sims)[:top_m]
    return df.iloc[idx[order]][[text_col]].assign(similarity=sims[order])

for c in [0,1,4,5]:
    print(f"\n=== Final Cluster {c} ===")
    display(exemplars_for_final_cluster(c, top_m=5))


=== Final Cluster 0 ===


Unnamed: 0,clean_text,similarity
36644,Love these earbuds. Work well. Timely delivery...,0.735193
4747,"Good ear cup, good mic",0.729336
77824,Excellent quality headphones,0.722914
62967,Great quality headphones,0.72025
128132,Great earphones.,0.719036



=== Final Cluster 1 ===


Unnamed: 0,clean_text,similarity
36990,"What a great game, I wish the 6th game was mor...",0.899982
178376,I was excited when this game was released sinc...,0.873349
192905,"this is a very great game, i saw the movie and...",0.855328
89743,I got this game because I had played it before...,0.853519
143535,This game was a gift for my son in law. I neve...,0.850776



=== Final Cluster 4 ===


Unnamed: 0,clean_text,similarity
93607,I got this for my office at work. I figured if...,0.782008
176124,Love how it feels and the brown switches are p...,0.776985
29975,Note: i did get a used one on purpose I would ...,0.773736
231325,"I have to say, for a keyboard this is really n...",0.773154
193626,Very nice and clicky. I wish is was customize-...,0.767829



=== Final Cluster 5 ===


Unnamed: 0,clean_text,similarity
154102,would not work with xbox360 controller,0.838274
192057,"Does not work with my 360 controllers, the ana...",0.83593
122758,"I bought 2 of these controllers, and one didn'...",0.825507
243868,Console works great but the controller came wi...,0.81898
10585,not compatible with next generation controller...,0.817808


Now it seems that the remaining 4 clusters each have a category. headset/audio, games, keyboard/mice, controllers