
# Product Review Clustering
After doing the sample of 250k reviews we are now doing clustering on the entire data set. We are doing it bit different since checkpoints to save are needed incase of crashes.


# Downloads

In [21]:
# Downloads
!pip -q install sentence-transformers umap-learn
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=d744623677c6879b5f7b5654e7b4066742fb80ed46bb44e67e4fd35bc877d5da
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


# Libraries

In [None]:
#imports
import os, json, math, gc, random
from pathlib import Path
from datetime import datetime as _dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import torch
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from joblib import dump, load
import umap
import time
from langdetect import detect


## Setup: mount Drive and configure paths

In [4]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

device = "cuda" if ("torch" in globals() and hasattr(__import__('torch'), "cuda") and __import__('torch').cuda.is_available()) else "cpu"
print("Device:", device)

# Base project folder on Drive
BASE = "/content/drive/MyDrive/Project_NLP"

PARQUET_FULL   = f"{BASE}/video_games_preprocessed.parquet"
PREPROCESS_CFG = f"{BASE}/preprocess_config.json"

# Output folder for this run
RUN_TAG = _dt.now().strftime("clustering_full_%Y%m%d_%H%M")
OUT = Path(BASE) / "runs" / RUN_TAG
os.makedirs(OUT, exist_ok=True)

print("OUT:", OUT)
for name, p in [("PARQUET_FULL", PARQUET_FULL), ("PREPROCESS_CFG", PREPROCESS_CFG)]:
    print(f"{name:15} {'OK' if os.path.exists(p) else 'MISSING'} -> {p}")

# Load preprocessing config for column names
with open(PREPROCESS_CFG, "r") as f:
    cfg = json.load(f)
text_col  = cfg.get("text_col", "reviewText")
label_col = cfg.get("label_col", "sentiment")
print("text_col:", text_col, "| label_col:", label_col)

# --- Checkpoint helpers (for long full run) ---
CKPT = OUT / "checkpoint.json"
def save_ckpt(**kv):
    ck = {}
    if CKPT.exists():
        try: ck = json.loads(CKPT.read_text())
        except: ck = {}
    ck.update(kv)
    CKPT.write_text(json.dumps(ck, indent=2))

def load_ckpt():
    return json.loads(CKPT.read_text()) if CKPT.exists() else {}

print("Checkpoint file:", CKPT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Device: cuda
OUT: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924
PARQUET_FULL    OK -> /content/drive/MyDrive/Project_NLP/video_games_preprocessed.parquet
PREPROCESS_CFG  OK -> /content/drive/MyDrive/Project_NLP/preprocess_config.json
text_col: clean_text | label_col: sentiment
Checkpoint file: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/checkpoint.json


## Loading full data set of reviews

In [5]:
# Loading data set.
SAMPLE_N = None

use_cols = [text_col]
df = (
    pd.read_parquet(PARQUET_FULL, columns=use_cols)
      .dropna(subset=[text_col])
      .reset_index(drop=True)
)

if SAMPLE_N is not None and SAMPLE_N < len(df):
    df = df.sample(SAMPLE_N, random_state=42).reset_index(drop=True)

print("Data shape:", df.shape)
df[text_col] = df[text_col].astype(str)  # to ensure text is string
df.head(2)


Data shape: (4624615, 1)


Unnamed: 0,clean_text
0,I’m playing on ps5 and it’s interesting. It’s ...
1,Nostalgic fun. A bit slow. I hope they don’t s...


## Build sentence embeddings (MiniLM)

In [10]:
MODEL_EMB  = "sentence-transformers/all-MiniLM-L6-v2"
ENC_BATCH  = 768          # trying 768/1024 if VRAM allows; lower to 256 if OOM
READ_CHUNK = 20_000       # rows per outer write
EMB_DIM    = 384          # MiniLM-L6-v2 dimension

device = "cuda" if torch.cuda.is_available() else "cpu"
emb_model = SentenceTransformer(MODEL_EMB, device=device)

N = len(df)

# Crash-safe memmap on Drive
EMB_PATH = OUT / "embeddings.memmap"
if EMB_PATH.exists():
    emb_mm = np.memmap(EMB_PATH, dtype="float32", mode="r+", shape=(N, EMB_DIM))
else:
    emb_mm = np.memmap(EMB_PATH, dtype="float32", mode="w+", shape=(N, EMB_DIM))

# Resume from checkpoint if present
ck = load_ckpt()
start_i = int(ck.get("embeddings_written", 0))
print(f"Resuming at index: {start_i:,}/{N:,}")

t0 = time.time()
for i in range(start_i, N, READ_CHUNK):
    j = min(i + READ_CHUNK, N)
    texts = df[text_col].iloc[i:j].astype(str).tolist()

    pos = i
    for k in range(0, len(texts), ENC_BATCH):
        batch = texts[k:k+ENC_BATCH]
        out = emb_model.encode(
            batch,
            batch_size=ENC_BATCH,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
            device=device
        ).astype("float32")
        emb_mm[pos:pos+len(out)] = out
        pos += len(out)

    emb_mm.flush()
    save_ckpt(embeddings_written=j)

    done = j
    elapsed = time.time() - t0
    rate = done / max(elapsed, 1)
    eta  = (N - done) / max(rate, 1)
    print(f"[{done:,}/{N:,}] ~{rate:,.0f} rows/s | elapsed {elapsed/60:.1f}m | ETA {eta/60:.1f}m")

print("Embeddings saved to:", EMB_PATH)


Resuming at index: 0/4,624,615
[20,000/4,624,615] ~387 rows/s | elapsed 0.9m | ETA 198.5m
[40,000/4,624,615] ~390 rows/s | elapsed 1.7m | ETA 195.8m
[60,000/4,624,615] ~391 rows/s | elapsed 2.6m | ETA 194.5m
[80,000/4,624,615] ~390 rows/s | elapsed 3.4m | ETA 194.0m
[100,000/4,624,615] ~394 rows/s | elapsed 4.2m | ETA 191.2m
[120,000/4,624,615] ~396 rows/s | elapsed 5.1m | ETA 189.7m
[140,000/4,624,615] ~397 rows/s | elapsed 5.9m | ETA 188.3m
[160,000/4,624,615] ~398 rows/s | elapsed 6.7m | ETA 186.8m
[180,000/4,624,615] ~399 rows/s | elapsed 7.5m | ETA 185.5m
[200,000/4,624,615] ~401 rows/s | elapsed 8.3m | ETA 184.1m
[220,000/4,624,615] ~401 rows/s | elapsed 9.1m | ETA 182.9m
[240,000/4,624,615] ~403 rows/s | elapsed 9.9m | ETA 181.5m
[260,000/4,624,615] ~403 rows/s | elapsed 10.8m | ETA 180.6m
[280,000/4,624,615] ~403 rows/s | elapsed 11.6m | ETA 179.5m
[300,000/4,624,615] ~404 rows/s | elapsed 12.4m | ETA 178.4m
[320,000/4,624,615] ~403 rows/s | elapsed 13.2m | ETA 177.9m
[340,000/

Took 3 hours to fully embed the data.

## Dimensionality reduction with PCA

In [11]:
from sklearn.decomposition import IncrementalPCA
from joblib import dump, load

PCA_D   = 50          # keep 50 principal components
EMB_DIM = 384         # MiniLM embedding size
BATCH   = 100_000     # chunk size for IPCA fit/transform (tune if needed)

N = len(df)
EMB_PATH = OUT / "embeddings.memmap"         # produced by the embeddings cell
IPCA_PATH = OUT / "ipca.joblib"              # checkpoint for the fitted IPCA
PCA_PATH  = OUT / "embeddings_pca.memmap"    # output memmap (N x PCA_D)

# load embeddings memmap (read-only)
emb_mm = np.memmap(EMB_PATH, dtype="float32", mode="r", shape=(N, EMB_DIM))

ck = load_ckpt()

# Fit IncrementalPCA (resume if already fitted)
if IPCA_PATH.exists() and ck.get("ipca_fitted", False):
    ipca = load(IPCA_PATH)
    print("Loaded fitted IPCA from checkpoint.")
else:
    ipca = IncrementalPCA(n_components=PCA_D, batch_size=BATCH)
    print("Fitting IPCA (partial_fit over chunks)...")
    for i in range(0, N, BATCH):
        j = min(i + BATCH, N)
        ipca.partial_fit(emb_mm[i:j])
        if (j % 500_000) == 0 or j == N:
            print("  fitted:", j)
    dump(ipca, IPCA_PATH)
    save_ckpt(ipca_fitted=True)
    print("IPCA fitted & saved:", IPCA_PATH)

# Transform to PCA space (resume)
if PCA_PATH.exists():
    emb_pca = np.memmap(PCA_PATH, dtype="float32", mode="r+", shape=(N, PCA_D))
else:
    emb_pca = np.memmap(PCA_PATH, dtype="float32", mode="w+", shape=(N, PCA_D))

start_t = int(ck.get("ipca_transformed", 0))
print(f"Transform resume index: {start_t}/{N}")

for i in range(start_t, N, BATCH):
    j = min(i + BATCH, N)
    emb_pca[i:j] = ipca.transform(emb_mm[i:j]).astype("float32")
    emb_pca.flush()
    save_ckpt(ipca_transformed=j)
    if (j % 500_000) == 0 or j == N:
        print("  transformed:", j)

var_kept = float(ipca.explained_variance_ratio_.sum())
print(f"PCA variance kept: {var_kept:.4f}")
print("PCA memmap saved to:", PCA_PATH)

Fitting IPCA (partial_fit over chunks)...
  fitted: 500000
  fitted: 1000000
  fitted: 1500000
  fitted: 2000000
  fitted: 2500000
  fitted: 3000000
  fitted: 3500000
  fitted: 4000000
  fitted: 4500000
  fitted: 4624615
IPCA fitted & saved: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/ipca.joblib
Transform resume index: 0/4624615
  transformed: 500000
  transformed: 1000000
  transformed: 1500000
  transformed: 2000000
  transformed: 2500000
  transformed: 3000000
  transformed: 3500000
  transformed: 4000000
  transformed: 4500000
  transformed: 4624615
PCA variance kept: 0.5914
PCA memmap saved to: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/embeddings_pca.memmap


## Clustering with Kmeans with K=6
I first tried to directly do k=4, since sample showed 4 categories is the expected outcome. But this did not cluster well when done directly to 4.

In [None]:
# Cluster with K = 6 (checkpointed, streaming)
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from joblib import dump, load
import numpy as np
import pandas as pd
from pathlib import Path

# Assumes these already exist:
# - OUT (Path)                 -> base run folder for this run tag
# - df (DataFrame)             -> full reviews
# - text_col (str)             -> review text column
# - PCA memmap at OUT/"embeddings_pca.memmap" from your IPCA step

K = 6
BATCH = 100_000
PCA_D = 50

# Paths
PCA_PATH   = OUT / "embeddings_pca.memmap"          # produced by IPCA step
KM_PATH    = OUT / "kmeans_k6.joblib"
LAB_PATH   = OUT / "labels_k6.memmap"
ASSIGN_CSV = OUT / "cluster_assignments_raw_k6.csv"

# load PCA memmap
N = len(df)
emb_pca = np.memmap(PCA_PATH, dtype="float32", mode="r", shape=(N, PCA_D))

# train / resume MiniBatchKMeans(K=6)
if KM_PATH.exists():
    kmeans6 = load(KM_PATH)
    print("Loaded existing K=6 KMeans checkpoint:", KM_PATH)
else:
    kmeans6 = MiniBatchKMeans(
        n_clusters=K,
        batch_size=100_000,
        n_init="auto",
        random_state=42,
        verbose=0,
    )

print("Fitting MiniBatchKMeans(K=6) with partial_fit...")
for i in range(0, N, BATCH):
    j = min(i + BATCH, N)
    kmeans6.partial_fit(emb_pca[i:j])
    if (j % 500_000) == 0 or j == N:
        dump(kmeans6, KM_PATH)
        print("  checkpoint @", j)
dump(kmeans6, KM_PATH)
print("K=6 model saved ->", KM_PATH)

# predict labels -> memmap + streaming CSV
if LAB_PATH.exists():
    labels_mm = np.memmap(LAB_PATH, dtype="int32", mode="r+", shape=(N,))
else:
    labels_mm = np.memmap(LAB_PATH, dtype="int32", mode="w+", shape=(N,))

if ASSIGN_CSV.exists():
    ASSIGN_CSV.unlink()

print("Predicting labels (K=6) & writing CSV...")
for i in range(0, N, BATCH):
    j = min(i + BATCH, N)
    block_labels = kmeans6.predict(emb_pca[i:j]).astype("int32")
    labels_mm[i:j] = block_labels
    labels_mm.flush()

    # stream (text + cluster6) to CSV
    pd.DataFrame({
        text_col: df[text_col].iloc[i:j].values,
        "cluster6": block_labels,
    }).to_csv(ASSIGN_CSV, mode="a", header=(i == 0), index=False)

    if (j % 500_000) == 0 or j == N:
        print("  predicted @", j)

# basic stats
labels_np = np.array(labels_mm)
sizes = pd.Series(labels_np).value_counts().sort_index()
print("\nK=6 cluster sizes:", sizes.to_dict())

# optional: quick silhouette on a sample (full data too big)
sample_n = min(50_000, N)
rng = np.random.default_rng(42)
idx = rng.choice(N, size=sample_n, replace=False)
sil = silhouette_score(emb_pca[idx], labels_np[idx], metric="euclidean")
print(f"Silhouette (sample {sample_n}): {sil:.4f}")

print("\nSaved:")
print("  labels memmap  ->", LAB_PATH)
print("  raw assignments ->", ASSIGN_CSV)
print("  model           ->", KM_PATH)


Fitting MiniBatchKMeans(K=6) with partial_fit...
  checkpoint @ 500000
  checkpoint @ 1000000
  checkpoint @ 1500000
  checkpoint @ 2000000
  checkpoint @ 2500000
  checkpoint @ 3000000
  checkpoint @ 3500000
  checkpoint @ 4000000
  checkpoint @ 4500000
  checkpoint @ 4624615
K=6 model saved -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/kmeans_k6.joblib
Predicting labels (K=6) & writing CSV...
  predicted @ 500000
  predicted @ 1000000
  predicted @ 1500000
  predicted @ 2000000
  predicted @ 2500000
  predicted @ 3000000
  predicted @ 3500000
  predicted @ 4000000
  predicted @ 4500000
  predicted @ 4624615

K=6 cluster sizes: {0: 965934, 1: 441221, 2: 410243, 3: 1237926, 4: 990317, 5: 578974}
Silhouette (sample 50000): 0.0963

Saved:
  labels memmap  -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/labels_k6.memmap
  raw assignments -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/cluster_assignments_raw_k6.c

## Label clusters with top terms (TF‑IDF)

In [None]:
# c-TF-IDF labeling for K = 6 clusters
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

K = 6
LAB_PATH = OUT / "labels_k6.memmap"   # from K=6 KMeans step

# 1) Load K=6 labels and attach to df
labels_k6 = np.memmap(LAB_PATH, dtype="int32", mode="r", shape=(len(df),))
labels_np = np.array(labels_k6)  # for convenience
df["cluster6"] = labels_np

# 2) Stratified sample per cluster (keeps it fast and memory-safe)
sample_per_cluster = 30_000          
rng = np.random.default_rng(42)
idx_keep = []

for c in range(K):
    idx_c = np.where(labels_np == c)[0]
    if len(idx_c) == 0:
        continue
    take = min(sample_per_cluster, len(idx_c))
    if take > 0:
        idx_keep.append(rng.choice(idx_c, size=take, replace=False))

idx_keep = np.concatenate(idx_keep) if len(idx_keep) else np.array([], dtype=int)

df_sample = pd.DataFrame({
    text_col: df[text_col].iloc[idx_keep].astype(str).values,
    "cluster6": labels_np[idx_keep],
})

print("TF-IDF sample size:", len(df_sample))

# 3) Build one "big document" per cluster (c-TF-IDF)
docs_per_cluster = (
    df_sample.groupby("cluster6")[text_col]
    .apply(lambda s: " ".join(s))             # concatenate reviews in the cluster
    .reindex(range(K), fill_value="")
)

# 4) Vectorize
EN = []  # keep empty to avoid dropping English
ES = [
    "el","la","los","las","de","del","al","y","o","u","a","en","con","por","para","muy",
    "es","son","ser","un","una","unos","unas","que","como","pero","si","no","ya","mas",
    "bien","mal","buen","bueno","buena","buenas","buenos","gracias","producto",
    "este","esta"
]
custom_stop = list(set(EN) | set(ES))

vectorizer = TfidfVectorizer(
    stop_words=custom_stop,
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b[a-z][a-z]+\b",
    min_df=5,
    max_df=0.90,
    max_features=150_000,
)

X_ctfidf = vectorizer.fit_transform(docs_per_cluster.values)  # shape: (K, V)
vocab = np.array(vectorizer.get_feature_names_out())

def top_terms_ctfidf(c_id, top_n=15):
    row = X_ctfidf[c_id].toarray().ravel()
    top_idx = np.argsort(row)[::-1][:top_n]
    return vocab[top_idx].tolist()

# 5) Print top terms
cluster_terms = {c: top_terms_ctfidf(c, top_n=15) for c in range(K)}
for c, terms in cluster_terms.items():
    print(f"Cluster {c}: {', '.join(terms)}")


TF-IDF sample size: 180000
Cluster 0: calidad, perfecto, product good, this book, recomendado, mucho, great thank, price works, great perfect, for grandson, described works, funciona, product excellent, nice great, awesome works
Cluster 1: the mouse, the keys, logitech, mouse is, keyboards, keyboard is, dpi, the buttons, keyboard and, keys are, keyboard for, gaming mouse, scroll wheel, buttons are, bluetooth
Cluster 2: headset, mic, headphones, the headset, the mic, this headset, microphone, ear, volume, headsets, the microphone, headset is, headset for, the ear, headphone
Cluster 3: gameplay, game but, multiplayer, enemies, game has, the gameplay, rpg, game you, games and, game with, the main, play this, puzzles, halo, story is
Cluster 4: the xbox, oculus, games and, xbox one, ea, hdmi, disk, hard drive, error, my xbox, the disc, data, my computer, memory card, codes
Cluster 5: controllers, the controller, the xbox, xbox one, controller is, controller and, joystick, the controllers, t

First cluster (0) was only spanish, now a bit better, but still seems sentiment and not categorical

## Merging to 4 clusters to find only categorical clusters

In [None]:
# Paths
PCA_PATH = OUT / "embeddings_pca.memmap"
LAB6_PATH = OUT / "labels_k6.memmap"
KM6_PATH = OUT / "kmeans_k6.joblib"

# Load data
N = len(df)
emb_pca = np.memmap(PCA_PATH, dtype="float32", mode="r", shape=(N, 50))
labels_k6 = np.memmap(LAB6_PATH, dtype="int32", mode="r", shape=(N,))
labels6_np = np.array(labels_k6)

from joblib import load
kmeans6 = load(KM6_PATH)

# Normalized centroids (for cosine sim)
centroids_k6_n = normalize(kmeans6.cluster_centers_)
emb_pca_n = normalize(emb_pca)

# Which clusters to KEEP (product categories) → [1,2,3,5]
keep = np.array([1,2,3,5])
drop = np.array([c for c in range(6) if c not in keep])
print("Keeping clusters:", keep.tolist(), " — reassigning:", drop.tolist())

# Reassign dropped clusters to nearest of kept centroids
labels_merged = labels6_np.copy()
idx_drop = np.where(np.isin(labels_merged, drop))[0]
print("Points to reassign:", idx_drop.size)

if idx_drop.size > 0:
    sims = emb_pca_n[idx_drop] @ centroids_k6_n[keep].T   # (n_drop, 4)
    nearest = keep[np.argmax(sims, axis=1)]
    labels_merged[idx_drop] = nearest

# Remap final cluster IDs to [0..3]
uniq_kept = sorted(keep.tolist())
id_map = {old:i for i, old in enumerate(uniq_kept)}
labels_final = np.array([id_map[l] for l in labels_merged], dtype="int32")

# Save to memmap
LABF_PATH = OUT / "labels_final_k4.memmap"
labels_final_mm = np.memmap(LABF_PATH, dtype="int32", mode="w+", shape=(N,))
labels_final_mm[:] = labels_final
labels_final_mm.flush()

# Sizes
counts = pd.Series(labels_final).value_counts().sort_index().to_dict()
print("Final K=4 sizes:", counts)

# Quick CSV + JSON
FINAL_CSV = OUT / "cluster_assignments_final_k4.csv"
SUMMARY_JSON = OUT / "cluster_summary_final_k4.json"

if FINAL_CSV.exists(): FINAL_CSV.unlink()
name_map = {0:"Keyboards & Mice", 1:"Headsets & Audio", 2:"Games", 3:"Controllers"}

batch = 100_000
for i in range(0, N, batch):
    j = min(i+batch, N)
    block = labels_final[i:j]
    pd.DataFrame({
        text_col: df[text_col].iloc[i:j].values,
        "cluster": block,
        "cluster_name": [name_map[b] for b in block],
    }).to_csv(FINAL_CSV, mode="a", header=(i==0), index=False)
    if (j % 500_000)==0 or j==N:
        print("  wrote @", j)

json.dump({
    "k_final": 4,
    "counts": counts,
    "names": name_map,
    "kept_from_k6": uniq_kept,
}, open(SUMMARY_JSON,"w"), indent=2)

print("\nSaved:")
print("  labels_final memmap ->", LABF_PATH)
print("  assignments CSV     ->", FINAL_CSV)
print("  summary JSON        ->", SUMMARY_JSON)


Keeping clusters: [1, 2, 3, 5]  — reassigning: [0, 4]
Points to reassign: 1956251
Final K=4 sizes: {0: 913087, 1: 1185858, 2: 1618488, 3: 907182}
  wrote @ 500000
  wrote @ 1000000
  wrote @ 1500000
  wrote @ 2000000
  wrote @ 2500000
  wrote @ 3000000
  wrote @ 3500000
  wrote @ 4000000
  wrote @ 4500000
  wrote @ 4624615

Saved:
  labels_final memmap -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/labels_final_k4.memmap
  assignments CSV     -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/cluster_assignments_final_k4.csv
  summary JSON        -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/cluster_summary_final_k4.json


In [30]:
# === Label FINAL K=4 clusters with c-TF-IDF (after merge) ===
import numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

K = 4
LABF_PATH = OUT / "labels_final_k4.memmap"   # from the merge step
labels_k4 = np.memmap(LABF_PATH, dtype="int32", mode="r", shape=(len(df),))

# Stratified sample per cluster
sample_per_cluster = 30_000
idx_keep, rng = [], np.random.default_rng(42)
for c in range(K):
    idx_c = np.where(labels_k4 == c)[0]
    take = min(sample_per_cluster, len(idx_c))
    if take > 0:
        idx_keep.append(rng.choice(idx_c, size=take, replace=False))
idx_keep = np.concatenate(idx_keep) if idx_keep else np.array([], dtype=int)

df_sample = pd.DataFrame({
    text_col: df[text_col].iloc[idx_keep].astype(str).values,
    "cluster": labels_k4[idx_keep]
})
print("TF-IDF sample size:", len(df_sample))

# One big “document” per cluster
docs_per_cluster = (
    df_sample.groupby("cluster")[text_col]
      .apply(lambda s: " ".join(s))
      .reindex(range(K), fill_value="")
)

# Vectorize (min_df must be <= K; here K=4 so use 1)
vectorizer = TfidfVectorizer(
    stop_words=None,
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b[a-z][a-z]+\b",
    min_df=1,                    # <-- changed from 5
    max_df=0.90,
    max_features=150_000
)
X_ctfidf = vectorizer.fit_transform(docs_per_cluster.values)   # (K, V)
vocab = np.array(vectorizer.get_feature_names_out())

def top_terms(c_id, top_n=15):
    row = X_ctfidf[c_id].toarray().ravel()
    top_idx = np.argsort(row)[::-1][:top_n]
    return vocab[top_idx].tolist()

final_terms = {c: top_terms(c) for c in range(K)}
for c, terms in final_terms.items():
    print(f"Cluster {c}: {', '.join(terms)}")


TF-IDF sample size: 120000
Cluster 0: this mouse, this keyboard, razer, keyboard is, dpi, gaming mouse, great mouse, mechanical keyboard, keyboard for, keycaps, mouse it, great keyboard, mx, good mouse, mouse pad
Cluster 1: this headset, these headphones, gaming headset, great headset, headphones are, noise cancellation, cancellation, mic quality, headphones for, headset the, earbuds, headset that, dongle, noise canceling, headset it
Cluster 2: missions, fun game, campaign, quests, storyline, story line, plot, madden, sequel, the combat, the campaign, lego, souls, co op, voice acting
Cluster 3: this controller, these controllers, great controller, controller works, the joysticks, the dock, wii remote, good controller, controller has, genesis, dongle, original controller, wii and, controllers for, usb port


In [None]:
#  Save final summary with names and top terms ===
final_names = {
    0: "Keyboards & Mice",
    1: "Headsets & Audio",
    2: "Games",
    3: "Controllers"
}

SUMMARY_JSON = OUT / "cluster_summary_final_k4.json"

summary = {
    "k_final": 4,
    "counts": {int(k): int(v) for k, v in counts.items()},
    "names": final_names,
    "top_terms": {int(c): terms for c, terms in final_terms.items()}
}

with open(SUMMARY_JSON, "w") as f:
    json.dump(summary, f, indent=2)

print("Saved final summary JSON ->", SUMMARY_JSON)


Saved final summary JSON -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/cluster_summary_final_k4.json


# Inspecting the merged clusters

In [None]:
# --- Load final labels, attach to df, and inspect exemplars ---
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

# Paths (must match what is writen in the merge step / IPCA step)
PCA_D = 50
PCA_PATH = OUT / "embeddings_pca.memmap"            # from the IPCA cell
LABF_PATH = OUT / "labels_final_k4.memmap"          # from the merge cell (k=4 final); adjust if you chose a different name

# Load PCA memmap and final labels
N = len(df)
emb_pca = np.memmap(PCA_PATH, dtype="float32", mode="r", shape=(N, PCA_D))
labels_final = np.memmap(LABF_PATH, dtype="int32", mode="r", shape=(N,))

# Attach to df
df["cluster_final"] = labels_final

# Compute normalized centroids for each final cluster
final_ids = sorted(np.unique(labels_final).tolist())
centroids = []
for c in final_ids:
    idx = np.where(labels_final == c)[0]
    # guard for empty (shouldn't happen)
    if idx.size == 0:
        centroids.append(np.zeros((PCA_D,), dtype="float32"))
    else:
        centroids.append(emb_pca[idx].mean(axis=0))

centroids = normalize(np.vstack(centroids))  # shape (K_final, d)
emb_pca_n = normalize(emb_pca)               # normalize rows once for cosine-sim

def top_exemplars_final(c_id, top_m=5, block=200_000):
    """
    Find top_m reviews (their text + cosine similarity) closest to the final cluster c_id.
    Done in blocks so it is memory safe and fast.
    """
    idx_all = np.where(labels_final == c_id)[0]
    if idx_all.size == 0:
        return pd.DataFrame(columns=[text_col, "similarity"])

    best_idx = np.empty((0,), dtype=int)
    best_sim = np.empty((0,), dtype=float)

    # process only rows in this cluster to keep it fast
    # (you can also score all rows if you prefer)
    for s in range(0, idx_all.size, block):
        e = min(s + block, idx_all.size)
        idx = idx_all[s:e]
        sims = emb_pca_n[idx] @ centroids[c_id].reshape(-1, 1)   # (b, 1)
        sims = sims.ravel()

        if best_idx.size == 0:
            best_idx, best_sim = idx, sims
        else:
            best_idx = np.concatenate([best_idx, idx])
            best_sim = np.concatenate([best_sim, sims])

        # keep only the global top_m so far
        if best_idx.size > top_m:
            keep = np.argpartition(best_sim, -top_m)[-top_m:]
            best_idx, best_sim = best_idx[keep], best_sim[keep]

    order = np.argsort(-best_sim)
    top_idx = best_idx[order][:top_m]
    top_sim = best_sim[order][:top_m]
    return df.loc[top_idx, [text_col]].assign(similarity=np.round(top_sim, 6))

# (optional) nice names from the merge summary, if you loaded it into `sumn` earlier
name_lookup = None
try:
    name_lookup = {c: sumn["names"][str(c)] for c in final_ids}
except Exception:
    pass

# Display exemplars
TOP_M = 5
for c in final_ids:
    title = f"Final Cluster {c}" + (f" — {name_lookup[c]}" if (name_lookup and c in name_lookup) else "")
    print(f"\n=== {title} ===")
    display(top_exemplars_final(c, top_m=TOP_M))



=== Final Cluster 0 ===


Unnamed: 0,clean_text,similarity
264036,...but I definitely dig this keyboard! I actua...,0.816083
4160593,This is a solid built mous. feels btter than m...,0.80493
4332816,Love the mouse that came with it the keyboard ...,0.800335
2957102,"nice snappy keyboard, no problems with it, has...",0.792271
1949122,Great! I've had this for about a month now. I'...,0.790883



=== Final Cluster 1 ===


Unnamed: 0,clean_text,similarity
1921227,Very good customer service.good earbuds Get th...,0.779915
3286839,Great quality. Ears don’t get hot.,0.77917
876871,Excellent ear phoned,0.775858
1332680,Great cumfy headset,0.772496
4043244,The product is excellent it's my 2nd pair of 8...,0.768255



=== Final Cluster 2 ===


Unnamed: 0,clean_text,similarity
3541618,"What a great game, I wish the 6th game was mor...",0.900225
1255382,Enjoyed this game of very much. It has several...,0.897458
1978803,This game is so much bigger than the first one...,0.890624
2696791,Amazing and fun game. You have full control on...,0.88184
1714051,I knew that I wanted this game due to loving m...,0.881204



=== Final Cluster 3 ===


Unnamed: 0,clean_text,similarity
4019803,Bought 2 of the product. Only of one them work...,0.865142
3536719,"I bought 2 of these controllers, and one didn'...",0.8558
898249,The magnetic dongle made my controller stop wo...,0.853049
320878,The replacement joy stick didn’t work (neither...,0.848465
396057,"The console works perfectly, but the controlle...",0.847356


Now seems to be the 4 categorical clusters again. Keyboard/mice, audio/headset, games, controllers

In [None]:
# Saving final cluster assignments and summary (project-ready outputs) ---
import numpy as np, pandas as pd, json

labels_np = np.array(labels_mm)  # from the labels.memmap

# Make short human-readable labels from top TF-IDF terms
def make_label(terms, n=4):
    return ", ".join(terms[:n])

final_names = {c: make_label(cluster_terms[c]) for c in range(K)}
counts      = pd.Series(labels_np).value_counts().sort_index()

# Stream assignments to CSV (text, cluster, cluster_name)
FINAL_CSV = OUT / "cluster_assignments_final.csv"
if FINAL_CSV.exists(): FINAL_CSV.unlink()

name_map = np.array([final_names[i] for i in range(K)], dtype=object)

print("Writing final assignments to:", FINAL_CSV)
BATCH = 100_000
N = len(df)
for i in range(0, N, BATCH):
    j = min(i + BATCH, N)
    block_labels = labels_np[i:j]
    pd.DataFrame({
        text_col: df[text_col].iloc[i:j].values,
        "cluster": block_labels,
        "cluster_name": name_map[block_labels],
    }).to_csv(FINAL_CSV, mode="a", header=(i==0), index=False)
    if (j % 500_000) == 0 or j == N:
        print("  wrote @", j)

# Compact summary for the next stage (summarization)
SUMMARY_JSON = OUT / "cluster_summary_final.json"
json.dump(
    {
        "k_final": K,
        "counts": counts.to_dict(),
        "names": final_names,
        "top_terms": cluster_terms,   # from TF-IDF cell
    },
    open(SUMMARY_JSON, "w"),
    indent=2,
)

print("\nSaved:")
print(FINAL_CSV)
print(SUMMARY_JSON)


In [None]:
# Inspect top-5 most representative reviews per cluster (chunked)
# Inputs we already have:
# - df, text_col
# - K (4)
# - kmeans (fitted MiniBatchKMeans)
# - labels_mm (memmap of shape (N,))
# - OUT / "embeddings_pca.memmap"

N = len(df)
PCA_D = 50
PCA_PATH = OUT / "embeddings_pca.memmap"

# load PCA memmap and labels
emb_pca  = np.memmap(PCA_PATH, dtype="float32", mode="r", shape=(N, PCA_D))
labels_np = np.array(labels_mm)

# normalize centroids and rows for cosine-like similarity in PCA space
centroids = normalize(kmeans.cluster_centers_)   # (K, 50)

def top_exemplars_for_cluster(c_id, top_m=5, block=200_000):
    """Return (idx, sims) of the top_m reviews closest to centroid c_id."""
    idx_all = np.where(labels_np == c_id)[0]
    if idx_all.size == 0:
        return np.array([], dtype=int), np.array([], dtype=float)

    best_idx = np.empty((0,), dtype=int)
    best_sim = np.empty((0,), dtype=float)

    for s in range(0, idx_all.size, block):
        e = min(s + block, idx_all.size)
        block_idx = idx_all[s:e]
        X = emb_pca[block_idx]                   # (b, 50)
        Xn = normalize(np.asarray(X))            # row-normalize
        sims = (Xn @ centroids[c_id].reshape(-1)).astype("float32")  # (b,)

        # keep only the global top_m so far
        cand_idx = np.concatenate([best_idx, block_idx])
        cand_sim = np.concatenate([best_sim, sims])
        if cand_sim.size > top_m:
            keep = np.argpartition(cand_sim, -top_m)[-top_m:]
            best_idx, best_sim = cand_idx[keep], cand_sim[keep]
        else:
            best_idx, best_sim = cand_idx, cand_sim

    # final sort descending
    order = np.argsort(-best_sim)
    return best_idx[order][:top_m], best_sim[order][:top_m]

# Pretty print with cluster names if available (from your save-artifacts cell)
name_lookup = None
try:
    name_lookup = {c: final_names[c] for c in range(K)}
except Exception:
    pass

for c in range(K):
    title = f"Cluster {c}" + (f" — {name_lookup[c]}" if name_lookup else "")
    print(f"\n=== {title} ===")
    idx, sim = top_exemplars_for_cluster(c, top_m=5)
    if idx.size == 0:
        print("(empty)")
        continue
    display(pd.DataFrame({
        text_col: df[text_col].iloc[idx].values,
        "similarity": np.round(sim, 6),
    }))


=== Cluster 0 ===


Unnamed: 0,clean_text,similarity
181559,Honestly...These are probably some of the best...,0.9056
100723,This is a nice headset; I usually forgo using ...,0.903763
184244,So I got this head set for work and gaming it ...,0.899619
2984,I night this headset to use at work. I have ha...,0.897688
169120,I bought the headset to use in meetings when t...,0.897651



=== Cluster 1 ===


Unnamed: 0,clean_text,similarity
36990,"What a great game, I wish the 6th game was mor...",0.89396
115973,"When I ordered this game, I was expecting a fu...",0.882689
178376,I was excited when this game was released sinc...,0.873032
89743,I got this game because I had played it before...,0.866506
242515,Having now put in over 60 hours into this game...,0.865576



=== Cluster 2 ===


Unnamed: 0,clean_text,similarity
212620,It's worked as advertised. It just came in dam...,0.795555
135868,Piece of crap!!! Worked for about 2 days @ the...,0.771417
54276,"Didn't work, why would you sell something that...",0.759157
141023,This worked less than 4 weeks after purchase. ...,0.755014
74053,Terrible product. Possibly going to return if ...,0.751767



=== Cluster 3 ===


Unnamed: 0,clean_text,similarity
106690,wonderfull,0.838165
48002,WONDERFULL,0.838165
145938,fabulous,0.817928
109643,fabulous,0.817927
78735,Fabulous,0.817927



=== Cluster 4 ===


Unnamed: 0,clean_text,similarity
183957,Truly impressed with the quality of both the k...,0.900223
226680,This mechanical keyboard and mouse are awesome...,0.88974
247813,The Mechanical Keyboard and Mouse Combo is an ...,0.878787
116584,I actually use this for work. I have so many k...,0.878561
194511,"This is truly a great mouse, and Logitech is t...",0.877875



=== Cluster 5 ===


Unnamed: 0,clean_text,similarity
122758,"I bought 2 of these controllers, and one didn'...",0.8541
47885,The left joystick has a huge dead spot in it. ...,0.840303
10585,not compatible with next generation controller...,0.833773
243868,Console works great but the controller came wi...,0.826988
131276,I was a little bit worried if it would work wi...,0.821821


In [None]:
# --- attach final labels & export artifacts (exemplars + top terms) ---
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

# Paths produced by the merge cell — change if you used other names
LABF_PATH     = OUT / "labels_final_k4.memmap"
SUMMARY_JSON  = OUT / "cluster_summary_final_k4.json"

# 0) Re-attach the final labels to the full dataframe
labels_final = np.memmap(LABF_PATH, dtype="int32", mode="r", shape=(len(df),))
df["final_cluster"] = np.asarray(labels_final)

# 1) Exemplars (top-m most representative reviews) per final cluster
TOP_M = 5

# Use the same PCA-reduced embeddings we already have on disk
PCA_PATH = OUT / "embeddings_pca.memmap"         # created in the IPCA step
emb_pca  = np.memmap(PCA_PATH, dtype="float32", mode="r", shape=(len(df), 50))

# Compute centroids for the kept clusters (0,1,2,3) in PCA space and normalise for cosine sim
final_ids = [0, 1, 2, 3]
centroids_final = []
for c in final_ids:
    idx = np.where(df["final_cluster"].values == c)[0]
    if idx.size:
        centroids_final.append(emb_pca[idx].mean(axis=0))
centroids_final = normalize(np.vstack(centroids_final))
emb_pca_n       = normalize(emb_pca)

def exemplars_for_final(c_id: int, top_m: int = TOP_M) -> pd.DataFrame:
    idx = np.where(df["final_cluster"].values == c_id)[0]
    if idx.size == 0:
        return pd.DataFrame(columns=[text_col, "similarity"])
    sims  = emb_pca_n[idx] @ centroids_final[final_ids.index(c_id)].reshape(-1, 1)
    sims  = sims.ravel()
    order = np.argsort(-sims)[:top_m]
    return (
        df.iloc[idx[order]][[text_col]]
          .assign(similarity=sims[order])
    )

# Write all exemplars to a single CSV (stacked)
ex_rows = []
for c in final_ids:
    ex = exemplars_for_final(c, TOP_M)
    if not ex.empty:
        ex.insert(0, "final_cluster", c)
        ex_rows.append(ex)
exemplars_all = pd.concat(ex_rows, ignore_index=True) if ex_rows else pd.DataFrame(columns=["final_cluster", text_col, "similarity"])
EXEMPLARS_CSV = OUT / "final_cluster_exemplars.csv"
exemplars_all.to_csv(EXEMPLARS_CSV, index=False)

# 2) Save final cluster names & top terms (read from the summary JSON we already saved)
with open(SUMMARY_JSON, "r") as f:
    summ = json.load(f)

# JSON has string keys — make int-keyed accessors
names_map = {int(k): v for k, v in summ.get("names", {}).items()}
terms_map = {int(k): v for k, v in summ.get("top_terms", {}).items()}

rows = []
for c in final_ids:
    name  = names_map.get(c, f"Cluster {c}")
    terms = terms_map.get(c, [])
    rows.append({
        "final_cluster": c,
        "cluster_name": name,
        "top_terms": ", ".join(terms)
    })
TERMS_CSV = OUT / "final_cluster_top_terms.csv"
pd.DataFrame(rows).to_csv(TERMS_CSV, index=False)

print("Saved:")
print(" - labels reattached from ->", LABF_PATH)
print(" - exemplars           ->", EXEMPLARS_CSV)
print(" - final cluster terms ->", TERMS_CSV)
print(" - summary json        ->", SUMMARY_JSON)


Saved:
 - labels reattached from -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/labels_final_k4.memmap
 - exemplars           -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/final_cluster_exemplars.csv
 - final cluster terms -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/final_cluster_top_terms.csv
 - summary json        -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/cluster_summary_final_k4.json
