## Review Summarization Using Generative AI

## Imports

In [None]:
import json, pathlib, pandas as pd
import re
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer

## Loading the required files from Google Drive

In [None]:
# Make Google Drive visible to this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Run folder written by the clustering notebook (timestamp must match the saved run)
RUN_DIR = pathlib.Path("/content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924")

# --- Per-review cluster assignments ---
assignments_path = RUN_DIR / "cluster_assignments_final_k4.csv"
assignments = pd.read_csv(assignments_path)
print("Assignments shape:", assignments.shape)
print(assignments.head())

# --- Cluster-level summary (JSON) ---
summary_path = RUN_DIR / "cluster_summary_final_k4.json"
with open(summary_path, "r") as f:
    summary = json.load(f)

print("Summary keys:", summary.keys())
print("Cluster counts:", summary["counts"])
print("Cluster names:", summary["names"])

# --- Exemplar reviews and top terms, loaded only for manual inspection ---
exemplars, top_terms = (
    pd.read_csv(RUN_DIR / csv_name)
    for csv_name in ("final_cluster_exemplars.csv", "final_cluster_top_terms.csv")
)

print("Exemplars sample:\n", exemplars.head())
print("Top terms:\n", top_terms)


Mounted at /content/drive
Assignments shape: (4624615, 3)
                                          clean_text  cluster  \
0  I’m playing on ps5 and it’s interesting. It’s ...        2   
1  Nostalgic fun. A bit slow. I hope they don’t s...        2   
2  This was an order for my kids & they have real...        2   
3  These work great, They use batteries which is ...        3   
4  I would recommend to anyone looking to add jus...        0   

       cluster_name  
0             Games  
1             Games  
2             Games  
3       Controllers  
4  Keyboards & Mice  
Summary keys: dict_keys(['k_final', 'counts', 'names', 'top_terms'])
Cluster counts: {'0': 913087, '1': 1185858, '2': 1618488, '3': 907182}
Cluster names: {'0': 'Keyboards & Mice', '1': 'Headsets & Audio', '2': 'Games', '3': 'Controllers'}
Exemplars sample:
    final_cluster                                         clean_text  \
0              0  ...but I definitely dig this keyboard! I actua...   
1              0  

## Load clustering outputs, attach to the full review corpus, make product stats, save corpus for generation

In [None]:
# Locations of the clustering artifacts and of the full review parquet.
# Edit RUN_DIR to point at your run folder; the parquet is the one used for clustering.
RUN_DIR       = pathlib.Path("/content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924")
ASSIGN_CSV    = RUN_DIR / "cluster_assignments_final_k4.csv"     # written by the merge step
SUMMARY_JSON  = RUN_DIR / "cluster_summary_final_k4.json"        # written by the merge step
PARQUET_FULL  = pathlib.Path("/content/drive/MyDrive/Project_NLP/video_games_preprocessed.parquet")

# Read the clustering artifacts
assignments = pd.read_csv(ASSIGN_CSV)
with open(SUMMARY_JSON, "r") as f:
    summary = json.load(f)

# The assignments file may label the cluster column either way; take the first one present
cluster_col = next(
    (name for name in ("cluster", "cluster_merged") if name in assignments.columns),
    None,
)
if cluster_col is None:
    raise KeyError(
        f"Could not find 'cluster' or 'cluster_merged' in {ASSIGN_CSV.name}. "
        f"Columns present: {assignments.columns.tolist()}"
    )

# Review-level parquet (one row per review: clean_text plus product metadata)
reviews = pd.read_parquet(PARQUET_FULL)

# Lower-cased view of the review column names
cols = set(reviews.columns.str.lower())

def pick(*candidates, required=False):
    """Return the first column of ``reviews`` that matches one of ``candidates``.

    Candidates are tried in the order given. For each one an exact match is
    preferred; failing that, the first column whose lower-cased name equals the
    lower-cased candidate is returned (with its original spelling).

    Returns None when nothing matches, unless ``required`` is true, in which
    case a KeyError is raised.
    """
    for wanted in candidates:
        if wanted in reviews.columns:
            return wanted
        # No exact hit: fall back to a case-insensitive comparison
        folded = wanted.lower()
        loose_hit = next((name for name in reviews.columns if name.lower() == folded), None)
        if loose_hit is not None:
            return loose_hit
    if not required:
        return None
    raise KeyError(f"Could not find any of the columns: {candidates}")

# Resolve the actual column names in `reviews` (first matching candidate wins;
# a KeyError is raised by pick() if none of the candidates is present).
text_col         = pick("clean_text", required=True)
product_id_col   = pick("product_id", "asin", "productId", "product_id_str", required=True)
product_title_col= pick("product_title", "title", "product_name", required=True)
rating_col       = pick("rating", "overall", "stars", required=True)

# Sanity check alignment (assignments should align row-for-row with the parquet)
# NOTE(review): this only compares lengths; equal row counts do not prove that
# both tables are in the same order — confirm against the clustering step.
if len(assignments) != len(reviews):
    raise ValueError(
        f"Row count mismatch: assignments={len(assignments)} vs reviews={len(reviews)}. "
        "They should be identical because clustering preserved the review order."
    )

# Attach clusters to the review-level DataFrame
# (.values assigns positionally, ignoring the two frames' indexes)
reviews = reviews.copy()
reviews["cluster"] = assignments[cluster_col].values

# The summary JSON stores cluster ids as string keys; convert them to int so
# they can be looked up with the values in the "cluster" column.
names_map = {int(k): v for k, v in summary.get("names", {}).items()}
reviews["cluster_name"] = reviews["cluster"].map(names_map)

# Build product-level stats for display (top products per cluster):
# one row per (product id, title, cluster, cluster name) with the number of
# ratings `n` and their mean `avg_rating`.
prod_stats = (
    reviews
    .groupby([product_id_col, product_title_col, "cluster", "cluster_name"], dropna=False)
    .agg(n=(rating_col, "count"), avg_rating=(rating_col, "mean"))
    .reset_index()
)

# Show top-10 per cluster to sanity-check (most-reviewed first, ties broken by rating)
for c in sorted(reviews["cluster"].dropna().unique()):
    title = names_map.get(int(c), f"Cluster {c}")
    print(f"\n== Cluster {c} ({title}) — top 10 products ==")
    display(
        prod_stats.loc[prod_stats["cluster"] == c]
                  .sort_values(["n", "avg_rating"], ascending=[False, False])
                  .head(10)
                  [[product_id_col, product_title_col, "n", "avg_rating"]]
    )

# Save slim review-level corpus for generation
summ_cols = [
    "cluster", "cluster_name",
    product_id_col, product_title_col,
    rating_col, text_col
]
# Keep only the columns that actually exist in `reviews`
summ_cols = [c for c in summ_cols if c in reviews.columns]

summ_df = reviews[summ_cols].copy()
OUT_CORPUS = RUN_DIR / "summarization_corpus.parquet"
summ_df.to_parquet(OUT_CORPUS, index=False)

print("\nSaved review-level generation corpus ->", OUT_CORPUS)
print("Preview:")
# Fixed random_state so the preview rows are the same on every run
display(summ_df.sample(5, random_state=42))



== Cluster 0 (Keyboards & Mice) — top 10 products ==


Unnamed: 0,asin,title,n,avg_rating
169408,B01N3ASPNV,amFilm Tempered Glass Screen Protector for Nin...,11418,4.713873
116833,B00HTK1NCS,Redragon M601 RGB Gaming Mouse Backlit Wired E...,5429,4.135016
108479,B00E4MQODC,Logitech G602 Lag-Free Wireless Gaming Mouse –...,4527,4.066932
69658,B003DZ165W,"Kindle Lighted Leather Cover, Black (Fits Kind...",4414,4.338242
127034,B00NLZUM36,"Redragon S101 Gaming Keyboard, M601 Mouse, RGB...",4130,3.83414
206892,B07GBZ4Q68,Logitech G502 HERO High Performance Wired Gami...,3799,3.77731
111727,B00FNKMVUO,UtechSmart Venus Pro RGB Wireless MMO Gaming M...,3791,4.361118
144716,B016MAK38U,Redragon K552 Mechanical Gaming Keyboard Rainb...,3592,3.998051
93965,B0086UK7IQ,"Logitech G600 MMO Gaming Mouse, RGB Backlit, 2...",3404,3.918625
240896,B07XP4K152,UtechSmart Venus Pro RGB Wireless MMO Gaming M...,2994,4.480294



== Cluster 1 (Headsets & Audio) — top 10 products ==


Unnamed: 0,asin,title,n,avg_rating
158026,B01H6GUCCQ,"BENGOO Stereo Pro Gaming Headset for PS4, PC, ...",13142,4.163826
105871,B00CQ35C1Q,Logitech 981-000536 G430 7.1 Gaming Headset wi...,4908,3.700896
169409,B01N3ASPNV,amFilm Tempered Glass Screen Protector for Nin...,4635,4.570658
231320,B07TC8J6HK,Jeecoo V20U USB Pro Gaming Headset for PC - 7....,4588,4.228858
116953,B00HVBPRUO,Gold Wireless Stereo Headset - PlayStation 4,4545,4.039384
80542,B004RMK5QG,PlayStation Plus: 12 Month Membership [Digital...,4291,4.762992
162161,B01L2ZRYVE,"HyperX Cloud Stinger – Gaming Headset, Lightwe...",4183,3.688262
107979,B00DU2CHE2,Stereo Gaming Headphone Headset with Microphon...,4116,3.576774
131737,B00SAYCXWG,HyperX Cloud II Wireless - Gaming Headset for ...,3844,4.067638
140281,B012DFI02O,VersionTECH. G2000 Gaming Headset for PS5 PS4 ...,3485,4.078336



== Cluster 2 (Games) — top 10 products ==


Unnamed: 0,asin,title,n,avg_rating
85940,B005GFPZYK,American Sniper: The Autobiography of the Most...,4804,4.547669
167068,B01MS6MO77,The Legend of Zelda: Breath of the Wild Master...,4166,4.797888
91518,B007FTE2VW,SimCity: Limited Edition,3564,1.471661
82872,B0050SYILE,Grand Theft Auto V: Premium Edition - Xbox One...,3350,4.497015
57,0375869026,Wonder,3143,4.851098
170001,B01N5OKGLH,Super Smash Bros. Ultimate: Challenger Pack 2 ...,2793,4.650913
91325,B007CM0K86,The Last of Us - PlayStation 3,2751,4.683751
82748,B0050SXKU4,Grand Theft Auto V: Premium Edition - Xbox One...,2726,4.552458
168740,B01N1037CV,Mario Kart 8 Deluxe – Booster Course Pass - Ni...,2649,4.730087
168033,B01MY7GHKJ,Super Mario Odyssey - Nintendo Switch,2538,4.756895



== Cluster 3 (Controllers) — top 10 products ==


Unnamed: 0,asin,title,n,avg_rating
103012,B00BGA9WK2,PlayStation 4 500GB Console [Old Model][Discon...,3917,3.880266
171261,B01NAWKYZ0,Nintendo Switch Pro Controller,3589,4.452494
228915,B07SFKTLZM,Xbox Elite Wireless Controller Series 2 – Black,3167,2.580676
66074,B002VBWIP6,Xbox Live Gold: 1 Month Membership [Digital Code],3051,4.074402
80399,B004QRKWLA,Microsoft Xbox 360 Wired Controller for Window...,2927,4.295524
138104,B00ZDNNRB8,Xbox Elite Wireless Controller,2722,3.143277
163377,B01LWVX2RG,DualShock 4 Wireless Controller for PlayStatio...,2692,3.479941
162809,B01LPZM7VI,Xbox Wireless Controller – Black,2490,3.235743
157326,B01GW3H3U8,Xbox Wireless Controller – White,2318,3.57981
48943,B0015AARJI,PlayStation 3 Dualshock 3 Wireless Controller ...,2209,3.668628



Saved review-level generation corpus -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/summarization_corpus.parquet
Preview:


Unnamed: 0,cluster,cluster_name,asin,title,rating,clean_text
1112677,2,Games,B00006LELB,The Legend of Zelda: A Link to the Past,5,The origional LOZ was and still is my favorite...
612761,2,Games,B07DM3LYVV,NBA 2K19 20th Anniversary Edition - Pre-load -...,5,2K has brought it back with this edition. I ha...
1402617,0,Keyboards & Mice,B072V9ZBSK,J&TOP Portable DIY Replacement Dock Mount Case...,5,It is very important to note that this device ...
3777127,2,Games,B002Q21X7Y,Fable II: Limited Collectors Edition,5,Let me be the first to say that this is not a ...
1549429,2,Games,B000066TS2,Super Ghouls'n Ghosts - Game Boy Advance,3,"I know what it is now, IT'S ALMOST IMPOSSIBLE ..."


## Summarization prep: Top products, worst product, and top complaints (per cluster)
This identifies which products should be used later in the generative AI steps.

In [11]:
# === Summarization prep with category-guard rails (top products, worst, complaints) ===
import re, json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer

# ---- Paths (OUT_CORPUS is created by the previous corpus cell) ----
# If you restarted the session and RUN_DIR/OUT_CORPUS are not defined, uncomment and set:
# RUN_DIR   = Path("/content/drive/MyDrive/Project_NLP/runs/clustering_full_YYYYMMDD_HHMM")
OUT_CORPUS  = RUN_DIR / "summarization_corpus.parquet"   # from previous cell
OUT_JSONL   = RUN_DIR / "cluster_summaries_draft.jsonl"  # output (one JSON per cluster)

# ---- Tunables ----
MIN_REVIEWS_PER_PRODUCT = 150   # ignore tiny-review products
TOP_N                    = 3     # how many top products to report per cluster
LOW_RATING_MAX           = 2     # 1–2 star reviews are "complaints"
COMPLAINT_TERMS          = 12    # how many complaint terms to extract

# ---- Simple include/exclude rules by cluster (titles only) ----
# Each cluster id maps to two lists of regex patterns that are matched against
# the product title: a title is kept when it matches at least one "include"
# pattern and no "exclude" pattern.
# Edit terms to your naming if you change cluster labels.
# c=0: Keyboards & Mice, c=1: Headsets & Audio, c=2: Games, c=3: Controllers
CLUSTER_RULES = {
    0: {  # Keyboards & Mice
        "include": [
            r"\b(mouse|mice|keyboard|keycap|keycaps|switch(es)?|dpi|mechanical)\b",
        ],
        # NOTE(review): "switch" appears in both lists, so a title containing
        # the singular word "switch" is always excluded; only the plural
        # "switches" can pass via the include rule — confirm this is intended.
        "exclude": [
            r"\b(case|cover|cable|cord|dock|stand|adapter|skin|sticker|bag|pouch|protector|screen)\b",
            r"\b(extender|extension|grip|repair|tool|kit|mount|holder|card|oled|switch)\b",
        ],
    },
    1: {  # Headsets & Audio
        "include": [
            r"\b(headset|headsets|headphone(s)?|earbud(s)?|microphone|mic)\b",
        ],
        "exclude": [
            r"\b(controller|mouse|keyboard|case|cover|cable|adapter|charger|dock)\b",
        ],
    },
    2: {  # Games
        # Either a product-type word or a platform name is enough to include a title
        "include": [
            r"\b(game|edition|digital code|dlc|pack|collection|bundle)\b",
            r"\b(nintendo|switch|playstation|ps[345]|xbox|wii|3ds|psp|vita)\b",
        ],
        "exclude": [
            r"\b(controller|headset|headphone|mouse|keyboard|case|grip|cable|card|micro ?sd|sd card|charger|battery)\b",
        ],
    },
    3: {  # Controllers
        "include": [
            r"\b(controller|gamepad|joystick|pro controller|dualshock|elite controller)\b",
        ],
        "exclude": [
            r"\b(dock|charger|battery|shell|case|stand|repair|kit|cable|tool)\b",
        ],
    },
}

def compile_rules(rules):
    """Turn one cluster's rule dict into ``(include_patterns, exclude_patterns)``.

    Every pattern string under the "include" / "exclude" keys is compiled
    case-insensitively; a missing key yields an empty list.
    """
    def _compile_all(key):
        return [re.compile(pattern, re.I) for pattern in rules.get(key, [])]

    return _compile_all("include"), _compile_all("exclude")

# Pre-compiled (include, exclude) pattern lists, keyed by cluster id
COMPILED_RULES = {c: compile_rules(r) for c, r in CLUSTER_RULES.items()}

def title_matches(title: str, inc, exc) -> bool:
    """Decide whether a product title passes the include/exclude patterns.

    Args:
        title: Product title. Anything that is not a string (None, NaN, a
            number) is treated as an empty title instead of being handed to
            the regex engine, which would raise TypeError.
        inc: Compiled "include" patterns; when non-empty, at least one must match.
        exc: Compiled "exclude" patterns; none may match.

    Returns:
        True if the title matches any include pattern (or there are none) and
        matches no exclude pattern.
    """
    text = title if isinstance(title, str) else ""
    if inc and not any(p.search(text) for p in inc):
        return False
    if exc and any(p.search(text) for p in exc):
        return False
    return True

def filter_by_rules(df, cluster_id, title_col, verbose=False):
    """Keep only the rows of ``df`` whose title passes the rules of ``cluster_id``.

    The rules are looked up in COMPILED_RULES. If the cluster has no rules at
    all, ``df`` itself is returned unchanged; otherwise a filtered copy is
    returned. With ``verbose`` a one-line kept/total count is printed.
    """
    include_pats, exclude_pats = COMPILED_RULES.get(cluster_id, ([], []))
    if not (include_pats or exclude_pats):
        return df

    # Titles are cast to str before matching
    titles = df[title_col].astype(str)
    keep = titles.apply(lambda s: title_matches(s, include_pats, exclude_pats))
    kept = df.loc[keep].copy()

    if verbose:
        print(f"  · Filtered by rules: kept {len(kept):,}/{len(df):,} for cluster {cluster_id}")
    return kept

# ---- TF-IDF helper for complaint terms ----
def top_terms_tfidf(texts, n_terms=COMPLAINT_TERMS):
    """Return the ``n_terms`` highest-scoring TF-IDF terms for a set of texts.

    Args:
        texts: Iterable of strings (list, Series, ...) or None. Non-string and
            blank entries are dropped.
        n_terms: Maximum number of terms to return.

    Returns:
        A list of unigrams/bigrams ordered by summed TF-IDF score (highest
        first); an empty list when there is no usable text or no vocabulary
        survives any of the attempts below.
    """
    # `texts or []` would raise on a Series/ndarray (ambiguous truth value),
    # so test for None explicitly.
    source = [] if texts is None else texts
    texts = [t.strip() for t in source if isinstance(t, str) and t.strip()]
    if not texts:
        return []

    # Progressively looser document-frequency limits:
    #   (2, 0.9) normal case
    #   (1, 0.9) small corpora
    #   (1, 1.0) last resort — with max_df=0.9 a single document (or a corpus
    #            whose terms all occur in every document) always raises, which
    #            previously made products with one complaint return nothing.
    for min_df, max_df in ((2, 0.9), (1, 0.9), (1, 1.0)):
        try:
            vec = TfidfVectorizer(
                stop_words="english",
                ngram_range=(1, 2),
                token_pattern=r"(?u)\b[a-z][a-z]+\b",
                min_df=min_df,
                max_df=max_df,
                max_features=150_000,
            )
            X = vec.fit_transform(texts)
        except ValueError:
            # Empty/pruned vocabulary or incompatible min_df/max_df: try the next setting
            continue
        scores = np.asarray(X.sum(axis=0)).ravel()
        idx = np.argsort(scores)[::-1][:n_terms]
        vocab = np.array(vec.get_feature_names_out())
        return vocab[idx].tolist()
    return []

# ---- Load review-level generation corpus ----
# (the parquet written to OUT_CORPUS by the corpus-building cell)
df = pd.read_parquet(OUT_CORPUS)

# ---- Column names (case-insensitive safety) ----
# Set of the corpus column names, lower-cased
cols = set(df.columns.str.lower())
def pick(col, *candidates, required=False, columns=None):
    """Resolve a logical column to the column label actually present.

    Matching is case-insensitive, but the label is returned with its original
    spelling so it can be used to index the frame directly (the previous
    version returned the lower-cased name, which is not a valid label when the
    real column is mixed-case).

    Args:
        col: Preferred column name; tried first.
        *candidates: Alternative names, tried in order after ``col``.
        required: Raise KeyError instead of returning None when nothing matches.
        columns: Optional iterable of column labels to search; defaults to the
            columns of the module-level ``df``.

    Returns:
        The matching column label, or None if there is no match and
        ``required`` is false.
    """
    available = list(df.columns if columns is None else columns)
    # Map lower-cased name -> original label (first occurrence wins)
    by_lower = {}
    for label in available:
        by_lower.setdefault(str(label).lower(), label)

    wanted = (col,) + candidates
    for name in wanted:
        hit = by_lower.get(name.lower())
        if hit is not None:
            return hit
    if required:
        raise KeyError(f"Could not find any of {wanted} in columns:\n{sorted(map(str, available))[:20]} ...")
    return None

cluster_col      = pick("cluster",       required=True)
cluster_name_col = pick("cluster_name",  required=True)
product_id_col   = pick("asin", "product_id", "productid", "product_id_str", required=True)
product_title_col= pick("title", "product_title", required=True)
rating_col       = pick("rating", "overall", "stars", required=True)
text_col         = pick("clean_text", required=True)

# ---- Build product-level stats per cluster ----
# One row per (product, title, cluster, cluster name): rating count and mean rating
prod_stats = (
    df.groupby([product_id_col, product_title_col, cluster_col, cluster_name_col], dropna=False)
      .agg(n=(rating_col, "count"), avg_rating=(rating_col, "mean"))
      .reset_index()
)

# Keep only products with enough reviews for ranking / worst
eligible = prod_stats.loc[prod_stats["n"] >= MIN_REVIEWS_PER_PRODUCT].copy()

# Complaint reviews (rating <= LOW_RATING_MAX), filtered once instead of
# re-scanning the full review frame for every product.
# Note: complaints are taken from all of a product's low-rated reviews,
# regardless of which cluster each individual review was assigned to.
low_rated = df.loc[df[rating_col] <= LOW_RATING_MAX, [product_id_col, text_col]]


def _product_record(row):
    """Build the JSON-ready record (ids, stats, complaint terms) for one product row."""
    asin = row[product_id_col]
    complaint_texts = (
        low_rated.loc[low_rated[product_id_col] == asin, text_col].astype(str).tolist()
    )
    return {
        "asin": asin,
        "title": row[product_title_col],
        "n_reviews": int(row["n"]),
        "avg_rating": round(float(row["avg_rating"]), 4),
        "top_complaints": top_terms_tfidf(complaint_texts, n_terms=COMPLAINT_TERMS),
    }


summaries = []
for c in sorted(eligible[cluster_col].dropna().unique()):
    c_int = int(c)

    # Candidates for this cluster
    cand = eligible.loc[eligible[cluster_col] == c].copy()

    # Cluster display name; fall back to a generic label if every name is missing
    known_names = cand[cluster_name_col].dropna()
    c_name = known_names.iloc[0] if len(known_names) else f"Cluster {c_int}"

    # Apply category rules on titles (with fallback if all filtered out)
    cand_f = filter_by_rules(cand, c_int, product_title_col, verbose=True)
    if len(cand_f) == 0:
        cand_f = cand  # fallback

    # --- top-N products (highest average rating; tie-break: more reviews) ---
    cand_top = cand_f.sort_values(["avg_rating", "n"], ascending=[False, False]).head(TOP_N)
    top_list = [_product_record(row) for _, row in cand_top.iterrows()]

    # --- worst product (lowest avg among filtered eligible; tie-break: more reviews) ---
    worst = cand_f.sort_values(["avg_rating", "n"], ascending=[True, False]).head(1)
    worst_prod = _product_record(worst.iloc[0]) if len(worst) == 1 else None

    summaries.append({
        "cluster": c_int,
        "cluster_name": c_name,
        "top_products": top_list,
        "worst_product": worst_prod,
    })

# ---- Save as JSONL (one JSON per line) ----
# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# pinned to UTF-8 rather than left to the platform default.
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for rec in summaries:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print(f"\nSaved cluster summaries -> {OUT_JSONL}")
if summaries:
    print("\nSample (cluster 0) preview:")
    print(json.dumps(next((s for s in summaries if s['cluster']==0), summaries[0]), indent=2, ensure_ascii=False))
else:
    # Nothing met MIN_REVIEWS_PER_PRODUCT; avoid indexing into an empty list
    print("\nNo cluster summaries were produced (no eligible products).")


  · Filtered by rules: kept 370/812 for cluster 0
  · Filtered by rules: kept 460/1,143 for cluster 1
  · Filtered by rules: kept 1,421/1,925 for cluster 2
  · Filtered by rules: kept 334/930 for cluster 3

Saved cluster summaries -> /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/cluster_summaries_draft.jsonl

Sample (cluster 0) preview:
{
  "cluster": 0,
  "cluster_name": "Keyboards & Mice",
  "top_products": [
    {
      "asin": "B07FL2LSBH",
      "title": "Razer Gaming Mouse Bungee v2: Drag-Free Wired Mouse Support - for Esports-Level Performance - Classic Black (RC21-01210100-R3M1)",
      "n_reviews": 339,
      "avg_rating": 4.8732,
      "top_complaints": [
        "like",
        "cord",
        "weight",
        "money",
        "actually"
      ]
    },
    {
      "asin": "B07NP6QQ9C",
      "title": "JUEYINGBAILI JYZZ RGB Gaming Mouse Pad - Large Led Keyboard Pad, Mouse Mat with HD Map, Smoothly Waterproof Surface, Non-Slip Rubber Base, 31.5inin X 1

## Full inspection on all clusters

In [12]:
# --- Inspect cluster summaries + sanity check content for ALL clusters ---

import json, re, numpy as np, pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep an existing RUN_DIR from an earlier cell; only fall back to the default
# when the name is not defined yet (edit the default if your run folder differs)
if "RUN_DIR" not in globals():
    RUN_DIR = Path("/content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924")  # <-- adjust if needed

# Inputs for the inspection below: the draft summaries and the review-level corpus
OUT_CORPUS = RUN_DIR / "summarization_corpus.parquet"
OUT_JSONL = RUN_DIR / "cluster_summaries_draft.jsonl"

def trunc(s, n=95):
    """Collapse whitespace in ``str(s)`` and cut it to ``n`` characters.

    Runs of whitespace become a single space and the ends are trimmed. If the
    result is longer than ``n``, the first ``n`` characters followed by an
    ellipsis are returned; otherwise the result is returned unchanged.
    """
    collapsed = re.sub(r"\s+", " ", str(s)).strip()
    if len(collapsed) <= n:
        return collapsed
    return collapsed[:n] + "…"

# Load summaries (JSON Lines: one object per line).
# The file holds raw non-ASCII characters, so read it as UTF-8 explicitly, and
# skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
summaries = []
with open(OUT_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            summaries.append(json.loads(line))
sum_df = pd.DataFrame(summaries).sort_values("cluster")

# Load corpus with review-level rows
corpus = pd.read_parquet(OUT_CORPUS)
# Normalize expected columns: lower-cased name -> actual column label
corpus_cols = {c.lower(): c for c in corpus.columns}
def pick(*cands, required=True):
    """Return the actual corpus column for the first matching candidate name.

    Candidates are compared case-insensitively (they are lower-cased before the
    lookup, so a mixed-case candidate can still match). Returns None when
    nothing matches and ``required`` is false; raises KeyError otherwise.
    """
    for c in cands:
        key = c.lower()
        if key in corpus_cols:
            return corpus_cols[key]
    if required:
        raise KeyError(f"Could not find any of {cands} in corpus columns: {list(corpus.columns)[:20]}…")
    return None

cluster_col = pick("cluster")
title_col   = pick("title", "product_title")
text_col    = pick("clean_text")
rating_col  = pick("rating", "overall", "stars", required=False)  # optional: only reported below

print(f"Loaded summaries: {len(sum_df)} clusters")
print(f"Corpus shape: {corpus.shape}, columns used -> "
      f"cluster='{cluster_col}', title='{title_col}', text='{text_col}', rating='{rating_col}'\n")

# Iterate clusters: for each summary row print the top products, the worst
# product, the cluster size, representative TF-IDF terms from a sample of its
# reviews, and a handful of example reviews.
for _, row in sum_df.iterrows():
    cid = row["cluster"]
    cname = row.get("cluster_name", f"Cluster {cid}")
    top_products = pd.DataFrame(row.get("top_products", []))
    # May be None when the summary has no worst product; handled by `if worst:` below
    worst = row.get("worst_product", {})

    print("="*110)
    print(f"CLUSTER {cid} — {cname}")
    print("-"*110)

    # --- Show top products table (trimmed for readability) ---
    if not top_products.empty:
        tmp = top_products.copy()
        tmp["title"] = tmp["title"].apply(lambda s: trunc(s, 110))
        # Show at most 8 complaint terms per product as one comma-separated string
        tmp["top_complaints"] = tmp["top_complaints"].apply(lambda xs: ", ".join(xs[:8]) if isinstance(xs, list) else "")
        # Optional stat columns are only shown when present in the summary
        cols = ["asin", "title"]
        if "n_reviews" in tmp.columns: cols.append("n_reviews")
        if "avg_rating" in tmp.columns: cols.append("avg_rating")
        cols.append("top_complaints")
        display(tmp[cols].head(3))
    else:
        print("No top_products found in summary.")

    # --- Worst product block ---
    if worst:
        print("\nWorst product:")
        print(f"  • {trunc(worst.get('title', ''))}")
        print(f"    asin={worst.get('asin','?')}, "
              f"n={worst.get('n_reviews','?')}, "
              f"avg={worst.get('avg_rating','?')}")
        if worst.get("top_complaints"):
            print("    complaints:", ", ".join(worst["top_complaints"][:10]))
    else:
        print("\nWorst product: (not available / not enough data)")

    # --- Keyword sanity check from reviews in this cluster ---
    c_mask = (corpus[cluster_col] == cid)
    n_in_cluster = int(c_mask.sum())
    print(f"\nReviews in this cluster: {n_in_cluster}")

    if n_in_cluster > 0:
        # Sample at most 20,000 reviews (keeps it fast even for big clusters);
        # fixed random_state makes the sample repeatable
        sample_n = min(20000, n_in_cluster)
        sample_df = corpus.loc[c_mask, [title_col, text_col]].sample(sample_n, random_state=42)

        texts = sample_df[text_col].astype(str).tolist()
        # English stopwords, unigrams+bigrams. min_df is ~0.05% of the sample
        # but never below 3 documents; a ValueError raised by the vectorizer
        # is caught below and reported instead of aborting the loop.
        min_df = max(3, int(sample_n * 0.0005))  # ~0.05% of sample, but at least 3 docs
        vec = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=15000, min_df=min_df)
        try:
            X = vec.fit_transform(texts)
            terms = np.array(vec.get_feature_names_out())
            # Use mean TF-IDF per term to find "representative" terms; keep the top 20
            scores = np.asarray(X.mean(axis=0)).ravel()
            top_idx = scores.argsort()[::-1][:20]
            print("Top TF-IDF terms:", ", ".join(terms[top_idx]))
        except ValueError as e:
            print(f"(TF-IDF skipped: {e})")

        # Show a few review examples (truncated title + first 140 chars of the text)
        print("\nSample reviews:")
        ex = sample_df.head(5).copy()
        ex[title_col] = ex[title_col].apply(trunc)
        ex[text_col]  = ex[text_col].apply(lambda s: trunc(s, 140))
        display(ex.rename(columns={title_col: "title", text_col: "clean_text"}))
    else:
        print("(No reviews found for this cluster in the corpus.)")

    print("\n")


Loaded summaries: 4 clusters
Corpus shape: (4624615, 6), columns used -> cluster='cluster', title='title', text='clean_text', rating='rating'

CLUSTER 0 — Keyboards & Mice
--------------------------------------------------------------------------------------------------------------


Unnamed: 0,asin,title,n_reviews,avg_rating,top_complaints
0,B07FL2LSBH,Razer Gaming Mouse Bungee v2: Drag-Free Wired ...,339,4.8732,"like, cord, weight, money, actually"
1,B07NP6QQ9C,JUEYINGBAILI JYZZ RGB Gaming Mouse Pad - Large...,286,4.7902,"lights, mousepad, pad, item, led, rgb, work, l..."
2,B08FMNP771,"EDJO Mechanical Gaming Keyboard, 87 Keys Blue ...",158,4.7532,"keyboard, work, type, pc, like, really, nice, ..."



Worst product:
  • Wireless Gaming Keyboard and Mouse Combo Rainbow Backlight Quiet Ergonomic Mechanical Feeling A…
    asin=B08CD8G3BL, n=163, avg=2.6074
    complaints: keyboard, mouse, work, keys, product, use, don, like, charge, keyboard mouse

Reviews in this cluster: 913087
Top TF-IDF terms: great, mouse, keyboard, works, perfect, good, love, like, use, case, product, just, easy, switch, nice, really, price, works great, gaming, work

Sample reviews:


Unnamed: 0,title,clean_text
4071264,Razer BlackWidow TE Chroma v2 TKL Tenkeyless M...,"this is a really good keyboard, i recommend bu..."
2532836,"LTC Mosh Pit 16,000 DPI RGB Wireless Ambidextr...","The mouse is comfortable to hold, but the wire..."
990816,Sintron Arcade Game RGB/CGA/EGA/YUV to VGA HD ...,Works perfect.
3865266,CM Storm QuickFire XT - Full Size Mechanical G...,Great keyboard.
1738236,"AMALEN Wireless Vertical Mouse, 2.4G Ergonomic...",I have very small hands and this is the smalle...




CLUSTER 1 — Headsets & Audio
--------------------------------------------------------------------------------------------------------------


Unnamed: 0,asin,title,n_reviews,avg_rating,top_complaints
0,B01KQDL4D2,"6amLifestyle Headphone Headset Holder Hanger, ...",270,4.837,use
1,B09QKTBFTR,"Gaming Headset for PS5 PS4 PC, Gaming Headphon...",427,4.8244,"use, mic, work, headphones, headset, ear, does..."
2,B0BB78PHW2,Battery Head Strap for Quest 2 - YOGES 5000mAh...,202,4.7228,"headset, head, just, fit, wanted, doesn, refun..."



Worst product:
  • JAMSWALL Stereo Gaming Headset for Xbox one PS4-3.5mm Wired Over-Head Stereo Gaming Headset Hea…
    asin=B07CJXBFFG, n=160, avg=2.1938
    complaints: work, working, stopped, stopped working, xbox, mic, sound, hear, headset, headphones

Reviews in this cluster: 1185858
Top TF-IDF terms: good, great, works, product, headset, quality, excellent, sound, work, nice, price, like, just, works great, love, headphones, use, mic, buy, great product

Sample reviews:


Unnamed: 0,title,clean_text
3568280,"Beastron Wii AV Cable, 6 FT AV Composite Cable...",Great product super fast shipping
563477,Sentey GS-4441 Black Orbeat Gaming Headset wit...,Great deal for the discounted price
3022506,HyperX Cloud II Wireless - Gaming Headset for ...,I absolutely adore this headset. I bought my f...
137848,Nyko Power Adaptor for Wii,Came in handy
601904,Xbox One Wireless Controller [Without Bluetooth],Works great but doesn't have the headset jack




CLUSTER 2 — Games
--------------------------------------------------------------------------------------------------------------


Unnamed: 0,asin,title,n_reviews,avg_rating,top_complaints
0,B014R4ZZ0E,Uncharted 4: A Thief's End - PlayStation 4,176,4.892,"game, games, story, uncharted, just, end, comp..."
1,B007W8S2MG,Persona 4 Golden - PlayStation Vita,910,4.8615,"game, just, story, got, book, fun, good, boring"
2,B01GKHJPG6,"Persona 5 - PlayStation 4 ""Take Your Heart"" Pr...",283,4.8587,"case, hours, playing, play, bought, reviews, w..."



Worst product:
  • SimCity: Limited Edition
    asin=B007FTE2VW, n=3564, avg=1.4717
    complaints: game, play, ea, city, servers, server, simcity, just, buy, online

Reviews in this cluster: 1618488
Top TF-IDF terms: game, love, great, fun, games, play, like, good, great game, just, loves, awesome, really, time, son, loved, playing, best, played, graphics

Sample reviews:


Unnamed: 0,title,clean_text
3324766,Fallout 4 - Xbox One,awesome game.
2164096,Crimson Sea 2 - PlayStation 2,I'm a very picky gamer; I finish very few game...
2394742,Wii Nunchuk Controller - White,I bought this because I thought that it was a ...
1993395,Bendy and the Ink Machine (PS4) - PlayStation 4,This game isn’t that great. It’s too edgy and ...
2809645,Ghostbusters: The Video Game - Playstation 3,the game is great but sometimes it freezes




CLUSTER 3 — Controllers
--------------------------------------------------------------------------------------------------------------


Unnamed: 0,asin,title,n_reviews,avg_rating,top_complaints
0,B00L3LQ4B4,Super Smash Bros. Edition GameCube Controller,222,4.7703,"work, controller, button, just, buttons"
1,B0017KIBAI,Official Nintendo White Classic Gamecube Contr...,479,4.737,"controller, product, gamecube, smash, nintendo..."
2,B00O9GW8VK,Nintendo Super Smash Bros. Black Classic Gamec...,387,4.6925,"controller, smash, use, garbage, stopped, don,..."



Worst product:
  • Zamia Wireless Controller for Xbox One, Enhanced Gamepad 2.4GHZ Game Controller Compatible with…
    asin=B08GBXWGYR, n=212, avg=2.033
    complaints: controller, work, xbox, working, charge, product, money, didn, use, connect

Reviews in this cluster: 907182
Top TF-IDF terms: controller, works, great, work, xbox, just, controllers, like, use, good, switch, games, game, product, worked, play, does, bought, ps4, got

Sample reviews:


Unnamed: 0,title,clean_text
3233259,DS Lite Action Replay,Doesn’t even work. I get an error code
3472350,Xbox LIVE 1600 Microsoft Points - Xbox 360 Dig...,The exact amount I needed for the mass effect ...
2771640,Microsoft Xbox 360 Wired Controller for Window...,Works great. Plug and play. Use it for console...
3398877,Xbox 360 Charging Station with 2 Battery Packs,"I think it is a great product. It looks good, ..."
2303498,NBA 2K15 - PlayStation 3,"Although it is for PC, it has great graphics a..."






## First testing and summarizing the games cluster with GPT-4o-mini

In [None]:
# --- Test GPT-4o-mini summarization for one cluster ---

import os, json
from pathlib import Path
from dotenv import load_dotenv

# 1. Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Load API key from .env file in Drive
env_path = "/content/drive/MyDrive/secret_folder/.env"
load_dotenv(env_path)
# Explicit check rather than `assert`: assertions are removed when Python runs
# with -O, and a missing key should always stop the cell with a clear error.
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError("API key not loaded from .env!")

# 3. Paths to inputs (filtered summaries is best)
RUN_DIR   = Path("/content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924")
SUMM_JSON = RUN_DIR / "cluster_summaries_draft.jsonl"

# 4. Load the cluster summaries (JSON Lines, UTF-8; blank lines are skipped)
with open(SUMM_JSON, "r", encoding="utf-8") as f:
    summaries = [json.loads(line) for line in f if line.strip()]

# Pick cluster 2 (Games) as test; fail with a readable message instead of a
# bare StopIteration when that cluster is absent from the file
TEST_CLUSTER_ID = 2
test_cluster = next((s for s in summaries if s["cluster"] == TEST_CLUSTER_ID), None)
if test_cluster is None:
    raise KeyError(
        f"Cluster {TEST_CLUSTER_ID} not found in {SUMM_JSON.name}; "
        f"available clusters: {[s.get('cluster') for s in summaries]}"
    )
print("Loaded cluster:", test_cluster["cluster_name"])

# 5. Build a prompt from this structured summary
def build_prompt(cluster_summary):
    """Render one cluster summary as the plain-text facts block of the prompt.

    Reads ``cluster_name`` and ``top_products`` (both required) and the
    optional ``worst_product`` from ``cluster_summary``. Every product dict
    must provide ``title``, ``avg_rating`` and ``n_reviews``;
    ``top_complaints`` is optional.

    Returns the rendered sections joined by newlines.
    """
    def complaints_text(product, limit):
        # Keep at most `limit` complaint terms; an em dash stands for "none".
        terms = product.get('top_complaints', [])[:limit]
        return ', '.join(terms) or '—'

    name = cluster_summary["cluster_name"]
    best = cluster_summary["top_products"]
    worst = cluster_summary.get("worst_product")

    sections = [f"**{name}**\n", "Top 3 Products:"]

    for rank, product in enumerate(best, start=1):
        header = (
            f"{rank}. {product['title']} "
            f"(avg {product['avg_rating']:.2f}★, {product['n_reviews']} reviews)"
        )
        sections.append(f"{header}\n   Complaints: {complaints_text(product, 6)}")

    # The worst-product section is only emitted when one is present.
    if worst:
        header = (
            f"- {worst['title']} "
            f"(avg {worst['avg_rating']:.2f}★, {worst['n_reviews']} reviews)"
        )
        sections.append(
            f"\nWorst Product:\n{header}\n  Complaints: {complaints_text(worst, 8)}"
        )

    return "\n".join(sections)

# Full user prompt: role line, the structured facts for the test cluster
# (rendered by build_prompt), then the writing instructions and constraints.
prompt = f"""
You are a product reviewer writing a short blog-style article for shoppers.

{build_prompt(test_cluster)}

Write ~200 words with clear sections:
- Intro to this category
- Compare the top 3 products (strengths, differences, who they’re for)
- Summarize key complaints buyers should consider
- Explain the worst product and why to avoid it

Constraints:
- Neutral and factual tone
- Markdown formatting with headings
- Avoid exaggeration or making up numbers
"""

# 6. Call OpenAI
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

resp = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0.4,
    messages=[
        {"role": "system", "content": "You are a careful, neutral product reviewer."},
        {"role": "user", "content": prompt}
    ]
)

# message.content is nullable in the API response; fall back to "" so that
# .strip() cannot fail with AttributeError when no text was returned.
article = (resp.choices[0].message.content or "").strip()
print("\n=== Generated Blog Article ===\n")
print(article)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded cluster: Games

=== Generated Blog Article ===

# Game Reviews: Top Picks and a Cautionary Tale

The gaming world offers a plethora of titles across various platforms, catering to diverse tastes and preferences. In this article, we will explore the top three games currently available, along with a notable outlier that shoppers should approach with caution.

## Top 3 Games

### 1. Uncharted 4: A Thief's End - PlayStation 4
With an impressive average rating of 4.89 stars from 176 reviews, "Uncharted 4" is celebrated for its engaging story and stunning visuals. It is ideal for players who enjoy action-adventure games with rich narratives.

### 2. Persona 4 Golden - PlayStation Vita
"Persona 4 Golden" boasts a strong average rating of 4.86 stars from 910 reviews. This title is known for its deep character development and engaging gameplay, making it perfec

After some tweaking of the prompt, the results look pretty good.

# Now checking for all clusters with GPT-4o-mini

In [None]:
import os, json, time, re
from pathlib import Path
from textwrap import dedent
from google.colab import drive
from dotenv import load_dotenv
from openai import OpenAI

# 1) Mount Drive
drive.mount("/content/drive")

# 2) Load your API key from the .env in Drive
env_path = "/content/drive/MyDrive/secret_folder/.env"
load_dotenv(env_path)
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not loaded – check your .env path/content."

# 3) Paths (edit RUN_DIR to match your run)
RUN_DIR   = Path("/content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924")
SUMM_JSON = RUN_DIR / "cluster_summaries_draft.jsonl"

# Outputs
OUT_DIR_MD   = RUN_DIR / "articles_gpt4omini_md"
OUT_DIR_MD.mkdir(parents=True, exist_ok=True)
OUT_JSONL    = RUN_DIR / "articles_gpt4omini.jsonl"   # all articles in one file (machine-friendly)

# 4) Load all cluster summaries (already filtered), one JSON object per line.
#    Read explicitly as UTF-8 so decoding does not depend on the platform
#    default, and skip blank lines, which json.loads() would reject.
summaries = []
with open(SUMM_JSON, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            summaries.append(json.loads(line))

print(f"Loaded {len(summaries)} cluster summaries from {SUMM_JSON.name}")

# 5) Build prompt (same style used for the games test)
def build_prompt(cluster_summary: dict) -> str:
    """Build the user prompt for one cluster.

    The prompt is a Markdown facts block (cluster name, top products, optional
    worst product) followed by a blank line and the fixed writing instructions.

    Args:
        cluster_summary: dict with optional keys ``cluster_name`` (str),
            ``top_products`` (list of product dicts) and ``worst_product``
            (product dict). Product dicts may carry ``title``, ``avg_rating``,
            ``n_reviews`` and ``top_complaints``; missing keys fall back to
            empty/zero values.

    Returns:
        The complete prompt text.
    """
    cname         = cluster_summary.get("cluster_name", "")
    # `or` also covers an explicit None stored under the key.
    top_products  = cluster_summary.get("top_products") or []
    worst_product = cluster_summary.get("worst_product") or {}

    lines = [f"## {cname}"]

    lines.append("### Top 3 Products:")
    for i, p in enumerate(top_products, 1):
        # List first ~6 complaints if present; else '-'
        complaints = ", ".join(p.get("top_complaints", [])[:6]) or "-"
        lines.append(
            f"- **{i}. {p.get('title','')}** "
            f"(avg ★ {p.get('avg_rating', 0):.2f}, {p.get('n_reviews', 0)} reviews)\n"
            f"  - Common complaints: {complaints}"
        )

    if worst_product:
        c = ", ".join(worst_product.get("top_complaints", [])[:8]) or "-"
        lines.append("\n### Worst Product")
        lines.append(
            f"- **{worst_product.get('title','')}** "
            f"(avg ★ {worst_product.get('avg_rating', 0):.2f}, {worst_product.get('n_reviews', 0)} reviews)\n"
            f"  - Common complaints: {c}"
        )

    # The exact instruction block used before on games
    instruction = dedent("""
    You are a product reviewer writing a short blog-style article for shoppers.

    Write ~200 words with clear sections:
    - Intro to this category
    - Compare the top 3 products (strengths, differences, who they’re for)
    - Summarize key complaints buyers should consider
    - Explain the worst product and why to avoid it

    Constraints:
    - Neutral and factual tone
    - Markdown formatting with headings
    - Avoid exaggeration or making up numbers
    """).strip()

    # Join outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    facts = "\n".join(lines)
    return f"{facts}\n\n{instruction}"

# 6) OpenAI client
# The key is read from the process environment; os.environ[...] raises
# KeyError if OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# helper to make safe filenames
def slugify(text, max_len=60):
    """Turn ``text`` into a lowercase, hyphen-separated filename fragment.

    Characters other than ASCII letters, digits, hyphens and spaces are
    dropped; surrounding whitespace is stripped; each remaining space becomes
    a hyphen. The result is cut to ``max_len`` characters. When nothing is
    left after cleaning, the placeholder "cluster" is returned.
    """
    kept = re.sub(r"[^a-zA-Z0-9\- ]+", "", text)
    slug = kept.strip().lower().replace(" ", "-")
    if not slug:
        return "cluster"
    return slug[:max_len]

# 7) Generate for all clusters, save Markdown + JSONL
temperature = 0.4
model = "gpt-4o-mini"


def _to_json_safe(obj):
    """Recursively reduce ``obj`` to values that ``json.dumps`` accepts.

    Primitives pass through; dicts, lists and tuples are converted element by
    element; objects exposing ``model_dump()`` are dumped first; other objects
    with a ``__dict__`` are reduced to their public, non-callable attributes;
    anything else is stringified.
    """
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {k: _to_json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_to_json_safe(v) for v in obj]
    if hasattr(obj, "model_dump"):             # OpenAI SDK >= 1.0 (pydantic BaseModel)
        return _to_json_safe(obj.model_dump())
    if hasattr(obj, "__dict__"):               # last resort: shallow attrs
        return _to_json_safe({k: getattr(obj, k) for k in dir(obj) if not k.startswith("_") and not callable(getattr(obj, k))})
    return str(obj)  # absolute fallback


def _usage_to_dict(usage):
    """Return the response's token usage as a plain dict, or None if absent.

    The three scalar counters are always read via ``getattr``. When the usage
    object offers ``model_dump()`` the full dump is used (it may include
    ``*_details`` entries) with the non-None counters written on top.
    """
    if not usage:
        return None
    simple = {
        "prompt_tokens":     getattr(usage, "prompt_tokens", None),
        "completion_tokens": getattr(usage, "completion_tokens", None),
        "total_tokens":      getattr(usage, "total_tokens", None),
    }
    if not hasattr(usage, "model_dump"):
        return simple
    usage_dict = _to_json_safe(usage.model_dump())
    usage_dict.update({k: v for k, v in simple.items() if v is not None})
    return usage_dict


written = 0
with open(OUT_JSONL, "w", encoding="utf-8") as fout:
    for s in sorted(summaries, key=lambda x: x.get("cluster", 0)):
        cid   = s.get("cluster")
        cname = s.get("cluster_name", f"Cluster {cid}")

        prompt = build_prompt(s)

        # Call the model (same endpoint/settings used for Games)
        resp = client.chat.completions.create(
            model=model,
            temperature=temperature,
            messages=[
                {"role": "system", "content": "You are a careful, neutral product reviewer."},
                {"role": "user",   "content": prompt},
            ],
        )

        # message.content is nullable in the API response; "" keeps .strip() safe.
        article = (resp.choices[0].message.content or "").strip()

        # JSON-serializable usage info (the helpers are defined once, above the loop)
        usage_dict = _usage_to_dict(getattr(resp, "usage", None))
        total_tokens = usage_dict.get("total_tokens") if usage_dict else None

        # Save per-cluster markdown
        md_name = f"cluster_{cid:02d}_{slugify(cname)}.md"
        (OUT_DIR_MD / md_name).write_text(article, encoding="utf-8")

        # Save to JSONL as well
        rec = {
            "cluster": cid,
            "cluster_name": cname,
            "model": model,
            "temperature": temperature,
            "prompt": prompt,
            "article": article,
            "usage": usage_dict,  # <-- safe dict or None
        }
        fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

        written += 1
        print(f"✓ Wrote {md_name}  "
              f"{'(tokens: ' + str(total_tokens) + ')' if total_tokens is not None else ''}")

print(f"\nDone. {written} articles saved to:")
print(f" - Markdown folder: {OUT_DIR_MD}")
print(f" - JSONL file:      {OUT_JSONL}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded 4 cluster summaries from cluster_summaries_draft.jsonl
✓ Wrote cluster_00_keyboards--mice.md  (tokens: 798)
✓ Wrote cluster_01_headsets--audio.md  (tokens: 802)
✓ Wrote cluster_02_games.md  (tokens: 727)
✓ Wrote cluster_03_controllers.md  (tokens: 692)

Done. 4 articles saved to:
 - Markdown folder: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/articles_gpt4omini_md
 - JSONL file:      /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/articles_gpt4omini.jsonl


In [None]:
# checking results / Inspecting generated articles 
from pathlib import Path
import json

# Artifacts written by the generation cell above
ART_DIR = RUN_DIR / "articles_gpt4omini_md"
ALL_JSON = RUN_DIR / "articles_gpt4omini.jsonl"

PREVIEW_LINES = 30  # number of leading lines printed per article

# List the folder once and reuse the listing for both loops
md_files = sorted(ART_DIR.glob("*.md"))

print("Markdown files in:", ART_DIR)
for p in md_files:
    print(" -", p.name)

# Preview the first PREVIEW_LINES lines of each Markdown file
for p in md_files:
    print("\n\n=== Preview:", p.name, "===")
    txt = p.read_text(encoding="utf-8").splitlines()
    for line in txt[:PREVIEW_LINES]:
        print(line)

# Or read the JSONL if you want structured access
print("\n\n=== JSONL content (first record) ===")
with open(ALL_JSON, "r", encoding="utf-8") as f:
    first_line = next(f, None)   # None instead of StopIteration on an empty file
if first_line is None:
    print("(no records in " + ALL_JSON.name + ")")
else:
    first = json.loads(first_line)
    print(json.dumps(first, indent=2, ensure_ascii=False)[:800], "...")


Markdown files in: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/articles_gpt4omini_md
 - cluster_00_keyboards--mice.md
 - cluster_01_headsets--audio.md
 - cluster_02_games.md
 - cluster_03_controllers.md


=== Preview: cluster_00_keyboards--mice.md ===
# Keyboards & Mice: A Shopper's Guide

In the world of gaming and productivity, the right keyboard and mouse can significantly enhance your experience. With a plethora of options available, it's essential to choose products that align with your needs and preferences. Here, we compare three top-rated products and highlight a poorly rated option to avoid.

## Top 3 Products

### 1. Razer Gaming Mouse Bungee v2
With an average rating of 4.87 from 339 reviews, the Razer Gaming Mouse Bungee v2 is designed for esports-level performance. Its drag-free wired support helps prevent cable tangling, making it ideal for serious gamers who prioritize precision and speed.

### 2. JUEYINGBAILI JYZZ RGB Gaming Mouse Pad
This larg

## Now trying a transformer model from Hugging Face: facebook/bart-large-cnn

In [21]:
# --- Summarize clusters with a pretrained HF model (BART or FLAN-T5) ---

!pip -q install "transformers>=4.42.0" "accelerate>=0.31.0" "torch>=2.1.0" sentencepiece

import os, json, re, math
from pathlib import Path
from textwrap import dedent
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# -----------------------
# Paths (match run)
# -----------------------
# Run folder and the per-cluster summaries produced by the earlier steps
RUN_DIR = Path("/content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924")
SUMM_JSON = RUN_DIR.joinpath("cluster_summaries_draft.jsonl")

# Model under test in this cell (google/flan-t5-large gets its own cell later)
MODEL_NAME = "facebook/bart-large-cnn"

# Output locations are tagged by model family so runs do not overwrite each other
if "bart" in MODEL_NAME.lower():
    tag = "bart"
else:
    tag = "flant5"
OUT_DIR_MD = RUN_DIR / f"articles_{tag}_md"
OUT_DIR_MD.mkdir(parents=True, exist_ok=True)
OUT_JSONL = RUN_DIR / f"articles_{tag}.jsonl"

# -----------------------
# Load cluster summaries (JSONL: one JSON object per line)
# -----------------------
with open(SUMM_JSON, "r", encoding="utf-8") as f:
    summaries = [json.loads(line) for line in f]
print(f"Loaded {len(summaries)} cluster summaries from {SUMM_JSON.name}")

# -----------------------
# Prompt builder (same style)
# -----------------------
def build_prompt(cluster_summary: dict) -> str:
    """Build the text handed to the model for one cluster.

    The text is a Markdown facts block (cluster name, top products, optional
    worst product) followed by a blank line and the fixed writing instructions.

    Args:
        cluster_summary: dict with optional keys ``cluster_name`` (str),
            ``top_products`` (list of product dicts) and ``worst_product``
            (product dict). Product dicts may carry ``title``, ``avg_rating``,
            ``n_reviews`` and ``top_complaints``; missing keys fall back to
            empty/zero values.

    Returns:
        The complete prompt text.
    """
    cname         = cluster_summary.get("cluster_name", "")
    # `or` also covers an explicit None stored under the key.
    top_products  = cluster_summary.get("top_products") or []
    worst_product = cluster_summary.get("worst_product") or {}

    lines = [f"## {cname}", "### Top 3 Products:"]
    for i, p in enumerate(top_products, 1):
        # At most 6 complaint terms; '-' when there are none
        complaints = ", ".join(p.get("top_complaints", [])[:6]) or "-"
        lines.append(
            f"- **{i}. {p.get('title','')}** "
            f"(avg ★ {p.get('avg_rating', 0):.2f}, {p.get('n_reviews', 0)} reviews)\n"
            f"  - Common complaints: {complaints}"
        )

    if worst_product:
        c = ", ".join(worst_product.get("top_complaints", [])[:8]) or "-"
        lines.append("\n### Worst Product")
        lines.append(
            f"- **{worst_product.get('title','')}** "
            f"(avg ★ {worst_product.get('avg_rating', 0):.2f}, {worst_product.get('n_reviews', 0)} reviews)\n"
            f"  - Common complaints: {c}"
        )

    instruction = dedent("""
    You are a product reviewer writing a short blog-style article for shoppers.

    Write ~200 words with clear sections:
    - Intro to this category
    - Compare the top 3 products (strengths, differences, who they’re for)
    - Summarize key complaints buyers should consider
    - Explain the worst product and why to avoid it

    Constraints:
    - Neutral and factual tone
    - Markdown formatting with headings
    - Avoid exaggeration or making up numbers
    """).strip()

    # Join outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    facts = "\n".join(lines)
    return f"{facts}\n\n{instruction}"

# -----------------------
# HF pipeline setup (FIXED)
# -----------------------
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Half precision is used exactly when a CUDA device is available
use_cuda = torch.cuda.is_available()
fp16 = use_cuda

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# device_map="auto" is only requested on GPU; on CPU the model loads plainly
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=(torch.float16 if fp16 else None),
    device_map=("auto" if use_cuda else None),
)

# No `device=` argument is passed to pipeline(): placement is already decided
# by device_map above.
if "bart" not in MODEL_NAME.lower():
    generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
    GEN_KW = {"max_new_tokens": 280, "do_sample": False}
else:
    generator = pipeline("summarization", model=model, tokenizer=tokenizer)
    GEN_KW = {
        "max_length": 320,
        "min_length": 140,
        "no_repeat_ngram_size": 3,
        "do_sample": False,
    }

def slugify(text, max_len=60):
    """Build a filename-safe slug from ``text``.

    Only ASCII letters, digits, hyphens and spaces survive; the remainder is
    stripped, lower-cased and has each space replaced by a hyphen, then is
    truncated to ``max_len`` characters. An empty result yields "cluster".
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\- ]+", "", text).strip()
    slug = cleaned.lower().replace(" ", "-")
    if slug:
        return slug[:max_len]
    return "cluster"

# To avoid hitting model max input length, trim the prompt if needed
def trim_to_tokens(text, max_tokens=900):
    """Return ``text`` truncated to at most ``max_tokens`` tokenizer tokens.

    The text is encoded with the module-level ``tokenizer`` (special tokens
    added, truncation enabled at ``max_tokens``) and the kept token ids are
    decoded back to a string with special tokens skipped. The cut is therefore
    made on token boundaries, not on a character count.
    """
    encoded = tokenizer(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True,
        max_length=max_tokens,
    )
    kept_ids = encoded["input_ids"]
    return tokenizer.decode(kept_ids, skip_special_tokens=True)

def make_model_input(prompt: str) -> str:
    """Prepare ``prompt`` for the configured model and trim it to the input limit.

    For a BART model the prompt is passed through unchanged; for any other
    model a short summarization instruction is prepended. In both cases the
    text is trimmed to ``min(tokenizer.model_max_length, 1024)`` tokens.
    """
    text = prompt
    if "bart" not in MODEL_NAME.lower():
        # Instruction prefix for the instruction-tuned (FLAN-T5) path
        text = (
            "Summarize the following into ~200 words with Markdown headings:\n"
            "Sections: Intro, Compare top 3 products, Key complaints, Worst product.\n\n"
        ) + prompt
    token_budget = min(tokenizer.model_max_length, 1024)
    return trim_to_tokens(text, max_tokens=token_budget)

# -----------------------
# Generate & save
# -----------------------
# Run the pipeline once per cluster (ascending cluster id); each article is
# written to its own Markdown file and appended as one record to the JSONL.
written = 0
with open(OUT_JSONL, "w", encoding="utf-8") as fout:
    for s in sorted(summaries, key=lambda item: item.get("cluster", 0)):
        cid = s.get("cluster")
        cname = s.get("cluster_name", f"Cluster {cid}")

        prompt = build_prompt(s)
        model_input = make_model_input(prompt)
        outs = generator(model_input, **GEN_KW)

        # The result key depends on the pipeline task chosen for the model
        if "bart" in MODEL_NAME.lower():
            article = outs[0]["summary_text"]
        else:
            article = outs[0]["generated_text"]
        article = article.strip()

        # Prepend a heading when the model produced none
        if not ("## " in article):
            article = f"## {cname}\n\n" + article

        # Per-cluster Markdown file
        md_name = f"cluster_{cid:02d}_{slugify(cname)}.md"
        md_path = OUT_DIR_MD / md_name
        md_path.write_text(article, encoding="utf-8")

        # One JSONL record per cluster
        rec = dict(
            cluster=cid,
            cluster_name=cname,
            model=MODEL_NAME,
            params=GEN_KW,
            prompt=prompt,
            article=article,
        )
        fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

        written += 1
        print(f"✓ Wrote {md_name}")

print(f"\nDone. {written} articles saved to:")
print(f" - Markdown folder: {OUT_DIR_MD}")
print(f" - JSONL file:      {OUT_JSONL}")


Loaded 4 cluster summaries from cluster_summaries_draft.jsonl


Device set to use cuda:0


✓ Wrote cluster_00_keyboards--mice.md


Your max_length is set to 320, but your input_length is only 294. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=147)


✓ Wrote cluster_01_headsets--audio.md
✓ Wrote cluster_02_games.md
✓ Wrote cluster_03_controllers.md

Done. 4 articles saved to:
 - Markdown folder: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/articles_bart_md
 - JSONL file:      /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/articles_bart.jsonl


# Inspecting results from facebook/bart-large-cnn

In [None]:
# Inspect BART results
from pathlib import Path
import json

# Artifacts written by the BART generation cell
ART_DIR = RUN_DIR / "articles_bart_md"
ALL_JSON = RUN_DIR / "articles_bart.jsonl"

PREVIEW_LINES = 20  # number of leading lines printed per article

# List the folder once and reuse the listing for both loops
md_files = sorted(ART_DIR.glob("*.md"))

print("Markdown files in:", ART_DIR)
for p in md_files:
    print(" -", p.name)

# Preview the first PREVIEW_LINES lines of each Markdown file
for p in md_files:
    print("\n=== Preview:", p.name, "===")
    txt = p.read_text(encoding="utf-8").splitlines()
    for line in txt[:PREVIEW_LINES]:
        print(line)

# Or preview JSONL content (first record)
print("\n=== JSONL first record ===")
with open(ALL_JSON, "r", encoding="utf-8") as f:
    first_line = next(f, None)   # None instead of StopIteration on an empty file
if first_line is None:
    print("(no records in " + ALL_JSON.name + ")")
else:
    first = json.loads(first_line)
    print(json.dumps(first, indent=2, ensure_ascii=False)[:800], "...")


Markdown files in: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/articles_bart_md
 - cluster_00_keyboards--mice.md
 - cluster_01_headsets--audio.md
 - cluster_02_games.md
 - cluster_03_controllers.md

=== Preview: cluster_00_keyboards--mice.md ===
## Keyboards & Mice

You are a product reviewer writing a short blog-style article for shoppers. Write ~200 words with clear sections. Compare the top 3 products (strengths, differences, who they’re for) Summarize key complaints buyers should consider. Explain the worst product and why to avoid it. Use Markdown formatting with headings. Avoid exaggeration or making up numbers. Use the weekly Newsquiz to test your knowledge of stories you saw on this site. Back to the page you came from. Share your thoughts on our product reviews and suggestions for future coverage of this type of article. Share this article with your friends and family on Facebook, Twitter, and other social media sites.

=== Preview: cluster_01_headset

I cannot seem to get decent results with this transformer model; it mostly echoes the instructions back instead of writing an article.

## Now trying another transformer model from Hugging Face: google/flan-t5-large

In [25]:
# --- Summarize clusters with a pretrained HF model (FLAN-T5, structured context) ---

!pip -q install "transformers>=4.42.0" "accelerate>=0.31.0" "torch>=2.1.0" sentencepiece

import os, json, re
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# -----------------------
# Paths (match your run)
# -----------------------
# Run folder and the per-cluster summaries produced by the earlier steps
RUN_DIR = Path("/content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924")
SUMM_JSON = RUN_DIR.joinpath("cluster_summaries_draft.jsonl")

# Instruction-tuned model used in this cell
MODEL_NAME = "google/flan-t5-large"

# Output locations, tagged by model
tag = "flant5"
OUT_DIR_MD = RUN_DIR / f"articles_{tag}_md"
OUT_DIR_MD.mkdir(parents=True, exist_ok=True)
OUT_JSONL = RUN_DIR / f"articles_{tag}.jsonl"

# -----------------------
# Load cluster summaries (JSONL: one JSON object per line)
# -----------------------
with open(SUMM_JSON, "r", encoding="utf-8") as f:
    summaries = [json.loads(line) for line in f]
print(f"Loaded {len(summaries)} cluster summaries from {SUMM_JSON.name}")

# -----------------------
# Structured context builders
# -----------------------
def build_context(cluster_summary: dict) -> str:
    """Render a cluster summary as a compact ``key=value`` context block.

    Output layout: a ``CATEGORY:`` line, a ``TOP_3_PRODUCTS:`` header followed
    by one line per product (at most the first three), and, when a worst
    product is present, a ``WORST_PRODUCT:`` header with its line. Missing
    product fields fall back to empty/zero values.
    """
    def describe(product: dict, max_complaints: int) -> str:
        # Fields are rendered in the order they appear in the output line.
        title = product.get('title', '')
        rating = f"{product.get('avg_rating', 0):.2f}"
        count = product.get('n_reviews', 0)
        complaints = '; '.join(product.get('top_complaints', [])[:max_complaints])
        return (
            f"title={title}, "
            f"avg_rating={rating}, "
            f"n_reviews={count}, "
            f"complaints=[{complaints}]"
        )

    name = cluster_summary.get("cluster_name", "")
    best = cluster_summary.get("top_products", [])[:3]
    worst = cluster_summary.get("worst_product", {}) or {}

    out = [f"CATEGORY: {name}", "TOP_3_PRODUCTS:"]
    for rank, product in enumerate(best, start=1):
        out.append(f"- {rank}. " + describe(product, 6))

    if worst:
        out.append("WORST_PRODUCT:")
        out.append("- " + describe(worst, 8))

    return "\n".join(out)

def make_model_input(cluster_summary: dict) -> str:
    """Compose the model input: writing instruction, ``CONTEXT:`` marker, then the cluster context.

    The context part is produced by ``build_context(cluster_summary)``.
    """
    header = "".join([
        "Write ~200 words in Markdown with these headings: ",
        "Intro; Compare top 3 products; Key complaints; Worst product. ",
        "Be neutral and factual. Use only the context. Do not repeat instructions.\n\n",
        "CONTEXT:\n",
    ])
    return header + build_context(cluster_summary)

def slugify(text, max_len=60):
    """Convert ``text`` to a lowercase slug usable in a filename.

    Anything other than ASCII letters, digits, hyphens and spaces is removed;
    the rest is stripped, lower-cased, and spaces are replaced by hyphens. The
    slug is limited to ``max_len`` characters; "cluster" is returned when the
    cleaned text is empty.
    """
    stripped = re.sub(r"[^a-zA-Z0-9\- ]+", "", text).strip()
    if stripped == "":
        return "cluster"
    return stripped.lower().replace(" ", "-")[:max_len]

# -----------------------
# HF pipeline setup
# -----------------------
# Half precision is used exactly when a CUDA device is available
use_cuda = torch.cuda.is_available()
fp16 = use_cuda

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# device_map="auto" is only requested on GPU; on CPU the model loads plainly
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=(torch.float16 if fp16 else None),
    device_map=("auto" if use_cuda else None),
)

generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Deterministic beam-search decoding with a 3-gram repetition block
GEN_KW = {
    "max_new_tokens": 240,
    "do_sample": False,
    "num_beams": 4,
    "no_repeat_ngram_size": 3,
    "length_penalty": 1.0,
    "early_stopping": True,
}

# -----------------------
# Generate & save
# -----------------------
# Run the pipeline once per cluster (ascending cluster id); each article is
# written to its own Markdown file and appended as one record to the JSONL.
written = 0
with open(OUT_JSONL, "w", encoding="utf-8") as fout:
    for s in sorted(summaries, key=lambda item: item.get("cluster", 0)):
        cid = s.get("cluster")
        cname = s.get("cluster_name", f"Cluster {cid}")

        model_input = make_model_input(s)

        # Generation runs under torch.inference_mode()
        with torch.inference_mode():
            outs = generator(model_input, **GEN_KW)

        article = outs[0]["generated_text"].strip()

        # Prepend a heading when the model produced none
        if not ("## " in article):
            article = f"## {cname}\n\n" + article

        # Per-cluster Markdown file
        md_name = f"cluster_{cid:02d}_{slugify(cname)}.md"
        md_path = OUT_DIR_MD / md_name
        md_path.write_text(article, encoding="utf-8")

        # One JSONL record per cluster; the context is stored, not the full input
        rec = dict(
            cluster=cid,
            cluster_name=cname,
            model=MODEL_NAME,
            params=GEN_KW,
            context=build_context(s),
            article=article,
        )
        fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

        written += 1
        print(f"✓ Wrote {md_name}")

print(f"\nDone. {written} articles saved to:")
print(f" - Markdown folder: {OUT_DIR_MD}")
print(f" - JSONL file:      {OUT_JSONL}")


Loaded 4 cluster summaries from cluster_summaries_draft.jsonl


Device set to use cuda:0


✓ Wrote cluster_00_keyboards--mice.md
✓ Wrote cluster_01_headsets--audio.md
✓ Wrote cluster_02_games.md
✓ Wrote cluster_03_controllers.md

Done. 4 articles saved to:
 - Markdown folder: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/articles_flant5_md
 - JSONL file:      /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/articles_flant5.jsonl


In [26]:
# --- Inspect FLAN-T5 results ---
from pathlib import Path
import json

# Artifacts written by the FLAN-T5 generation cell
ART_DIR = RUN_DIR / "articles_flant5_md"
ALL_JSON = RUN_DIR / "articles_flant5.jsonl"

PREVIEW_LINES = 20  # number of leading lines printed per article

# List the folder once and reuse the listing for both loops
md_files = sorted(ART_DIR.glob("*.md"))

print("Markdown files in:", ART_DIR)
for p in md_files:
    print(" -", p.name)

# Preview the first PREVIEW_LINES lines of each Markdown file
for p in md_files:
    print("\n=== Preview:", p.name, "===")
    txt = p.read_text(encoding="utf-8").splitlines()
    for line in txt[:PREVIEW_LINES]:
        print(line)

# Or preview JSONL content (first record)
print("\n=== JSONL first record ===")
with open(ALL_JSON, "r", encoding="utf-8") as f:
    first_line = next(f, None)   # None instead of StopIteration on an empty file
if first_line is None:
    print("(no records in " + ALL_JSON.name + ")")
else:
    first = json.loads(first_line)
    print(json.dumps(first, indent=2, ensure_ascii=False)[:800], "...")


Markdown files in: /content/drive/MyDrive/Project_NLP/runs/clustering_full_20250828_1924/articles_flant5_md
 - cluster_00_keyboards--mice.md
 - cluster_01_headsets--audio.md
 - cluster_02_games.md
 - cluster_03_controllers.md

=== Preview: cluster_00_keyboards--mice.md ===
## Keyboards & Mice

The Wireless Gaming Keyboard and Mouse Combo Rainbow Backlight Quiet Ergonomic Mechanical Feeling Anti-ghosting Keyboard Mouse with Rechargeable 4000mAh Battery Mouse Pad for Computer Mac Gamer is the worst product.

=== Preview: cluster_01_headsets--audio.md ===
## Headsets & Audio

The JAMSWALL Stereo Gaming Headset for Xbox one PS4-3.5mm Wired over-head stereo gaming headset Headphone with Mic Microphone, Volume Control for PS4 PC Tablet Laptop Smartphone Xbox One (Black with red) is the worst product.

=== Preview: cluster_02_games.md ===
## Games

[game; just; city; servers; server; simcity; just]

=== Preview: cluster_03_controllers.md ===
## Controllers

The Zamia Wireless Controller for X

It writes about roughly the right things, but I cannot seem to get it to form coherent sentences.