In [1]:
import os
import re
import pandas as pd
from tqdm import tqdm

# -----------------------
# File paths
# -----------------------
messages_path = r"D:\Darryl\Coding\s_p\data\processed\messages_with_risk_v4.csv"
out_messages_clusters = r"D:\Darryl\Coding\s_p\data\processed\messages_with_clusters_v2.csv"
out_cluster_summary = r"D:\Darryl\Coding\s_p\data\processed\cluster_templates_v2.csv"
out_cluster_examples = r"D:\Darryl\Coding\s_p\data\processed\cluster_examples_v2"

# -----------------------
# Load messages
# -----------------------
df = pd.read_csv(messages_path, low_memory=False)
print(f"Loaded messages: {len(df)}")

# Use text column
TEXT_COL = "text_for_model"
if TEXT_COL not in df.columns:
    raise ValueError(f"{TEXT_COL} not in dataset")

# Filter very short messages
df = df[df[TEXT_COL].fillna("").str.len() > 20].copy()
print(f"After removing very-short messages: {len(df)}")

# -----------------------
# Embeddings
# -----------------------
from sentence_transformers import SentenceTransformer, util
model_name = "all-MiniLM-L6-v2"
print(f"Loading embedding model: {model_name}")
embedder = SentenceTransformer(model_name)

embeddings = embedder.encode(df[TEXT_COL].tolist(), batch_size=64, show_progress_bar=True, convert_to_tensor=True)
print("Embeddings shape:", embeddings.shape)

# -----------------------
# Template canonicalization function
# -----------------------
def canonicalize(text: str) -> str:
    """Normalize message by stripping numbers, dates, links."""
    text = re.sub(r"http\S+|www\S+", " __URL__ ", text)
    text = re.sub(r"\d{1,4}([.,:/-]\d{1,4})*", " __NUM__ ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()

df["canonical_text"] = df[TEXT_COL].apply(lambda x: canonicalize(str(x)))

# -----------------------
# Clustering with similarity threshold
# -----------------------
clusters = []
visited = set()

cosine_threshold = 0.85  # tweakable, higher = stricter grouping
emb_np = embeddings.cpu()

for i in tqdm(range(len(df)), desc="Clustering"):
    if i in visited:
        continue
    sim_scores = util.cos_sim(emb_np[i], emb_np)[0]
    cluster_idx = (sim_scores >= cosine_threshold).nonzero().flatten().tolist()
    for j in cluster_idx:
        visited.add(j)
    clusters.append(cluster_idx)

# Assign cluster IDs
cluster_map = {}
for cid, idxs in enumerate(clusters):
    for j in idxs:
        cluster_map[j] = cid
df["cluster_id"] = df.index.map(lambda i: cluster_map.get(i, -1))

# -----------------------
# Cluster summaries
# -----------------------
summaries = []
for cid, group in df.groupby("cluster_id"):
    top_msg = group.iloc[0]
    summaries.append({
        "cluster_id": cid,
        "n_messages": len(group),
        "example_message": top_msg[TEXT_COL],
        "canonical_template": canonicalize(top_msg[TEXT_COL]),
        "high_msgs": (group["risk_label"]=="high").sum(),
        "med_msgs": (group["risk_label"]=="medium").sum(),
        "low_msgs": (group["risk_label"]=="low").sum(),
        "avg_score": group["heuristic_score"].mean(),
        "max_score": group["heuristic_score"].max(),
    })
summary_df = pd.DataFrame(summaries).sort_values("n_messages", ascending=False)

# -----------------------
# Save outputs
# -----------------------
df.to_csv(out_messages_clusters, index=False)
summary_df.to_csv(out_cluster_summary, index=False)

os.makedirs(out_cluster_examples, exist_ok=True)
for cid, group in df.groupby("cluster_id"):
    group.head(20).to_csv(os.path.join(out_cluster_examples, f"cluster_{cid}.csv"), index=False)

print(f"✅ Saved message-level clusters -> {out_messages_clusters}")
print(f"✅ Saved cluster summary -> {out_cluster_summary}")
print(f"✅ Saved per-cluster examples -> {out_cluster_examples}")

# -----------------------
# Show high-risk clusters
# -----------------------
print("\nTop clusters with high-risk dominance:")
print(summary_df.sort_values("high_msgs", ascending=False).head(10))


Loaded messages: 4098
After removing very-short messages: 2649
Loading embedding model: all-MiniLM-L6-v2


Batches:   0%|          | 0/42 [00:00<?, ?it/s]

Embeddings shape: torch.Size([2649, 384])


Clustering: 100%|████████████████████████████████████████████████████████████████| 2649/2649 [00:00<00:00, 6462.80it/s]


✅ Saved message-level clusters -> D:\Darryl\Coding\s_p\data\processed\messages_with_clusters_v2.csv
✅ Saved cluster summary -> D:\Darryl\Coding\s_p\data\processed\cluster_templates_v2.csv
✅ Saved per-cluster examples -> D:\Darryl\Coding\s_p\data\processed\cluster_examples_v2

Top clusters with high-risk dominance:
      cluster_id  n_messages  \
0             -1        1053   
1005        1529          20   
315          441           7   
452          646           2   
336          493           5   
596          971           1   
593          968           1   
592          967           1   
590          965           1   
736         1123           1   

                                        example_message  \
0                      low risk setup spic 112 5 to 118   
1005  जय श र श य म 𝗚𝗢𝗢𝗗 𝗠𝗢𝗥𝗡𝗜𝗡𝗚 𝗧𝗥𝗔𝗗𝗘𝗥𝗦 𝗨𝗡𝗠𝗨𝗧𝗘 𝗢𝗨𝗥 𝗖...   
315                          #stocktowatch by cnbc tv18   
452   join as a premium member for more sureshot tra...   
336                              pai