In [1]:
from pathlib import Path

import pandas as pd

# --- Configuration ---------------------------------------------------------
# Single place to repoint this cell at another data directory.
DATA_DIR = Path(r"D:\Darryl\Coding\s_p\data\processed")
MESSAGES_FILE = "messages_with_clusters_v2.csv"
SUMMARY_FILE = "cluster_summary_v2.csv"

# Columns the aggregation reads; checked up front for a clear error message.
REQUIRED_COLUMNS = (
    "cluster_id",
    "message_id",
    "canonical_text",
    "text",
    "risk_label",
    "heuristic_score",
    "chat_id",
    "candidate_name_norm_simple",
)


def build_cluster_summary(messages: pd.DataFrame) -> pd.DataFrame:
    """Aggregate message-level rows into one summary row per ``cluster_id``.

    Parameters
    ----------
    messages : pd.DataFrame
        Must contain every column listed in ``REQUIRED_COLUMNS``.
        The frame is not modified.

    Returns
    -------
    pd.DataFrame
        One row per cluster with, in order: ``cluster_id``, ``n_messages``,
        ``canonical_template``, ``example_message``, ``high_msgs``,
        ``med_msgs``, ``low_msgs``, ``avg_score``, ``max_score``,
        ``n_groups``, ``n_candidates``, ``high_ratio``, ``med_ratio``,
        ``low_ratio``. Rows are sorted by ``high_ratio`` then ``n_messages``,
        both descending; the index is left as produced by the sort.

    Raises
    ------
    KeyError
        If any required column is absent.

    Notes
    -----
    * ``groupby`` is used with its defaults, so rows whose ``cluster_id`` is
      missing are left out of the summary.
    * ``n_messages`` is the number of rows in the cluster (``size``), not the
      number of non-null ``message_id`` values. The label counts below are
      taken over all rows, so using the same denominator keeps every ratio
      within [0, 1].
    """
    missing = [col for col in REQUIRED_COLUMNS if col not in messages.columns]
    if missing:
        raise KeyError(f"messages is missing required columns: {missing}")

    # Compare labels as strings; a missing label becomes "nan" and therefore
    # matches none of the three levels.
    labels = messages["risk_label"].astype(str)

    # Integer indicator columns let groupby use its built-in "sum" instead of
    # calling a Python lambda once per group. `assign` returns a new frame.
    flagged = messages.assign(
        _is_high=labels.eq("high").astype("int64"),
        _is_med=labels.eq("medium").astype("int64"),
        _is_low=labels.eq("low").astype("int64"),
    )

    summary = (
        flagged.groupby("cluster_id")
        .agg(
            n_messages=("message_id", "size"),
            canonical_template=("canonical_text", "first"),
            example_message=("text", "first"),
            high_msgs=("_is_high", "sum"),
            med_msgs=("_is_med", "sum"),
            low_msgs=("_is_low", "sum"),
            avg_score=("heuristic_score", "mean"),
            max_score=("heuristic_score", "max"),
            n_groups=("chat_id", "nunique"),
            n_candidates=("candidate_name_norm_simple", "nunique"),
        )
        .reset_index()
    )

    # Share of each risk level within the cluster.
    for level in ("high", "med", "low"):
        summary[f"{level}_ratio"] = summary[f"{level}_msgs"] / summary["n_messages"]

    # Most high-risk-dominated clusters first; larger clusters break ties.
    return summary.sort_values(
        ["high_ratio", "n_messages"], ascending=[False, False]
    )


# `__name__` is "__main__" inside a Jupyter kernel, so this runs as before when
# the cell is executed, but the file I/O is skipped if the code is imported.
if __name__ == "__main__":
    # Load clustered messages
    messages_path = DATA_DIR / MESSAGES_FILE
    df = pd.read_csv(messages_path)

    print(f"Loaded clustered messages: {len(df)}")

    # Ensure consistent string type for risk_label (kept on `df` itself so any
    # later use of `df` sees the same column as before).
    df["risk_label"] = df["risk_label"].astype(str)

    # === Cluster-level aggregation ===
    cluster_summary = build_cluster_summary(df)

    # Save
    out_path = DATA_DIR / SUMMARY_FILE
    cluster_summary.to_csv(out_path, index=False)

    print(f"✅ Saved cluster summary -> {out_path}")
    print(cluster_summary.head(10))


Loaded clustered messages: 2649
✅ Saved cluster summary -> D:\Darryl\Coding\s_p\data\processed\cluster_summary_v2.csv
     cluster_id  n_messages  \
452         646           2   
116         165           1   
182         269           1   
324         465           1   
326         476           1   
368         544           1   
385         563           1   
459         653           1   
462         661           1   
473         688           1   

                                    canonical_template  \
452  join as a premium member for more sureshot tra...   
116  few space left for account handling account ha...   
182  market closing nifty __num__ nifty opened marg...   
324  few space left for account handling account ha...   
326  few space left for account handling account ha...   
368  morning alert substantial tariff hike on india...   
385  stallion india fluorochemicals limited sifl le...   
459  maxhealth __num__ __num__ to __num__ __num__ a...   
462  auropharma __