In [None]:
import os
import pandas as pd
from collections import Counter, defaultdict

# ---------- Config ----------
# Everything a reader may need to tune lives here.

# Intended ceiling on the number of nodes handed to pyvis (rendering speed).
# NOTE(review): no visible code reads this value yet — confirm whether a cap
# is still wanted or the constant can go.
MAX_NODES_VIS = 1200

# Outputs: interactive HTML graph and the static PNG used as a fallback.
OUT_PNG = r"D:\Darryl\Coding\s_p\data\processed\clusters_network.png"
OUT_HTML = r"D:\Darryl\Coding\s_p\clusters_network.html"

# Input: message-level CSV that carries a cluster assignment per row.
INPUT = r"D:\Darryl\Coding\s_p\data\processed\messages_with_clusters_v2.csv"

# ---------- Load data ----------
# Prefer the configured file; when it is absent, retry with the older file
# name before giving up.
if not os.path.exists(INPUT):
    INPUT = r"D:\Darryl\Coding\s_p\data\processed\messages_with_clusters.csv"
if not os.path.exists(INPUT):
    raise FileNotFoundError("Cannot find cluster-level messages CSV. Check paths.")

print("Loading:", INPUT)
df = pd.read_csv(INPUT, low_memory=False)
print("Rows:", len(df))

# The graph needs these three columns; report the first one that is absent.
absent = [
    col
    for col in ("cluster_id", "candidate_name_norm_simple", "chat_id")
    if col not in df.columns
]
if absent:
    raise ValueError(f"Missing column {absent[0]} in {INPUT}")

# Node ids are "<prefix>:<value>" strings so the three node types can never
# collide with each other inside one graph.
df["cluster_id_str"] = df["cluster_id"].astype(str)
for node_col, prefix, source_col in (
    ("candidate_node", "cand:", "candidate_name_norm_simple"),
    ("cluster_node", "clu:", "cluster_id_str"),
    ("group_node", "grp:", "chat_id"),
):
    df[node_col] = prefix + df[source_col].astype(str)

# ---------- Build edge weights ----------
# Edge weights are message counts:
#   edge_cand_cluster[(candidate_node, cluster_node)] -> messages linking them
#   edge_group_cluster[(group_node, cluster_node)]    -> messages linking them
edge_cand_cluster = Counter()
edge_group_cluster = Counter()


def _new_stats():
    """Return an empty per-node accumulator (counts, raw scores, mean score)."""
    return {"messages": 0, "high": 0, "avg_score": 0.0, "scores": []}


candidate_stats = defaultdict(_new_stats)
group_stats = defaultdict(_new_stats)
cluster_stats = defaultdict(_new_stats)

# Clean the optional columns once, column-wise.
# Missing or non-numeric scores count as 0. A plain `value or 0` does not
# catch NaN (NaN is truthy), which would turn every affected avg_score into NaN.
if "heuristic_score" in df.columns:
    _scores = pd.to_numeric(df["heuristic_score"], errors="coerce").fillna(0.0)
else:
    _scores = pd.Series(0.0, index=df.index)

if "risk_label" in df.columns:
    _is_high = df["risk_label"].astype(str).str.lower().eq("high")
else:
    _is_high = pd.Series(False, index=df.index)

# Rows without a candidate name would otherwise all collapse into one
# artificial "cand:nan" node; they still count towards group and cluster stats.
_candidate_names = df["candidate_name_norm_simple"]
_has_candidate = _candidate_names.notna() & _candidate_names.astype(str).ne("nan")

for cand, clu, grp, score, is_high, has_cand in zip(
    df["candidate_node"],
    df["cluster_node"],
    df["group_node"],
    _scores.tolist(),        # .tolist() yields plain Python floats / bools
    _is_high.tolist(),
    _has_candidate.tolist(),
):
    edge_group_cluster[(grp, clu)] += 1
    touched = [group_stats[grp], cluster_stats[clu]]
    if has_cand:
        edge_cand_cluster[(cand, clu)] += 1
        touched.append(candidate_stats[cand])

    # Same bookkeeping for every node this message touches.
    for stats in touched:
        stats["messages"] += 1
        stats["scores"].append(score)
        if is_high:
            stats["high"] += 1

# finalize averages
for stats_by_node in (candidate_stats, group_stats, cluster_stats):
    for stats in stats_by_node.values():
        node_scores = stats["scores"]
        stats["avg_score"] = sum(node_scores) / len(node_scores) if node_scores else 0

# ---------- Build node/edge sets ----------
# Keep only the 200 busiest nodes of each type (ranked by message count).
top_candidates, top_clusters, top_groups = (
    sorted(stats_map.items(), key=lambda kv: kv[1]["messages"], reverse=True)[:200]
    for stats_map in (candidate_stats, cluster_stats, group_stats)
)

top_candidate_set = {pair[0] for pair in top_candidates}
top_cluster_set = {pair[0] for pair in top_clusters}
top_group_set = {pair[0] for pair in top_groups}

# nodes keeps insertion order (candidates, clusters, groups);
# node_info maps node id -> (node type, stats dict).
nodes, node_info = [], {}
for kind, ranked in (
    ("candidate", top_candidates),
    ("cluster", top_clusters),
    ("group", top_groups),
):
    for node_id, node_stats in ranked:
        nodes.append(node_id)
        node_info[node_id] = (kind, node_stats)

# An edge survives only when both of its endpoints made the cut above.
edges = [
    (src, dst, weight)
    for counter in (edge_cand_cluster, edge_group_cluster)
    for (src, dst), weight in counter.items()
    if src in node_info and dst in node_info
]

print("Nodes:", len(nodes), "Edges:", len(edges))

# ---------- Interactive PyVis or fallback ----------
# Renders `nodes` / `edges` either as an interactive pyvis page written to
# OUT_HTML or, when pyvis cannot be imported, as a static PNG at OUT_PNG.
try:
    from pyvis.network import Network
    use_pyvis = True
except Exception as exc:
    # Best-effort: any import-time failure routes to the static fallback, but
    # report the cause so a broken install is not mistaken for a missing one.
    print("pyvis import failed:", repr(exc))
    use_pyvis = False

# One colour per node type, shared by both renderers.
color_map = {"candidate":"#1f77b4", "cluster":"#ff7f0e", "group":"#2ca02c"}

if use_pyvis:
    print("Building interactive pyvis graph...")
    net = Network(height="1200px", width="100%", notebook=True, cdn_resources='in_line')
    net.barnes_hut()
    net.show_buttons(filter_=['physics'])

    # Node styling is passed per node through add_node() rather than through
    # net.set_options(): set_options() overrides the network's whole options
    # object, which would discard the barnes_hut() physics and the
    # show_buttons() configuration set just above.
    for nid in nodes:
        ntype, stats = node_info[nid]
        size = 10 + min(80, stats["messages"])  # node size saturates at 90
        title = f"{nid}<br>type={ntype}<br>msgs={stats['messages']}<br>highs={stats.get('high',0)}<br>avg_score={stats.get('avg_score',0):.1f}"
        label = nid.split(":",1)[1]  # drop the "cand:" / "clu:" / "grp:" prefix
        net.add_node(nid, label=label, title=title,
                     color=color_map.get(ntype,"#888888"), size=size,
                     borderWidth=1, borderWidthSelected=2, font={"size": 12})

    for a,b,w in edges:
        # Clamp both scale hints so a few very heavy edges do not dominate.
        net.add_edge(a, b, value=max(1, min(10, w)),
                     title=f"messages={w}", width=min(6, 1+int(w/2)))

    # Write the page ourselves with an explicit UTF-8 encoding so non-ASCII
    # labels do not depend on the platform's default encoding.
    with open(OUT_HTML, "w", encoding="utf-8") as f:
        f.write(net.generate_html())

    print("Interactive graph saved to:", OUT_HTML)


else:
    print("pyvis not installed — building static network image via networkx + matplotlib")
    # Imported lazily: only the fallback path needs these packages.
    import networkx as nx
    import matplotlib.pyplot as plt

    G = nx.Graph()
    for nid in nodes:
        ntype, stats = node_info[nid]
        G.add_node(nid, ntype=ntype, msgs=stats["messages"])

    for a,b,w in edges:
        G.add_edge(a,b, weight=w)

    plt.figure(figsize=(16,12))
    # Fixed seed keeps the layout reproducible between runs.
    pos = nx.spring_layout(G, k=0.15, iterations=50, seed=42)

    # Anything that is not a candidate or a cluster is drawn in the group colour.
    node_colors = [color_map.get(node_info[n][0], "#2ca02c") for n in G.nodes()]
    sizes = [50 + node_info[n][1]["messages"] for n in G.nodes()]

    nx.draw_networkx_nodes(G, pos, node_size=sizes, node_color=node_colors, alpha=0.9)
    nx.draw_networkx_edges(G, pos, alpha=0.6)
    labels = {n:n.split(":",1)[1] for n in G.nodes()}
    nx.draw_networkx_labels(G, pos, labels, font_size=8)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(OUT_PNG, dpi=200)
    print("Static graph saved to:", OUT_PNG)


Loading: D:\Darryl\Coding\s_p\data\processed\messages_with_clusters_v2.csv
Rows: 2649
Nodes: 252 Edges: 512
Building interactive pyvis graph...
Interactive graph saved to: D:\Darryl\Coding\s_p\data\processed\clusters_network.html


In [None]:
import pandas as pd
from IPython.display import HTML, IFrame, display
from pyvis.network import Network

# Load data
clusters = pd.read_csv(r"D:\Darryl\Coding\s_p\data\processed\cluster_summary_v2.csv")
messages = pd.read_csv(r"D:\Darryl\Coding\s_p\data\processed\messages_with_clusters_v2.csv")


def _cluster_key(cluster_id):
    """Return the text form of a cluster id, rendering 3 and 3.0 both as "3".

    The two CSVs can load the same id with different dtypes (int in one,
    float in the other); normalising here keeps their node ids identical.
    """
    if isinstance(cluster_id, float) and cluster_id.is_integer():
        cluster_id = int(cluster_id)
    return str(cluster_id)


# Create network. Resources are inlined so the HTML written below is
# self-contained.
net = Network(height="700px", width="100%", notebook=True, directed=False,
              cdn_resources="in_line")

# Add cluster nodes.
# Columns are read directly instead of through iterrows(): iterrows() builds
# one Series per row and does not preserve per-column dtypes, so an integer
# cluster_id next to a float column would come back as 3.0.
known_nodes = set()
for cluster_id, n_messages, high_ratio in zip(
    clusters["cluster_id"].tolist(),
    clusters["n_messages"].tolist(),
    clusters["high_ratio"].tolist(),
):
    key = _cluster_key(cluster_id)
    label = f"Cluster {key}\nMsgs: {n_messages}\nHigh: {high_ratio:.2f}"
    net.add_node(f"cluster_{key}", label=label, color="orange", shape="dot",
                 size=10 + n_messages / 5)
    known_nodes.add(f"cluster_{key}")

# Add candidate nodes + edges.
# Edges carry no weight here, so one edge per distinct (cluster, candidate)
# pair is enough; rows lacking either value cannot form an edge.
pairs = messages.loc[
    messages["cluster_id"].notna() & messages["candidate_name_norm_simple"].notna(),
    ["cluster_id", "candidate_name_norm_simple"],
].drop_duplicates()

for cluster_id, candidate in zip(
    pairs["cluster_id"].tolist(),
    pairs["candidate_name_norm_simple"].tolist(),
):
    if candidate == "nan":
        continue
    key = _cluster_key(cluster_id)
    cluster_node = f"cluster_{key}"
    if cluster_node not in known_nodes:
        # Cluster appears in the messages file but not in the summary file;
        # give it a plain node so the edge below has both endpoints.
        net.add_node(cluster_node, label=f"Cluster {key}", color="orange", shape="dot")
        known_nodes.add(cluster_node)
    net.add_node(candidate, label=candidate, color="lightblue", shape="ellipse")
    net.add_edge(cluster_node, candidate, color="gray")

# Show in notebook.
# Write the page with an explicit UTF-8 encoding so non-ASCII candidate names
# do not depend on the platform's default encoding, then embed the file.
# HTML("cluster_network.html") would render that literal string, not the file.
with open("cluster_network.html", "w", encoding="utf-8") as f:
    f.write(net.generate_html())
display(IFrame("cluster_network.html", width="100%", height="720px"))
