In [1]:
from __future__ import annotations

import json
import numpy as np
import pandas as pd
from pathlib import Path

RESULTS_ROOT = Path("../results/gmm_stability")
RUNS_CSV = RESULTS_ROOT / "runs.csv"

# ---------- utilities ----------
def load_importance_long(run_id: str) -> pd.DataFrame:
    """
    Read the `importance_long.csv` table stored in the directory of run `run_id`
    (RESULTS_ROOT / "run_<run_id>") and return it as a DataFrame.
    """
    csv_path = RESULTS_ROOT / f"run_{run_id}" / "importance_long.csv"
    return pd.read_csv(csv_path)

def load_summary(run_id: str) -> pd.DataFrame:
    """
    Read the `summary_clusters.csv` table stored in the directory of run `run_id`
    (RESULTS_ROOT / "run_<run_id>") and return it as a DataFrame.
    """
    csv_path = RESULTS_ROOT / f"run_{run_id}" / "summary_clusters.csv"
    return pd.read_csv(csv_path)

def build_signatures(run_id: str, feature_universe: list[str] | None = None) -> pd.DataFrame:
    """
    Build the per-cluster feature-importance signatures of one run.

    Parameters:
      run_id: run identifier, forwarded to load_importance_long.
      feature_universe: optional ordered list of feature names. When given, the
        result has exactly these columns in this order; features the run does
        not report are filled with 0.0 so signatures from different runs align.

    Returns a wide matrix:
      index: cluster
      columns: features
      values: L1-normalized importance (each row's absolute values sum to 1;
              an all-zero row stays all-zero)
    An empty DataFrame is returned when the run has no importance rows.
    """
    imp = load_importance_long(run_id)
    if imp.empty:
        return pd.DataFrame()

    # pivot to wide; duplicate (cluster, feature) rows are averaged
    wide = imp.pivot_table(index="cluster", columns="feature", values="importance", aggfunc="mean").fillna(0.0)

    if feature_universe is not None:
        # ensure same columns (and column order) across runs in a single step
        wide = wide.reindex(columns=feature_universe, fill_value=0.0)

    # L1 normalize per cluster. The norm is the sum of ABSOLUTE values: dividing
    # by the signed sum would flip or inflate a signature whenever negative
    # importances are present. For non-negative data both are identical.
    l1 = wide.abs().sum(axis=1).replace(0.0, np.nan)
    wide = wide.div(l1, axis=0).fillna(0.0)
    return wide

def cosine_sim_matrix(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """
    Pairwise cosine similarity between the rows of A and the rows of B.

    Entry [i, j] of the result compares row i of A with row j of B. A small
    constant (1e-12) is added to every row norm so an all-zero row yields
    similarity 0 instead of a division by zero.
    """
    def _unit_rows(M: np.ndarray) -> np.ndarray:
        # scale each row (vector) to (approximately) unit Euclidean length
        lengths = np.linalg.norm(M, axis=1, keepdims=True)
        return M / (lengths + 1e-12)

    unit_a = _unit_rows(A)
    unit_b = _unit_rows(B)
    return unit_a @ unit_b.T

def match_clusters_by_importance(sig_a: pd.DataFrame, sig_b: pd.DataFrame) -> pd.DataFrame:
    """
    Match clusters between two runs by maximizing cosine similarity between signatures.

    Parameters:
      sig_a, sig_b: signature matrices (index: cluster, columns: features) that
        share the same column layout. Cluster labels must be convertible to int.

    Returns DataFrame: cluster_A, cluster_B, similarity
    sorted by similarity (descending). Each cluster of either run appears at
    most once, so the result has min(#clusters A, #clusters B) rows. An empty
    frame with the same columns is returned when either input is empty.

    The optimal one-to-one assignment is used when scipy is importable;
    otherwise a row-by-row greedy assignment is used instead.
    """
    out_columns = ["cluster_A", "cluster_B", "similarity"]
    if sig_a.empty or sig_b.empty:
        return pd.DataFrame(columns=out_columns)

    A = sig_a.sort_index()
    B = sig_b.sort_index()

    S = cosine_sim_matrix(A.to_numpy(), B.to_numpy())  # similarity matrix

    # Only a missing scipy should trigger the fallback; genuine errors raised by
    # the assignment itself must surface instead of silently degrading results.
    try:
        from scipy.optimize import linear_sum_assignment  # type: ignore
    except ImportError:
        linear_sum_assignment = None

    if linear_sum_assignment is not None:
        # Hungarian solves min cost -> use cost = 1 - similarity
        rows, cols = linear_sum_assignment(1.0 - S)
    else:
        # greedy fallback: each A cluster takes its best still-unassigned B cluster
        rows, cols = [], []
        free = np.ones(S.shape[1], dtype=bool)
        for i in range(S.shape[0]):
            if not free.any():
                # every B cluster is taken; remaining A clusters stay unmatched
                # (without this guard argmax over all -inf would re-use column 0)
                break
            j = int(np.argmax(np.where(free, S[i], -np.inf)))
            free[j] = False
            rows.append(i)
            cols.append(j)

    pairs = [(int(A.index[i]), int(B.index[j]), float(S[i, j])) for i, j in zip(rows, cols)]
    return pd.DataFrame(pairs, columns=out_columns).sort_values("similarity", ascending=False)

def top_feature(sig: pd.DataFrame, cl: int) -> str | None:
    """
    Name of the feature with the largest value in the signature row of cluster `cl`.

    Returns None when `cl` is not a row label of `sig`, or when no value in that
    row is strictly positive by the `<= 0` test (i.e. there is no meaningful top feature).
    """
    if cl in sig.index:
        row = sig.loc[cl]
        has_signal = not (row.values <= 0).all()
        if has_signal:
            return str(row.idxmax())
    return None

def topk_set(sig: pd.DataFrame, cl: int, k: int = 5) -> set[str]:
    """
    Set of the (up to) `k` highest-valued features in the signature row of
    cluster `cl`, considering strictly positive values only.

    Returns an empty set when `cl` is not a row label of `sig` or when the row
    has no positive value.
    """
    if cl not in sig.index:
        return set()
    ranked = sig.loc[cl].sort_values(ascending=False)
    positive = ranked[ranked > 0]
    return {str(name) for name in positive.head(k).index}

def weighted_mean_similarity(mapping: pd.DataFrame, sum_a: pd.DataFrame) -> float:
    """
    Mean of the matched-pair similarities, weighted by the size (`n`) of the
    run-A cluster of each pair.

    `mapping` needs columns cluster_A and similarity; `sum_a` needs columns
    cluster and n. Clusters missing from `sum_a` get weight 0. When all weights
    are 0 the plain (unweighted) mean is returned; when either frame is empty
    the result is NaN.
    """
    if mapping.empty or sum_a.empty:
        return np.nan

    size_by_cluster = sum_a.set_index("cluster")["n"].to_dict()
    weights = mapping["cluster_A"].map(size_by_cluster).fillna(0.0)
    similarities = mapping["similarity"]

    total_weight = weights.sum()
    if total_weight == 0:
        # no usable sizes -> fall back to the unweighted mean
        return float(similarities.mean())
    return float((similarities * weights).sum() / total_weight)

# ---------- main: compare all pairs ----------
# Compare every unordered pair of runs listed in RUNS_CSV by matching their
# clusters on importance signatures, then save one row of agreement metrics per
# pair and show the ten pairs with the highest size-weighted similarity.
runs_df = pd.read_csv(RUNS_CSV)
run_ids = runs_df["run_id"].tolist()

# Build a common feature universe across all runs (ensures signatures align)
all_feats = set()
for rid in run_ids:
    imp = load_importance_long(rid)
    all_feats.update(imp["feature"].unique().tolist())
feature_universe = sorted(all_feats)

# Build each run's signature matrix once. A signature depends only on the run
# and the shared feature universe, so rebuilding it for every pair would just
# re-read and re-pivot the same CSV. The matrices are only read below.
signatures = {rid: build_signatures(rid, feature_universe=feature_universe) for rid in run_ids}

results = []
for i, ra in enumerate(run_ids):
    later_runs = run_ids[i + 1:]
    if not later_runs:
        # the last run is never the "A" side of a pair
        continue
    sig_a = signatures[ra]
    sum_a = load_summary(ra)  # cluster sizes of run A, shared by all its pairs

    for rb in later_runs:
        sig_b = signatures[rb]
        mapping = match_clusters_by_importance(sig_a, sig_b)

        # metrics
        mean_sim = float(mapping["similarity"].mean()) if not mapping.empty else np.nan
        wmean_sim = weighted_mean_similarity(mapping, sum_a)

        # top-1 agreement and top-5 Jaccard overlap on matched clusters
        top1_match = []
        top5_jacc = []
        for ca, cb in zip(mapping["cluster_A"], mapping["cluster_B"]):
            ca, cb = int(ca), int(cb)
            fa = top_feature(sig_a, ca)
            fb = top_feature(sig_b, cb)
            top1_match.append(1 if (fa is not None and fa == fb) else 0)

            A5 = topk_set(sig_a, ca, k=5)
            B5 = topk_set(sig_b, cb, k=5)
            # Jaccard is undefined when both sets are empty -> NaN, skipped by nanmean
            top5_jacc.append(len(A5 & B5) / len(A5 | B5) if (A5 or B5) else np.nan)

        results.append({
            "run_a": ra,
            "run_b": rb,
            "mean_cluster_sim": mean_sim,
            "weighted_mean_cluster_sim": wmean_sim,
            "top1_agreement_rate": float(np.nanmean(top1_match)) if len(top1_match) else np.nan,
            "mean_top5_jaccard": float(np.nanmean(top5_jacc)) if len(top5_jacc) else np.nan,
        })

res_df = pd.DataFrame(results)
out_csv = RESULTS_ROOT / "comparisons_by_importance_signatures.csv"
res_df.to_csv(out_csv, index=False)

print(f"Saved: {out_csv}")
display(res_df.sort_values("weighted_mean_cluster_sim", ascending=False).head(10))


Saved: ..\results\gmm_stability\comparisons_by_importance_signatures.csv


Unnamed: 0,run_a,run_b,mean_cluster_sim,weighted_mean_cluster_sim,top1_agreement_rate,mean_top5_jaccard
5,20251216_150031_phi0.40_lat200_t230_iso4.5_k5_...,20251216_151757_phi0.40_lat100_t211_iso4.5_k5_...,0.902453,0.931836,1.0,0.357143
4,20251216_150031_phi0.40_lat200_t230_iso4.5_k5_...,20251216_151517_phi0.40_lat050_t335_iso4.5_k5_...,0.755676,0.791328,0.8,0.354762
27,20251216_151757_phi0.40_lat100_t211_iso4.5_k5_...,20251216_152542_phi0.40_lat100_t140_iso4.5_k5_...,0.666143,0.768228,0.4,0.404762
6,20251216_150031_phi0.40_lat200_t230_iso4.5_k5_...,20251216_152542_phi0.40_lat100_t140_iso4.5_k5_...,0.666221,0.743761,0.4,0.285714
25,20251216_151517_phi0.40_lat050_t335_iso4.5_k5_...,20251216_151757_phi0.40_lat100_t211_iso4.5_k5_...,0.772498,0.699077,0.8,0.376984
8,20251216_150226_phi0.40_lat025_t335_iso4.5_k5_...,20251216_150607_phi0.40_lat025_t100_iso4.5_k5_...,0.535983,0.586234,0.6,0.088889
1,20251216_150031_phi0.40_lat200_t230_iso4.5_k5_...,20251216_150557_phi0.40_lat025_t238_iso4.5_k5_...,0.438925,0.573945,0.4,0.215873
11,20251216_150226_phi0.40_lat025_t335_iso4.5_k5_...,20251216_151757_phi0.40_lat100_t211_iso4.5_k5_...,0.468194,0.534468,0.4,0.207937
20,20251216_150607_phi0.40_lat025_t100_iso4.5_k5_...,20251216_151757_phi0.40_lat100_t211_iso4.5_k5_...,0.479036,0.515167,0.4,0.233333
17,20251216_150557_phi0.40_lat025_t238_iso4.5_k5_...,20251216_152542_phi0.40_lat100_t140_iso4.5_k5_...,0.504918,0.506722,0.4,0.207937
