In [1]:
from google.colab import files
import os

# Interactive Colab upload; returns a mapping keyed by uploaded filename.
uploaded = files.upload()

# === Input files (rename here if needed) ===
HUMAN_XLSX = "valleys_peaks_final_results.xlsx"
MODEL_CSV  = "tie_dialog_6_dialogues_combined.csv"

# If your human sheet name is different, change it here:
HUMAN_SHEET = "valleys_peaks_template.csv"

# === Output folder ===
out_dir = "tie_dialog_region_metrics_optionA_EN"
os.makedirs(out_dir, exist_ok=True)

print("Uploaded files:", list(uploaded.keys()))
print("Output folder:", out_dir)

# Fail-soft sanity check: the loading cell opens HUMAN_XLSX / MODEL_CSV by
# these exact relative paths, so flag a mismatch here instead of at load time.
# NOTE(review): Colab is believed to save re-uploaded duplicates under a
# suffixed name such as "name (1).xlsx" — confirm; that is the typical way
# the expected filename ends up missing or stale.
missing_inputs = [path for path in (HUMAN_XLSX, MODEL_CSV) if not os.path.exists(path)]
if missing_inputs:
    print("WARNING: expected input file(s) not found:", missing_inputs)
    print("  Rename the uploaded file(s) or update HUMAN_XLSX / MODEL_CSV above.")


Saving valleys_peaks_final_results.xlsx to valleys_peaks_final_results.xlsx
Saving tie_dialog_6_dialogues_combined.csv to tie_dialog_6_dialogues_combined.csv
Uploaded files: ['valleys_peaks_final_results.xlsx', 'tie_dialog_6_dialogues_combined.csv']
Output folder: tie_dialog_region_metrics_optionA_EN


In [2]:
import os, json, math, itertools, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# ==============================
# Pre-specified Option A settings
# ==============================
W = 2                 # humans: point -> region [t-W, t+W]
CONS_K = 3            # consensus: >= 3/5 annotators cover a turn
ANNOTATORS = [1,2,3,4,5]

# Model extraction (dialogue-relative quantiles)
Q_HIGH = 0.85         # peak/repair: Ct >= Q85 per dialogue
Q_DLOW = 0.15         # valley/rupture: dCt <= Q15 per dialogue

# Permutation baseline size
B = 2000

# Lag scan window
LAG_MIN, LAG_MAX = -5, 5

# Which metric defines "best lag"
BEST_LAG_METRIC = "iou"   # "iou" or "f1"

# Echo the configuration so it is recorded next to the results.
# round() rather than int(): a float product such as 0.57*100 evaluates to
# 56.99999999999999 and int() would print Q56 instead of Q57.
# The consensus denominator is taken from ANNOTATORS so the printout stays
# correct if the annotator list is edited.
print("Settings:")
print(f"  Human window W = {W}")
print(f"  Consensus threshold K = {CONS_K}/{len(ANNOTATORS)}")
print(f"  Model peak: Ct >= Q{round(Q_HIGH*100)} per dialogue")
print(f"  Model valley: dCt <= Q{round(Q_DLOW*100)} per dialogue")
print(f"  Permutations B = {B}")
print(f"  Lag range = [{LAG_MIN}, {LAG_MAX}]")
print(f"  Best lag metric = {BEST_LAG_METRIC.upper()}")


Settings:
  Human window W = 2
  Consensus threshold K = 3/5
  Model peak: Ct >= Q85 per dialogue
  Model valley: dCt <= Q15 per dialogue
  Permutations B = 2000
  Lag range = [-5, 5]
  Best lag metric = IOU


In [3]:
def merge_intervals(intervals):
    """
    Collapse a list of inclusive (start, end) intervals.

    Intervals are sorted first; any interval that overlaps the previous one,
    or starts directly after it (start == previous end + 1), is fused into it.
    Returns a new list; an empty input yields [].
    """
    if not intervals:
        return []
    ordered = sorted(intervals)
    result = [ordered[0]]
    for start, end in ordered[1:]:
        prev_start, prev_end = result[-1]
        if start > prev_end + 1:
            # Gap of at least one index: begin a new interval.
            result.append((start, end))
            continue
        # Overlapping or touching: extend the interval being built.
        result[-1] = (prev_start, max(prev_end, end))
    return result

def points_to_intervals(points0, w, T):
    """
    Expand 0-indexed points into inclusive windows [t-w, t+w].

    Each window is clipped to the valid index range [0, T-1]; duplicate points
    are ignored and the resulting windows are merged via merge_intervals.
    """
    last = T - 1
    windows = [
        (max(0, t - w), min(last, t + w))
        for t in sorted(set(points0))
    ]
    return merge_intervals(windows)

def mask_to_intervals(mask):
    """
    Return the maximal runs of truthy entries in `mask`.

    Each run is reported as an inclusive (first_index, last_index) tuple of
    plain Python ints, in left-to-right order.
    """
    runs = []
    run_start = None  # index where the current run began, or None outside a run
    n = len(mask)
    for idx in range(n):
        if mask[idx]:
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            runs.append((run_start, idx - 1))
            run_start = None
    if run_start is not None:
        # The mask ended while still inside a run.
        runs.append((run_start, n - 1))
    return runs

def intervals_to_mask(intervals, T):
    """
    Rasterise inclusive (start, end) intervals into a boolean array of length T.

    Endpoints are cast to int and clipped to [0, T-1]; an interval that is
    empty after clipping contributes nothing.
    """
    covered = np.zeros(T, dtype=bool)
    for lo, hi in intervals:
        lo = max(0, int(lo))
        hi = min(T - 1, int(hi))
        if lo > hi:
            continue
        covered[lo:hi + 1] = True
    return covered

def iou_masks(A, B):
    """
    Intersection-over-union of two boolean masks.

    Conventions for empty masks: both empty -> 1.0, exactly one empty -> 0.0.
    """
    size_a = A.sum()
    size_b = B.sum()
    if size_a == 0 and size_b == 0:
        return 1.0
    if size_a == 0 or size_b == 0:
        return 0.0
    union = np.logical_or(A, B).sum()
    if not union:
        return 0.0
    overlap = np.logical_and(A, B).sum()
    return float(overlap) / float(union)

def overlap_prf(model_mask, human_mask):
    """
    Turn-level precision, recall and F1 between two boolean masks.

    precision = share of model-covered turns that lie inside the human mask
    recall    = share of human-covered turns that the model mask covers
    Conventions for empty masks: both empty -> (1, 1, 1); exactly one
    empty -> (0, 0, 0). Returns a tuple of Python floats.
    """
    n_model = model_mask.sum()
    n_human = human_mask.sum()
    if n_model == 0 and n_human == 0:
        return 1.0, 1.0, 1.0
    if n_model == 0 or n_human == 0:
        return 0.0, 0.0, 0.0

    hits = np.logical_and(model_mask, human_mask).sum()
    prec = hits / n_model
    rec = hits / n_human
    if prec + rec:
        f1 = 2*prec*rec/(prec+rec)
    else:
        # No overlapping turns at all.
        f1 = 0.0
    return float(prec), float(rec), float(f1)

def shift_mask(mask, k):
    """
    Shift a boolean mask by k turns, filling vacated positions with False.

    k > 0 moves detections forward (towards later turns); k < 0 moves them
    backward. Entries shifted past either end are dropped, so the result
    always has the same length as `mask`.

    Parameters
    ----------
    mask : numpy array of length T
    k    : int, number of turns to shift

    Returns
    -------
    A new array; for k == 0 a copy of `mask`, otherwise a boolean array.
    """
    T = len(mask)
    if k == 0:
        return mask.copy()
    out = np.zeros(T, dtype=bool)
    if abs(k) >= T:
        # Every entry is pushed off the end. Without this guard the slice
        # assignments below get mismatched shapes once |k| > T and raise
        # ValueError (e.g. T=3, k=-5).
        return out
    if k > 0:
        out[k:] = mask[:T-k]
    else:
        out[:T+k] = mask[-k:]
    return out

# Confirmation message: reaching this line means the helper definitions above ran.
print("Helpers loaded.")


Helpers loaded.


In [4]:
def random_segments_like(intervals, T, rng):
    """
    Draw one random placement of segments matching the lengths in `intervals`.

    Each segment keeps its original length and gets an independently drawn
    start position inside [0, T-1] using `rng.randrange`; a segment at least
    as long as T simply covers the whole range. Placements may collide, so
    the result is passed through merge_intervals before being returned.
    """
    placed = []
    for a, b in intervals:
        seg_len = b - a + 1
        if seg_len < T:
            start = rng.randrange(0, T - seg_len + 1)
            placed.append((start, start + seg_len - 1))
        else:
            placed.append((0, T - 1))
    return merge_intervals(placed)

def lag_scan_best(model_mask, human_mask, lag_min, lag_max, metric="iou"):
    """
    Scan integer lags in [lag_min, lag_max] and return the best-scoring one.

    For each lag k the model mask is shifted by k turns and compared with the
    human mask. The lag is selected on IoU when metric == "iou" and on F1 for
    any other value.

    Tie-breaking: when several lags reach the same score, the lag with the
    smallest |k| is kept (the earlier-scanned one if |k| is also equal).
    A plain "first maximum" rule would always resolve ties to the most
    negative lag, which biases the reported best lag towards lag_min.

    Returns
    -------
    dict with keys "k", "iou", "f1", "prec", "rec" for the selected lag.
    If the lag range is empty the sentinel
    {"k": None, "iou": -1.0, "f1": -1.0, "prec": None, "rec": None}
    is returned.
    """
    best = {"k": None, "iou": -1.0, "f1": -1.0, "prec": None, "rec": None}
    score_field = "iou" if metric=="iou" else "f1"
    for k in range(lag_min, lag_max+1):
        shifted = shift_mask(model_mask, k)
        iou_k = iou_masks(shifted, human_mask)
        p_k, r_k, f1_k = overlap_prf(shifted, human_mask)
        cand = {"k": k, "iou": float(iou_k), "f1": float(f1_k), "prec": float(p_k), "rec": float(r_k)}

        strictly_better = cand[score_field] > best[score_field]
        # Scores are ratios of integer turn counts, so equal overlaps give
        # bit-identical floats and an exact == comparison is meaningful here.
        tied_but_closer = (
            best["k"] is not None
            and cand[score_field] == best[score_field]
            and abs(k) < abs(best["k"])
        )
        if strictly_better or tied_but_closer:
            best = cand
    return best

def perm_baseline_k0(model_intervals, human_mask, T, B=2000, seed=0):
    """
    Permutation baseline with no lag (k=0).

    The observed IoU / F1 between the model regions and `human_mask` are
    compared against B random re-placements of the model segments (same
    lengths, drawn with a `random.Random(seed)` generator).

    Returns a dict holding the observed scores, the two null arrays, their
    mean / sample SD, and the p-values (count(null >= observed) + 1) / (B + 1).
    """
    rng = random.Random(int(seed))

    observed_mask = intervals_to_mask(model_intervals, T)
    obs_iou = iou_masks(observed_mask, human_mask)
    obs_f1 = overlap_prf(observed_mask, human_mask)[2]

    null_iou_values = []
    null_f1_values = []
    for _draw in range(B):
        shuffled_mask = intervals_to_mask(
            random_segments_like(model_intervals, T, rng), T
        )
        null_iou_values.append(iou_masks(shuffled_mask, human_mask))
        null_f1_values.append(overlap_prf(shuffled_mask, human_mask)[2])

    null_iou = np.array(null_iou_values)
    null_f1 = np.array(null_f1_values)

    # The +1 in numerator and denominator counts the observed value itself.
    denom = B + 1
    p_iou = (np.sum(null_iou >= obs_iou) + 1) / denom
    p_f1 = (np.sum(null_f1 >= obs_f1) + 1) / denom

    summary = {
        "obs_iou": float(obs_iou),
        "obs_f1": float(obs_f1),
        "null_iou": null_iou,
        "null_f1": null_f1,
        "null_iou_mean": float(null_iou.mean()),
        "null_iou_sd": float(null_iou.std(ddof=1)),
        "null_f1_mean": float(null_f1.mean()),
        "null_f1_sd": float(null_f1.std(ddof=1)),
        "p_iou": float(p_iou),
        "p_f1": float(p_f1),
    }
    return summary

def perm_baseline_under_lag_scan(model_intervals, human_mask, T,
                                 B=2000, seed=0,
                                 lag_min=-5, lag_max=5,
                                 metric_for_best="iou"):
    """
    Permutation baseline that accounts for the lag scan (max-over-lags).

    The observed statistic is the best score found by lag_scan_best over
    [lag_min, lag_max]. Each of the B permutations re-places the model
    segments at random (same lengths/count), runs the same lag scan, and
    contributes its own best score, so observed and null values are selected
    the same way.

    The score is IoU when metric_for_best == "iou", otherwise F1.
    Returns a dict with the observed best lag/scores, the null arrays of best
    scores and best lags, their mean / sample SD, and
    p_best = (count(null >= observed) + 1) / (B + 1).
    """
    score_key = "iou" if metric_for_best == "iou" else "f1"
    rng = random.Random(int(seed))

    # Observed best-under-lag-scan
    observed = lag_scan_best(
        intervals_to_mask(model_intervals, T), human_mask,
        lag_min, lag_max, metric=metric_for_best,
    )
    obs_best_score = observed[score_key]

    # Null distribution: one lag scan per random placement
    draws = []
    for _draw in range(B):
        shuffled_mask = intervals_to_mask(
            random_segments_like(model_intervals, T, rng), T
        )
        draws.append(
            lag_scan_best(shuffled_mask, human_mask, lag_min, lag_max, metric=metric_for_best)
        )

    null_best_scores = np.array([d[score_key] for d in draws], dtype=float)
    null_best_lags = np.array([d["k"] for d in draws], dtype=int)

    n_at_least_observed = np.sum(null_best_scores >= obs_best_score)
    p_best = (n_at_least_observed + 1) / (B + 1)

    obs_lag = observed["k"]
    return {
        "metric_for_best": metric_for_best,
        "obs_best_lag": None if obs_lag is None else int(obs_lag),
        "obs_best_iou": float(observed["iou"]),
        "obs_best_f1": float(observed["f1"]),
        "obs_best_score": float(obs_best_score),
        "null_best_scores": null_best_scores,
        "null_best_lags": null_best_lags,
        "null_best_mean": float(null_best_scores.mean()),
        "null_best_sd": float(null_best_scores.std(ddof=1)),
        "p_best": float(p_best),
    }

# Confirmation message: reaching this line means the baseline definitions above ran.
print("Baselines loaded (k=0 + under lag scan).")


Baselines loaded (k=0 + under lag scan).


In [5]:
# Turn-level inputs: human annotations come from one Excel sheet,
# model output from a CSV.
human_all = pd.read_excel(HUMAN_XLSX, sheet_name=HUMAN_SHEET)
model_all = pd.read_csv(MODEL_CSV)

# The coherence column may be spelled several ways; take the first match.
coh_col = next(
    (name for name in ("Ct", "ct", "C_t", "c_t") if name in model_all.columns),
    None,
)
assert coh_col is not None, "Coherence column not found. Expected one of: Ct, ct, C_t, c_t."

# Both tables are filtered by dialogue_id and ordered by turn downstream.
for req in ["dialogue_id", "turn"]:
    assert req in human_all.columns, f"Missing column in human file: {req}"
    assert req in model_all.columns, f"Missing column in model file: {req}"

loaded_tables = (("Human", human_all), ("Model", model_all))

print("Loaded:")
for tag, frame in loaded_tables:
    print(f"  {tag}:", frame.shape, "| columns:", len(frame.columns))
print("Using coherence column:", coh_col)

# Dialogue ids present in each table
for tag, frame in loaded_tables:
    print(f"Dialogues ({tag.lower()}):", sorted(frame["dialogue_id"].unique()))


Loaded:
  Human: (180, 14) | columns: 14
  Model: (139, 42) | columns: 42
Using coherence column: Ct
Dialogues (human): [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
Dialogues (model): [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]


In [6]:
# One list of dict-rows per output table; each (dialogue, label) pass appends to them.
rows_model_vs_consensus, rows_iaa, rows_lag, rows_baselines = [], [], [], []

# For transparency: export the actual masks and intervals used
#   rows_masks_turnlevel -> per turn: consensus + model masks
#   rows_intervals       -> per dialogue: intervals as JSON
rows_masks_turnlevel, rows_intervals = [], []

# Multi-page PDF report, written into the output folder.
pdf_path = os.path.join(out_dir, "region_metrics_report_optionA_EN.pdf")
pdf = PdfPages(pdf_path)

def safe_quantile(x, q):
    """
    Quantile `q` of the finite entries of array `x`.

    NaN and +/-inf are discarded first; returns np.nan when nothing finite
    remains, otherwise a Python float.
    """
    finite = x[np.isfinite(x)]
    if len(finite) == 0:
        return np.nan
    return float(np.quantile(finite, q))

# Main analysis loop. For every dialogue in the human table and for each label
# ("valley", "peak") it:
#   1. builds per-annotator windowed regions and the >= CONS_K consensus mask,
#   2. extracts the model regions from Ct (peak) or dCt (valley) by quantile,
#   3. computes overlap metrics, permutation baselines and the best lag,
#   4. appends the result rows and writes three pages to the PDF report.
# The loop is wrapped in try/finally so the PDF handle is closed even when an
# iteration raises (e.g. a missing annotator column).
try:
    for did in sorted(human_all["dialogue_id"].unique()):
        did_int = int(did)

        h = human_all[human_all["dialogue_id"]==did].sort_values("turn").copy()
        m = model_all[model_all["dialogue_id"]==did].sort_values("turn").copy()

        # Turns in your files look 1-indexed; we use T = max turn
        T = int(max(h["turn"].max(), m["turn"].max()))
        turns = np.arange(1, T+1)

        # Build Ct aligned to 1..T (turns absent from the model table stay NaN)
        Ct = np.full(T, np.nan, dtype=float)
        # map turn -> Ct
        for _, r in m.iterrows():
            t = int(r["turn"])
            if 1 <= t <= T:
                Ct[t-1] = float(r[coh_col]) if pd.notnull(r[coh_col]) else np.nan

        thr_high = safe_quantile(Ct, Q_HIGH)

        # dCt[0] = nan, dCt[t] = Ct[t]-Ct[t-1]
        dCt = np.full(T, np.nan, dtype=float)
        for t in range(1, T):
            if np.isfinite(Ct[t]) and np.isfinite(Ct[t-1]):
                dCt[t] = Ct[t] - Ct[t-1]

        thr_dlow = safe_quantile(dCt, Q_DLOW)

        for label in ["valley", "peak"]:
            # ---------- Human annotator masks -> windowed regions ----------
            ann_masks = {}
            ann_intervals = {}

            for a in ANNOTATORS:
                col = f"annotator_{a}_{label}"
                assert col in h.columns, f"Missing human column: {col}"

                pts_turns = h.loc[h[col]==1, "turn"].astype(int).tolist()
                pts0 = [t-1 for t in pts_turns]  # 0-index
                intervals = points_to_intervals(pts0, W, T)
                mask = intervals_to_mask(intervals, T)

                ann_masks[a] = mask
                ann_intervals[a] = intervals

            # ---------- Human–human agreement (IAA): pairwise ----------
            pair_ious = []
            pair_f1s = []
            for a,b in itertools.combinations(ANNOTATORS, 2):
                pair_ious.append(iou_masks(ann_masks[a], ann_masks[b]))
                _,_, f1 = overlap_prf(ann_masks[a], ann_masks[b])
                pair_f1s.append(f1)

            rows_iaa.append({
                "dialogue_id": did_int,
                "label": label,
                "human_window_W": W,
                "IAA_IoU_mean": float(np.mean(pair_ious)),
                "IAA_IoU_sd": float(np.std(pair_ious, ddof=1)) if len(pair_ious)>1 else 0.0,
                "IAA_F1_mean": float(np.mean(pair_f1s)),
                "IAA_F1_sd": float(np.std(pair_f1s, ddof=1)) if len(pair_f1s)>1 else 0.0,
                "n_pairs": len(pair_ious)
            })

            # ---------- Human consensus mask ----------
            # A turn is in the consensus when at least CONS_K annotator regions cover it.
            stack = np.stack([ann_masks[a] for a in ANNOTATORS], axis=0)
            consensus = (stack.sum(axis=0) >= CONS_K)
            consensus_intervals = mask_to_intervals(consensus)

            # ---------- Model mask (Option A) ----------
            # round() instead of int() in the titles: int() truncates float
            # products such as 0.57*100 -> 56.99999999999999.
            if label == "peak":
                model_mask = np.isfinite(Ct) & (Ct >= thr_high)
                model_series = Ct.copy()
                threshold = thr_high
                series_name = "Ct"
                title = f"Dialogue {did_int} — PEAK (repair): Ct >= Q{round(Q_HIGH*100)}"
            else:
                model_mask = np.isfinite(dCt) & (dCt <= thr_dlow)
                model_series = dCt.copy()
                threshold = thr_dlow
                series_name = "dCt"
                title = f"Dialogue {did_int} — VALLEY (rupture): dCt <= Q{round(Q_DLOW*100)}"

            model_intervals = mask_to_intervals(model_mask)

            # ---------- Model vs Consensus metrics ----------
            iou = iou_masks(model_mask, consensus)
            prec, rec, f1 = overlap_prf(model_mask, consensus)

            rows_model_vs_consensus.append({
                "dialogue_id": did_int,
                "label": label,
                "human_window_W": W,
                "consensus_K_of_5": CONS_K,
                "Q_HIGH": Q_HIGH,
                "Q_DLOW": Q_DLOW,
                "thr_high_Ct_Q85": thr_high,
                "thr_dlow_dCt_Q15": thr_dlow,
                "model_covered_turns": int(model_mask.sum()),
                "consensus_covered_turns": int(consensus.sum()),
                "model_segments": len(model_intervals),
                "consensus_segments": len(consensus_intervals),
                "IoU_k0": float(iou),
                "Precision_k0": float(prec),
                "Recall_k0": float(rec),
                "F1_k0": float(f1),
            })

            # ---------- Baseline at k=0 ----------
            # Seed is derived from dialogue id and label so every
            # (dialogue, label) pair gets its own reproducible stream.
            base0 = perm_baseline_k0(
                model_intervals, consensus, T,
                B=B,
                seed=2026 + did_int*10 + (0 if label=="valley" else 1),
            )

            # ---------- NEW: Baseline under lag scan (max-over-lags) ----------
            baseLag = perm_baseline_under_lag_scan(
                model_intervals, consensus, T,
                B=B,
                seed=3030 + did_int*10 + (0 if label=="valley" else 1),
                lag_min=LAG_MIN, lag_max=LAG_MAX,
                metric_for_best=BEST_LAG_METRIC
            )

            rows_baselines.append({
                "dialogue_id": did_int,
                "label": label,
                "B": B,
                "lag_min": LAG_MIN,
                "lag_max": LAG_MAX,
                "best_lag_metric": BEST_LAG_METRIC,

                # k=0 baseline
                "obs_IoU_k0": base0["obs_iou"],
                "obs_F1_k0": base0["obs_f1"],
                "null_IoU_mean_k0": base0["null_iou_mean"],
                "null_IoU_sd_k0": base0["null_iou_sd"],
                "null_F1_mean_k0": base0["null_f1_mean"],
                "null_F1_sd_k0": base0["null_f1_sd"],
                "p_IoU_k0": base0["p_iou"],
                "p_F1_k0": base0["p_f1"],

                # lag-scan baseline
                "obs_best_lag": baseLag["obs_best_lag"],
                "obs_best_score": baseLag["obs_best_score"],
                "null_best_mean": baseLag["null_best_mean"],
                "null_best_sd": baseLag["null_best_sd"],
                "p_best": baseLag["p_best"],

                # extra transparency
                "model_segments": len(model_intervals),
                "model_covered_turns": int(model_mask.sum()),
                "consensus_segments": len(consensus_intervals),
                "consensus_covered_turns": int(consensus.sum()),
            })

            # ---------- Best lag (observed curve scan) ----------
            best = lag_scan_best(model_mask, consensus, LAG_MIN, LAG_MAX, metric=BEST_LAG_METRIC)
            rows_lag.append({
                "dialogue_id": did_int,
                "label": label,
                "lag_min": LAG_MIN,
                "lag_max": LAG_MAX,
                "best_lag_metric": BEST_LAG_METRIC,
                "best_lag_k": int(best["k"]) if best["k"] is not None else None,
                "IoU_at_best_lag": float(best["iou"]),
                "F1_at_best_lag": float(best["f1"]),
                "Precision_at_best_lag": float(best["prec"]),
                "Recall_at_best_lag": float(best["rec"]),
            })

            # ---------- Transparency exports (turn-level masks) ----------
            for t in range(1, T+1):
                rows_masks_turnlevel.append({
                    "dialogue_id": did_int,
                    "label": label,
                    "turn": t,
                    "Ct": float(Ct[t-1]) if np.isfinite(Ct[t-1]) else np.nan,
                    "dCt": float(dCt[t-1]) if np.isfinite(dCt[t-1]) else np.nan,
                    "consensus_mask": int(consensus[t-1]),
                    "model_mask": int(model_mask[t-1]),
                    # raw vote count (0..5)
                    "human_votes": int(stack[:, t-1].sum())
                })

            # ---------- Transparency exports (intervals as JSON) ----------
            rows_intervals.append({
                "dialogue_id": did_int,
                "label": label,
                "T": T,
                "human_window_W": W,
                "consensus_K_of_5": CONS_K,
                "model_intervals": json.dumps(model_intervals),
                "consensus_intervals": json.dumps(consensus_intervals),
                "annotator_intervals": json.dumps({str(a): ann_intervals[a] for a in ANNOTATORS})
            })

            # =========================
            # PDF pages
            # =========================

            # Page 1: signal + threshold + model regions shaded
            fig = plt.figure(figsize=(10,4))
            plt.plot(turns, model_series, label=series_name)
            if np.isfinite(threshold):
                plt.axhline(threshold, linestyle="--", label="threshold")
            # shade model intervals: intervals are 0-indexed while the x-axis
            # shows 1-indexed turns, and each band is padded by half a turn on
            # both sides so a single-turn region (a == b) has non-zero width
            # and stays visible.
            for a,b in model_intervals:
                plt.axvspan(a+0.5, b+1.5, alpha=0.2)
            plt.title(title)
            plt.xlabel("Turn")
            plt.ylabel(series_name)
            plt.legend()
            pdf.savefig(fig, bbox_inches="tight")
            plt.close(fig)

            # Page 2: baseline at k=0 histogram (IoU); vertical line = observed value
            fig = plt.figure(figsize=(8,4))
            plt.hist(base0["null_iou"], bins=30)
            plt.axvline(base0["obs_iou"])
            plt.title(f"Permutation baseline at k=0 — Dialogue {did_int} — {label} (B={B})")
            plt.xlabel("IoU (random regions)")
            plt.ylabel("count")
            pdf.savefig(fig, bbox_inches="tight")
            plt.close(fig)

            # Page 3: NEW baseline under lag scan histogram (best-score)
            fig = plt.figure(figsize=(8,4))
            plt.hist(baseLag["null_best_scores"], bins=30)
            plt.axvline(baseLag["obs_best_score"])
            plt.title(f"Permutation baseline under lag scan (max over lags) — Dialogue {did_int} — {label} (B={B})")
            plt.xlabel(f"Best-{BEST_LAG_METRIC.upper()} over lags [{LAG_MIN},{LAG_MAX}]")
            plt.ylabel("count")
            pdf.savefig(fig, bbox_inches="tight")
            plt.close(fig)
finally:
    # Always finalise the report file, including after an exception above.
    pdf.close()

# Turn the accumulated row dicts into one DataFrame per output table.
df_model, df_iaa, df_lag, df_base = (
    pd.DataFrame(rows)
    for rows in (rows_model_vs_consensus, rows_iaa, rows_lag, rows_baselines)
)
df_masks, df_intv = (
    pd.DataFrame(rows)
    for rows in (rows_masks_turnlevel, rows_intervals)
)

# Shape report: a quick check that every table received the expected rows.
print("Done.")
for caption, table in [
    ("model_vs_consensus", df_model),
    ("human_human_IAA", df_iaa),
    ("best_lag", df_lag),
    ("baselines", df_base),
    ("transparency_turn_masks", df_masks),
    ("transparency_intervals", df_intv),
]:
    print(f"  {caption}:", table.shape)

print("\nPDF report:", pdf_path)


Done.
  model_vs_consensus: (12, 16)
  human_human_IAA: (12, 8)
  best_lag: (12, 10)
  baselines: (12, 23)
  transparency_turn_masks: (360, 8)
  transparency_intervals: (12, 8)

PDF report: tie_dialog_region_metrics_optionA_EN/region_metrics_report_optionA_EN.pdf


In [7]:
import zipfile
from google.colab import files

def agg_summary(df, cols, group="label"):
    """
    Per-group mean and sample SD for a set of numeric columns.

    Parameters
    ----------
    df    : DataFrame with the grouping column(s) and every column in `cols`
    cols  : iterable of column names to summarise
    group : grouping key passed to DataFrame.groupby (default "label")

    Returns
    -------
    DataFrame with one row per group: the group key, then "<col>_mean" and
    "<col>_sd" for each column. The SD uses ddof=1 and is reported as 0.0
    for single-row groups.

    The key column is named after `group` when it is a single column name, so
    grouping by something other than "label" is no longer mislabelled; any
    other kind of `group` (e.g. a list of columns) keeps the name "label".
    """
    key_col = group if isinstance(group, str) else "label"
    out=[]
    for lab,g in df.groupby(group):
        row={key_col: lab}
        for c in cols:
            row[c+"_mean"]=float(g[c].mean())
            row[c+"_sd"]=float(g[c].std(ddof=1)) if len(g)>1 else 0.0
        out.append(row)
    return pd.DataFrame(out)

# Summaries: per-label mean / SD of the headline columns of each table
sum_model = agg_summary(df_model, cols=["IoU_k0","F1_k0","Precision_k0","Recall_k0"])
sum_iaa   = agg_summary(df_iaa,   cols=["IAA_IoU_mean","IAA_F1_mean"])
sum_lag   = agg_summary(df_lag,   cols=["best_lag_k","IoU_at_best_lag","F1_at_best_lag"])
sum_base  = agg_summary(
    df_base,
    cols=[
        "obs_IoU_k0","obs_F1_k0","p_IoU_k0","p_F1_k0",
        "obs_best_score","p_best","null_best_mean",
    ],
)

# Per-dialogue CSV paths (all inside the output folder)
csv_model, csv_iaa, csv_lag, csv_base = (
    os.path.join(out_dir, fname)
    for fname in (
        "per_dialogue_model_vs_consensus_optionA_EN.csv",
        "per_dialogue_IAA_optionA_EN.csv",
        "per_dialogue_best_lag_optionA_EN.csv",
        "per_dialogue_baselines_optionA_EN.csv",
    )
)
csv_masks, csv_intv = (
    os.path.join(out_dir, fname)
    for fname in (
        "TRANSPARENCY_turn_level_masks_EN.csv",
        "TRANSPARENCY_intervals_EN.csv",
    )
)

# Write each table to its CSV, without the index column
for table, csv_target in [
    (df_model, csv_model),
    (df_iaa, csv_iaa),
    (df_lag, csv_lag),
    (df_base, csv_base),
    (df_masks, csv_masks),
    (df_intv, csv_intv),
]:
    table.to_csv(csv_target, index=False)

# Excel master workbook: every per-dialogue table plus the per-label summaries,
# one sheet each, written in the order listed.
xlsx_out = os.path.join(out_dir, "ALL_RESULTS_optionA_EN.xlsx")
workbook_sheets = [
    ("model_vs_consensus", df_model),
    ("human_human_IAA", df_iaa),
    ("best_lag", df_lag),
    ("baselines_k0_and_lagscan", df_base),
    ("TRANSPARENCY_masks", df_masks),
    ("TRANSPARENCY_intervals", df_intv),
    ("SUMMARY_model", sum_model),
    ("SUMMARY_IAA", sum_iaa),
    ("SUMMARY_lag", sum_lag),
    ("SUMMARY_baselines", sum_base),
]
with pd.ExcelWriter(xlsx_out, engine="openpyxl") as w:
    for sheet_title, sheet_table in workbook_sheets:
        sheet_table.to_excel(w, sheet_name=sheet_title, index=False)

print("Saved:")
for saved_path in (xlsx_out, pdf_path):
    print(" ", saved_path)

# ZIP the whole output folder for one-click download.
# Entries are stored under "<out_dir>/<relative path>" inside the archive.
zip_path = out_dir + ".zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for folder, _subdirs, names in os.walk(out_dir):
        for name in names:
            on_disk = os.path.join(folder, name)
            inside_zip = os.path.join(out_dir, os.path.relpath(on_disk, out_dir))
            z.write(on_disk, arcname=inside_zip)

print("ZIP:", zip_path)

# Display summaries: heading line followed by the rich table
for heading, summary_table in [
    ("\nSUMMARY — Model vs Consensus", sum_model),
    ("\nSUMMARY — Human–Human IAA", sum_iaa),
    ("\nSUMMARY — Best lag", sum_lag),
    ("\nSUMMARY — Baselines (k=0 + lag-scan)", sum_base),
]:
    print(heading)
    display(summary_table)

# Download main outputs + zip through the Colab browser bridge
for deliverable in (xlsx_out, pdf_path, zip_path):
    files.download(deliverable)


Saved:
  tie_dialog_region_metrics_optionA_EN/ALL_RESULTS_optionA_EN.xlsx
  tie_dialog_region_metrics_optionA_EN/region_metrics_report_optionA_EN.pdf
ZIP: tie_dialog_region_metrics_optionA_EN.zip

SUMMARY — Model vs Consensus


Unnamed: 0,label,IoU_k0_mean,IoU_k0_sd,F1_k0_mean,F1_k0_sd,Precision_k0_mean,Precision_k0_sd,Recall_k0_mean,Recall_k0_sd
0,peak,0.189947,0.075928,0.313447,0.109244,0.819444,0.213546,0.196495,0.076195
1,valley,0.121098,0.072476,0.209562,0.120377,0.527778,0.323465,0.131944,0.076924



SUMMARY — Human–Human IAA


Unnamed: 0,label,IAA_IoU_mean_mean,IAA_IoU_mean_sd,IAA_F1_mean_mean,IAA_F1_mean_sd
0,peak,0.500092,0.137199,0.629215,0.138402
1,valley,0.540088,0.090345,0.668133,0.08743



SUMMARY — Best lag


Unnamed: 0,label,best_lag_k_mean,best_lag_k_sd,IoU_at_best_lag_mean,IoU_at_best_lag_sd,F1_at_best_lag_mean,F1_at_best_lag_sd
0,peak,-3.5,1.224745,0.220055,0.041968,0.359099,0.056902
1,valley,-4.333333,0.516398,0.231672,0.064148,0.372629,0.082126



SUMMARY — Baselines (k=0 + lag-scan)


Unnamed: 0,label,obs_IoU_k0_mean,obs_IoU_k0_sd,obs_F1_k0_mean,obs_F1_k0_sd,p_IoU_k0_mean,p_IoU_k0_sd,p_F1_k0_mean,p_F1_k0_sd,obs_best_score_mean,obs_best_score_sd,p_best_mean,p_best_sd,null_best_mean_mean,null_best_mean_sd
0,peak,0.189947,0.075928,0.313447,0.109244,0.376895,0.273078,0.376895,0.273078,0.220055,0.041968,0.574879,0.15515,0.187145,0.044639
1,valley,0.121098,0.072476,0.209562,0.120377,0.586623,0.330573,0.586623,0.330573,0.231672,0.064148,0.398551,0.286531,0.166521,0.02012


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>