In [10]:
# ---------------------------------------------------------------
# Runs the statistical pipeline described in the paper:
# - Q1: Friedman (repeated measures) + Wilcoxon post-hoc (Holm)
# - Q2: FP32 vs INT8 (Shapiro -> paired t or Wilcoxon)
# Saves the result tables to disk (plain-text .txt in the cells shown here; `write_latex` is provided for .tex output) and prints a succinct console summary.
# ---------------------------------------------------------------

import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import shapiro, ttest_rel, wilcoxon, friedmanchisquare

In [11]:
# ========== CONFIG ==========
# Location of the per-fold score table (edit if the file lives elsewhere).
CSV_PATH = "./model_scores_per_fold.csv"  # <-- change path if needed

# Display names of the models to analyse. Models missing from the data are
# skipped; the order given here becomes the column order of the per-metric
# matrices and therefore the (model1, model2) orientation of every pair.
MODEL_ORDER = [
    "Random Forest",
    "SVM",
    "XGBoost",
    "LightGBM",
    "FCNN (FP32)",
    "FCNN (INT8)",
    "AutoEncoder (FP32)",
    "AutoEncoder (INT8)",
]

# WIDE-format columns are named "<prefix>_<suffix>"; this table translates the
# prefix into a display name from MODEL_ORDER.
PREFIX_TO_MODEL = {
    "RF": "Random Forest",
    "SVM": "SVM",
    "XGB": "XGBoost",
    "LGB": "LightGBM",
    "FCNN32": "FCNN (FP32)",
    "FCNN8": "FCNN (INT8)",
    "AE32": "AutoEncoder (FP32)",
    "AE8": "AutoEncoder (INT8)",
}

# ...and this one translates the suffix into the metric's reporting name.
# Suffixes that are not listed are used verbatim.
METRIC_MAPPING = {
    "Acc": "Accuracy",
    "BalAcc": "Balanced Accuracy",
    "F1": "Macro F1",
    "ROC": "ROC AUC",
    "Prec": "Precision (Macro)",
    "Rec": "Recall (Macro)",
}

# (FP32, INT8) display-name pairs compared in the quantization analysis (Q2).
QUANT_PAIRS = [
    ("FCNN (FP32)", "FCNN (INT8)"),
    ("AutoEncoder (FP32)", "AutoEncoder (INT8)"),
]

# Metrics to analyse; leave empty to auto-detect them from the CSV.
METRICS = []  # e.g., ["Accuracy","Balanced Accuracy","Macro F1","ROC AUC","Precision (Macro)","Recall (Macro)"]

# LONG-format column names; None means "infer from the CSV header".
MODEL_COL = None  # e.g., "Model"
FOLD_COL  = None  # e.g., "fold" or "fold_id"

In [12]:
# ========== Helpers ==========
def holm_bonferroni(pvals_dict):
    """Holm step-down adjustment of a family of p-values.

    Parameters
    ----------
    pvals_dict : dict
        Maps a comparison key (e.g. ``(model_a, model_b)``) to its raw p-value.

    Returns
    -------
    dict
        Same keys, mapped to Holm-adjusted p-values capped at 1.0.

    Notes
    -----
    * With the raw p-values sorted ascending, the i-th adjusted value is the
      running maximum of ``(m - i + 1) * p_(i)``. The running maximum is what
      keeps the adjusted values monotone: a hypothesis can never end up with a
      smaller adjusted p than one that had a smaller raw p.
    * NaN p-values (tests that could not be computed) are passed through as
      NaN and are not counted in the family size ``m``; including them would
      make the sort order undefined.
    """
    finite = [(key, p) for key, p in pvals_dict.items() if not np.isnan(p)]
    ranked = sorted(finite, key=lambda kv: kv[1])
    m = len(ranked)

    out = {}
    running_max = 0.0
    for i, (key, p) in enumerate(ranked, start=1):
        running_max = max(running_max, (m - i + 1) * p)
        out[key] = min(running_max, 1.0)

    # Keep every input key in the result, even those without a usable p-value.
    for key, p in pvals_dict.items():
        if np.isnan(p):
            out[key] = np.nan
    return out

def cohen_d_paired(a, b):
    """Cohen's d for paired samples: mean of the differences over their sample SD.

    ``a`` and ``b`` are equal-length sequences of paired observations. The
    result is NaN when the differences have no positive sample standard
    deviation (e.g. they are all identical).
    """
    diffs = np.asarray(a) - np.asarray(b)
    spread = diffs.std(ddof=1)
    if not spread > 0:
        # Zero (or undefined) spread: the ratio is reported as NaN rather than inf.
        spread = np.nan
    return diffs.mean() / spread

def cliffs_delta(a, b):
    """Cliff's delta between two 1-D samples.

    Counts, over every (x, y) pair with x from ``a`` and y from ``b``, how
    often x > y and how often x < y, and returns the difference of the two
    counts divided by the number of pairs. Returns NaN when either sample is
    empty.
    """
    xs = np.asarray(a)
    ys = np.asarray(b)
    n_pairs = len(xs) * len(ys)
    if not n_pairs:
        return np.nan
    # Compare every x against every y at once via an (len(xs), len(ys)) grid.
    x_col = xs[:, None]
    y_row = ys[None, :]
    n_greater = (x_col > y_row).sum()
    n_less = (x_col < y_row).sum()
    return (n_greater - n_less) / n_pairs

def write_latex(df, path, caption, label):
    """Write ``df`` to ``path`` as a LaTeX table (UTF-8).

    The index is omitted, special characters are escaped, floats are rendered
    with ``%.4g``, and ``caption`` / ``label`` are forwarded to
    ``DataFrame.to_latex``.
    """
    latex_options = {
        "index": False,
        "escape": True,
        "caption": caption,
        "label": label,
        "float_format": "%.4g",
    }
    with open(path, "w", encoding="utf-8") as handle:
        # Let pandas render straight into the open file handle.
        df.to_latex(buf=handle, **latex_options)

In [13]:
# ========== Load CSV and detect shape ==========
# Read the score table from CSV_PATH (set in the CONFIG cell) with pandas' default parsing.
df = pd.read_csv(CSV_PATH)

# Try to detect LONG vs WIDE
def is_long_format(d: pd.DataFrame):
    """Return True when ``d`` looks like a LONG table (one row per model/fold).

    A frame is considered LONG when at least one of its non-numeric columns
    has "model" in its name (case-insensitive); otherwise it is treated as
    WIDE.

    Dtypes are checked with ``pandas.api.types`` rather than
    ``np.issubdtype`` because the latter raises ``TypeError`` on pandas
    extension dtypes such as ``category`` or ``string``. Boolean columns are
    still treated as non-numeric, matching ``np.issubdtype(bool, np.number)``.
    """
    def _is_numeric(col):
        dtype = d[col].dtype
        return (pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype))

    nonnum = [c for c in d.columns if not _is_numeric(c)]
    # str(c) keeps the check safe for non-string column labels.
    return any('model' in str(c).lower() for c in nonnum)

long_format = is_long_format(df)
print(f"Detected format: {'LONG' if long_format else 'WIDE'}")


def _is_metric_dtype(series):
    """True for numeric, non-boolean columns; safe for pandas extension dtypes."""
    return (pd.api.types.is_numeric_dtype(series.dtype)
            and not pd.api.types.is_bool_dtype(series.dtype))


if long_format:
    # Infer MODEL_COL and FOLD_COL if needed
    if MODEL_COL is None:
        nonnum = [c for c in df.columns if not _is_metric_dtype(df[c])]
        # Prefer the column that actually mentions "model" (the cue the format
        # detection relies on); fall back to the first non-numeric column.
        MODEL_COL = next((c for c in nonnum if "model" in str(c).lower()), nonnum[0])
    if FOLD_COL is None:
        candidates = [c for c in df.columns
                      if any(tag in str(c).lower() for tag in ("fold", "repeat", "cv"))]
        FOLD_COL = candidates[0] if candidates else None
    if FOLD_COL is None:
        # Every later step pivots on the fold column, so fail here with a clear message.
        raise ValueError(
            "LONG format detected but no fold column found; set FOLD_COL in the CONFIG cell. "
            f"Columns: {list(df.columns)}"
        )

    # Build metric list if not provided
    if not METRICS:
        ignore = {MODEL_COL, FOLD_COL}
        METRICS = [c for c in df.columns if c not in ignore and _is_metric_dtype(df[c])]

    # Clean model labels
    df[MODEL_COL] = df[MODEL_COL].astype(str).str.strip()
else:
    # WIDE format: create LONG view for processing.
    # Parse columns like "RF_Acc", "SVM_BalAcc", etc.
    if "Fold" not in df.columns:
        raise ValueError(f"WIDE format requires a 'Fold' column. Columns: {list(df.columns)}")

    # (column, model display name, raw suffix, metric display name) for every
    # column whose prefix is a known model; everything else is ignored.
    parsed = []
    for col in df.columns:
        if col == 'Fold' or '_' not in col:
            continue
        prefix, suffix = col.split('_', 1)
        if prefix in PREFIX_TO_MODEL:
            parsed.append((col, PREFIX_TO_MODEL[prefix], suffix, METRIC_MAPPING.get(suffix, suffix)))

    if not parsed:
        raise ValueError(
            "No '<prefix>_<metric>' columns with a prefix from PREFIX_TO_MODEL were found. "
            f"Columns: {list(df.columns)}"
        )

    # Build metric list from the suffixes of recognised columns only, so that
    # unrelated "x_y" columns cannot introduce metrics that have no data.
    if not METRICS:
        suffixes = sorted({suffix for _, _, suffix, _ in parsed})
        METRICS = list(dict.fromkeys(METRIC_MAPPING.get(s, s) for s in suffixes))

    print(f"Found metrics: {METRICS}")

    # Build LONG format column by column (no per-row Python loop; this also
    # keeps the original dtype of 'Fold' instead of upcasting it per row).
    pieces = [
        pd.DataFrame({
            'fold': df['Fold'].to_numpy(),
            'Model': model_name,
            'metric_name': metric_name,
            'value': df[col].to_numpy(),
        })
        for col, model_name, _, metric_name in parsed
    ]
    long_df = pd.concat(pieces, ignore_index=True)

    # Pivot to (fold, model) x metrics
    MODEL_COL, FOLD_COL = "Model", "fold"
    df = long_df.pivot_table(index=[FOLD_COL, MODEL_COL], columns="metric_name", values="value").reset_index()

    # Clean up column names
    df.columns.name = None

# Every requested metric must exist as a column before the analysis starts.
missing_metrics = [m for m in METRICS if m not in df.columns]
if missing_metrics:
    raise ValueError(f"Metrics not found in data: {missing_metrics}. Columns: {list(df.columns)}")

print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Now we are in LONG-like form with MODEL_COL, FOLD_COL, METRICS
models_present = sorted(df[MODEL_COL].unique())
print(f"Models found: {models_present}")

# Keep only models we know (if present)
models_used = [m for m in MODEL_ORDER if m in models_present]
print(f"Models to use: {models_used}")

if len(models_used) < 3:
    raise ValueError(f"Fewer than 3 recognized models. Found: {models_present}")

Detected format: WIDE
Found metrics: ['Accuracy', 'Balanced Accuracy', 'Macro F1', 'Precision (Macro)', 'ROC AUC', 'Recall (Macro)']
DataFrame shape: (240, 8)
Columns: ['fold', 'Model', 'Accuracy', 'Balanced Accuracy', 'Macro F1', 'Precision (Macro)', 'ROC AUC', 'Recall (Macro)']
Models found: ['AutoEncoder (FP32)', 'AutoEncoder (INT8)', 'FCNN (FP32)', 'FCNN (INT8)', 'LightGBM', 'Random Forest', 'SVM', 'XGBoost']
Models to use: ['Random Forest', 'SVM', 'XGBoost', 'LightGBM', 'FCNN (FP32)', 'FCNN (INT8)', 'AutoEncoder (FP32)', 'AutoEncoder (INT8)']


In [14]:
# Build wide matrices per metric: rows = folds, cols = models
def metric_wide(metric):
    """Return the fold-by-model matrix of ``metric``.

    Pivots the global ``df`` on ``FOLD_COL`` / ``MODEL_COL`` (first value per
    cell), restricts and orders the columns to ``models_used``, and drops every
    fold that lacks a value for any of those models.
    """
    scores = df[[FOLD_COL, MODEL_COL, metric]]
    pivoted = scores.pivot_table(
        index=FOLD_COL,
        columns=MODEL_COL,
        values=metric,
        aggfunc="first",
    )
    selected = pivoted.reindex(columns=models_used)
    return selected.dropna(axis=0, how="any")

def _paired_wilcoxon(x, y):
    """Two-sided Wilcoxon signed-rank test for paired samples.

    Returns ``(statistic, p_value)``. When every paired difference is zero the
    test has nothing to rank under ``zero_method='wilcox'``; that case is
    reported as ``(0.0, 1.0)``. It is checked explicitly instead of relying
    only on a ``ValueError`` from SciPy, so a NaN p-value cannot leak into the
    Holm correction (how SciPy reports this case may vary between versions).
    Any other ``ValueError`` from SciPy is logged and mapped to the same result.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if np.all(x == y):
        return 0.0, 1.0
    try:
        stat, p = wilcoxon(x, y, zero_method='wilcox', alternative='two-sided')
    except ValueError as err:
        print(f"Wilcoxon test failed ({err}); reporting stat=0, p=1")
        return 0.0, 1.0
    return stat, p


# Result accumulators: one dict per output-table row.
friedman_rows = []
posthoc_rows  = []
quant_rows    = []

for metric in METRICS:
    print(f"\nProcessing {metric}...")
    W = metric_wide(metric)
    if W.empty or W.shape[0] < 3:
        print(f"Skipping {metric} - insufficient data")
        continue

    print(f"Data shape for {metric}: {W.shape}")

    # ---------- Q1: Friedman ----------
    # One sample per model, paired by fold. A failure is reported and recorded
    # as NaN so the remaining metrics are still processed.
    try:
        fr_stat, fr_p = friedmanchisquare(*[W[m].values for m in W.columns])
    except Exception as e:
        print(f"Friedman test failed for {metric}: {e}")
        fr_stat, fr_p = (np.nan, np.nan)
    friedman_rows.append({
        "metric": metric,
        "n_folds": W.shape[0],
        "n_models": W.shape[1],
        "chi2": fr_stat,
        "p_value": fr_p
    })

    # Post-hoc Wilcoxon (paired) with Holm; include model1/model2 + effect
    pair_p = {}
    pair_stat = {}
    pair_eff = {}

    for m1, m2 in combinations(W.columns, 2):
        s1, s2 = W[m1].values, W[m2].values
        stat, p = _paired_wilcoxon(s1, s2)
        pair_p[(m1, m2)] = p
        pair_stat[(m1, m2)] = stat
        # Effect size for post-hoc: Cliff's delta (direction m1 vs m2)
        pair_eff[(m1, m2)] = cliffs_delta(s1, s2)

    # Holm correction is applied within each metric (family = all model pairs).
    adj = holm_bonferroni(pair_p)
    for (m1, m2), p in pair_p.items():
        posthoc_rows.append({
            "metric": metric,
            "model1": m1,
            "model2": m2,
            "pair": f"{m1} vs {m2}",
            "wilcoxon_stat": pair_stat[(m1, m2)],
            "p_raw": p,
            "p_holm": adj[(m1, m2)],
            "effect": pair_eff[(m1, m2)]
        })

    # ---------- Q2: Quantization (per architecture) ----------
    for a, b in QUANT_PAIRS:
        if a not in W.columns or b not in W.columns:
            continue
        scores_a, scores_b = W[a].values, W[b].values
        diffs = scores_a - scores_b

        # Defaults describe "no test run": everything NaN.
        # NOTE: 'test' stays "Wilcoxon" in that case, as in the saved tables so far.
        p_norm = np.nan
        test_name = "Wilcoxon"
        stat = np.nan
        pval = np.nan
        eff = np.nan

        if diffs.size >= 3 and diffs.var() > 0:
            # Shapiro-Wilk on the paired differences chooses the test.
            try:
                _, p_norm = shapiro(diffs)
            except Exception:
                p_norm = np.nan
            if not np.isnan(p_norm) and p_norm > 0.05:
                # paired t
                stat, pval = ttest_rel(scores_a, scores_b)
                eff = cohen_d_paired(scores_a, scores_b)
                test_name = "Paired t"
            else:
                # Wilcoxon
                stat, pval = _paired_wilcoxon(scores_a, scores_b)
                eff = cliffs_delta(scores_a, scores_b)

        quant_rows.append({
            "metric": metric,
            "pair": f"{a} vs {b}",
            "n_folds": W.shape[0],
            "normality_p": p_norm,
            "test": test_name,
            "stat": stat,
            "p_value": pval,
            "effect": eff
        })

print("\nProcessing complete!")


Processing Accuracy...
Data shape for Accuracy: (30, 8)

Processing Balanced Accuracy...
Data shape for Balanced Accuracy: (30, 8)

Processing Macro F1...
Data shape for Macro F1: (30, 8)

Processing Precision (Macro)...
Data shape for Precision (Macro): (30, 8)

Processing ROC AUC...
Data shape for ROC AUC: (30, 8)

Processing Recall (Macro)...
Data shape for Recall (Macro): (30, 8)

Processing complete!


In [15]:
# ---------- Save as .TXT (human-readable) + ENHANCED Q1 SUMMARY ----------

def save_txt(df, path, title):
    """Write ``df`` to ``path`` as a plain-text report (UTF-8).

    The file starts with ``title`` underlined by ``=`` characters, followed by
    the frame rendered without its index, or ``(no results)`` when the frame
    is empty.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.write(f"{title}\n" + "=" * len(title) + "\n")
        body = "(no results)" if df.empty else df.to_string(index=False)
        out.write(body + "\n")

# Create DataFrames (sorted)
def _rows_to_frame(rows, columns, sort_by):
    """Turn a list of result dicts into a sorted DataFrame.

    When ``rows`` is empty, an empty frame that still carries ``columns`` is
    returned; sorting a column-less frame would raise ``KeyError``, and the
    reporting code below filters on columns such as ``p_value``.
    """
    if not rows:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(rows).sort_values(sort_by)


friedman_df = _rows_to_frame(
    friedman_rows,
    ["metric", "n_folds", "n_models", "chi2", "p_value"],
    ["metric"],
)
posthoc_df = _rows_to_frame(
    posthoc_rows,
    ["metric", "model1", "model2", "pair", "wilcoxon_stat", "p_raw", "p_holm", "effect"],
    ["metric", "p_holm"],
)
quant_df = _rows_to_frame(
    quant_rows,
    ["metric", "pair", "n_folds", "normality_p", "test", "stat", "p_value", "effect"],
    ["metric", "pair"],
)

# Save to .txt files
save_txt(friedman_df, "friedman_results.txt",
         "Friedman omnibus tests across models (per metric)")
save_txt(posthoc_df,  "posthoc_wilcoxon_holm.txt",
         "Post-hoc Wilcoxon pairs after Holm correction")
save_txt(quant_df,    "quantization_tests.txt",
         "Quantization comparisons (FP32 vs INT8) with chosen test and effect size")

print("Results saved to TXT files:")
print("- friedman_results.txt")
print("- posthoc_wilcoxon_holm.txt")
print("- quantization_tests.txt")

# ---------------------- Q1: build richer summary ----------------------

# Each metric's fold-by-model matrix is needed several times below (effects,
# ranks, median differences); pivot it once and reuse it.
_wide_by_metric = {}


def _wide_for(metric):
    """Return ``metric_wide(metric)``, computing it at most once per metric."""
    if metric not in _wide_by_metric:
        _wide_by_metric[metric] = metric_wide(metric)
    return _wide_by_metric[metric]


# Ensure posthoc has model1/model2/effect; if not, derive
if not posthoc_df.empty:
    if "model1" not in posthoc_df.columns or "model2" not in posthoc_df.columns:
        split = posthoc_df["pair"].str.split(" vs ", n=1, expand=True)
        posthoc_df["model1"] = split[0]
        posthoc_df["model2"] = split[1]
    if "effect" not in posthoc_df.columns:
        # compute Cliff's Δ as an effect size direction for each pair/metric
        def _pair_effect(r):
            Wloc = _wide_for(r["metric"])
            m1, m2 = r["model1"], r["model2"]
            if m1 in Wloc.columns and m2 in Wloc.columns:
                return cliffs_delta(Wloc[m1].values, Wloc[m2].values)
            return np.nan
        posthoc_df["effect"] = posthoc_df.apply(_pair_effect, axis=1)

# Compute per-metric means/SDs and average ranks; build global leaderboard
per_metric_summaries = []
global_rank_accum = []

for metric in METRICS:
    Wm = _wide_for(metric)
    if Wm.empty:
        continue
    means = Wm.mean(axis=0)
    stds  = Wm.std(axis=0, ddof=1)
    # Rank the models within each fold (rank 1 = highest score), then average over folds.
    ranks = Wm.rank(axis=1, method="average", ascending=False)   # higher is better
    avg_ranks = ranks.mean(axis=0).sort_values()                 # ↓ is better
    global_rank_accum.append(avg_ranks)

    metric_df = pd.DataFrame({
        "Metric": metric,
        "Model": means.index,
        "Mean": means.values,
        "SD": stds.reindex(means.index).values,
        "Avg Rank": avg_ranks.reindex(means.index).values
    }).sort_values(["Avg Rank","Mean"], ascending=[True, False])
    per_metric_summaries.append(metric_df)


def _median_diff(r):
    """Median of (model1 - model2) over folds for one post-hoc row; NaN if unavailable."""
    Wloc = _wide_for(r["metric"])
    if r["model1"] in Wloc.columns and r["model2"] in Wloc.columns:
        d = (Wloc[r["model1"]] - Wloc[r["model2"]]).values
        return np.median(d) if len(d) else np.nan
    return np.nan


if global_rank_accum:
    global_rank = pd.concat(global_rank_accum, axis=1).mean(axis=1).sort_values()
    # Directional wins/losses from significant post-hoc (Holm < .05).
    # The post-hoc table can be empty even when rank data exists (e.g. metrics
    # with fewer than 3 folds are ranked but never tested), so guard the lookup.
    if posthoc_df.empty:
        posthoc_sig = pd.DataFrame()
    else:
        posthoc_sig = posthoc_df[posthoc_df["p_holm"] < 0.05].copy()
    # get median diff to determine direction (m1 > m2 or vice versa)
    if not posthoc_sig.empty:
        posthoc_sig["median_diff"] = posthoc_sig.apply(_median_diff, axis=1)
    win_counts = {m: 0 for m in models_used}
    loss_counts = {m: 0 for m in models_used}
    for _, r in posthoc_sig.iterrows():
        if pd.notnull(r["median_diff"]):
            if r["median_diff"] > 0:
                win_counts[r["model1"]] += 1
                loss_counts[r["model2"]] += 1
            elif r["median_diff"] < 0:
                win_counts[r["model2"]] += 1
                loss_counts[r["model1"]] += 1
            # A median difference of exactly 0 counts for neither side.
    global_table = pd.DataFrame({
        "Model": global_rank.index,
        "Global Avg Rank (↓ better)": global_rank.values,
        "Wins (Holm<0.05)": [win_counts.get(m,0) for m in global_rank.index],
        "Losses": [loss_counts.get(m,0) for m in global_rank.index]
    })
else:
    posthoc_sig = pd.DataFrame()
    global_table = pd.DataFrame()

# Save Q1 human-readable summary file
# Sections, in order: Friedman per metric, global leaderboard, per-metric
# winner, and the 20 strongest significant post-hoc pairs.
with open("q1_summary.txt", "w", encoding="utf-8") as report:
    report.write("Q1 SUMMARY: Which model is best?\n")
    report.write("================================\n\n")

    report.write("Friedman omnibus per metric:\n")
    if not friedman_df.empty:
        report.writelines(
            f"  - {row['metric']}: chi2={row['chi2']:.3f}, p={row['p_value']:.2e}\n"
            for _, row in friedman_df.iterrows()
        )
        report.write("\n")
    else:
        report.write("  (no results)\n\n")

    report.write("Global leaderboard (average rank across metrics):\n")
    if not global_table.empty:
        report.write(global_table.to_string(index=False))
        report.write("\n\n")
    else:
        report.write("  (no leaderboard)\n\n")

    report.write("Per-metric best model (mean ± sd; avg rank):\n")
    if per_metric_summaries:
        for metric_df in per_metric_summaries:
            # Each summary is already sorted best-first, so row 0 is the winner.
            top = metric_df.iloc[0]
            report.write(f"  - {top['Metric']}: {top['Model']} "
                         f"({top['Mean']:.4f} ± {top['SD']:.4f}; rank={top['Avg Rank']:.2f})\n")
        report.write("\n")
    else:
        report.write("  (none)\n")

    report.write("Significant post-hoc pairs after Holm (directional):\n")
    if not posthoc_sig.empty:
        strongest = posthoc_sig.sort_values(["p_holm","metric"]).head(20)
        for _, r in strongest.iterrows():
            # Direction comes from the sign of the median paired difference.
            if r["median_diff"] > 0:
                arrow = " > "
            elif r["median_diff"] < 0:
                arrow = " < "
            else:
                arrow = " = "
            report.write(f"  - {r['metric']}: {r['model1']}{arrow}{r['model2']} "
                         f"(p_holm={r['p_holm']:.2e})\n")
    else:
        report.write("  (none)\n")
print("Saved: q1_summary.txt")

# ========================= Console SUMMARY (enhanced) =========================
rule = "=" * 70
print("\n" + rule)
print("SUMMARY")
print(rule)
print(f"Models analyzed: {len(models_used)}")
print(f"Metrics analyzed: {len(METRICS)}")

# ---------- Q1: Model comparison ----------
print("\n[Q1] WHICH MODEL IS BEST? (Friedman + Ranks + Post-hoc)")
# Split at alpha = 0.05; rows whose p-value is NaN land in neither subset.
sig_friedman = friedman_df[friedman_df['p_value'] < 0.05]
nonsig_friedman = friedman_df[friedman_df['p_value'] >= 0.05]

if sig_friedman.empty:
    print("No metrics reached significance in Friedman test.")
else:
    print(f"Friedman significant (p < 0.05): {len(sig_friedman)} / {len(friedman_df)} metrics")
    for _, row in sig_friedman.iterrows():
        print(f"  - {row['metric']}: χ²={row['chi2']:.3f}, p={row['p_value']:.2e}")

if not nonsig_friedman.empty:
    print(f"Friedman non-significant: {len(nonsig_friedman)} metrics")
    for _, row in nonsig_friedman.iterrows():
        print(f"  - {row['metric']}: χ²={row['chi2']:.3f}, p={row['p_value']:.2e}")

# Global leaderboard
if not global_table.empty:
    print("\nGlobal leaderboard (average rank across metrics):")
    leaderboard = global_table.sort_values("Global Avg Rank (↓ better)")
    for _, r in leaderboard.iterrows():
        avg_rank = r['Global Avg Rank (↓ better)']
        wins = int(r['Wins (Holm<0.05)'])
        losses = int(r['Losses'])
        print(f"  {r['Model']}: avg rank={avg_rank:.2f}, "
              f"Wins={wins}, Losses={losses}")

# Per-metric bests
if per_metric_summaries:
    print("\nPer-metric best model (mean ± sd; avg rank):")
    for metric_df in per_metric_summaries:
        # Summaries are sorted best-first, so the first row is the winner.
        top = metric_df.iloc[0]
        print(f"  - {top['Metric']}: {top['Model']} "
              f"({top['Mean']:.4f} ± {top['SD']:.4f}; rank={top['Avg Rank']:.2f})")

# Post-hoc directional highlights
if posthoc_sig.empty:
    print("\nNo significant post-hoc pairs after Holm.")
else:
    print("\nTop post-hoc (directional, Holm-corrected p) — smallest first:")
    for _, r in posthoc_sig.sort_values(["p_holm","metric"]).head(10).iterrows():
        if r["median_diff"] > 0:
            arrow = " > "
        elif r["median_diff"] < 0:
            arrow = " < "
        else:
            arrow = " = "
        print(f"  {r['metric']}: {r['model1']}{arrow}{r['model2']} (p_holm={r['p_holm']:.2e})")

# ---------- Separator ----------
print("\n" + "-"*70)

# ---------- Q2: Quantization effect ----------
print("[Q2] DOES INT8 QUANTIZATION AFFECT PERFORMANCE?")
# Split at alpha = 0.05; rows whose p-value is NaN (no test run) land in neither subset.
sig_quant = quant_df[quant_df['p_value'] < 0.05]
nonsig_quant = quant_df[quant_df['p_value'] >= 0.05]


def _print_quant_rows(rows):
    """Print one line per quantization comparison: metric, pair, test, p, effect."""
    for _, row in rows.iterrows():
        print(f"  - {row['metric']} [{row['pair']}]: {row['test']}, "
              f"p={row['p_value']:.4g}, effect={row['effect']:.3f}")


if sig_quant.empty:
    print("No significant quantization effects found.")
else:
    print(f"Significant quantization differences (p < 0.05): {len(sig_quant)} cases")
    _print_quant_rows(sig_quant)

if not nonsig_quant.empty:
    print(f"Non-significant quantization results: {len(nonsig_quant)} cases")
    _print_quant_rows(nonsig_quant)


Results saved to TXT files:
- friedman_results.txt
- posthoc_wilcoxon_holm.txt
- quantization_tests.txt
Saved: q1_summary.txt

SUMMARY
Models analyzed: 8
Metrics analyzed: 6

[Q1] WHICH MODEL IS BEST? (Friedman + Ranks + Post-hoc)
Friedman significant (p < 0.05): 6 / 6 metrics
  - Accuracy: χ²=206.611, p=4.56e-41
  - Balanced Accuracy: χ²=181.322, p=1.02e-35
  - Macro F1: χ²=181.889, p=7.77e-36
  - Precision (Macro): χ²=163.567, p=5.69e-32
  - ROC AUC: χ²=173.122, p=5.51e-34
  - Recall (Macro): χ²=181.322, p=1.02e-35

Global leaderboard (average rank across metrics):
  XGBoost: avg rank=1.95, Wins=35, Losses=4
  LightGBM: avg rank=2.64, Wins=30, Losses=7
  Random Forest: avg rank=2.81, Wins=31, Losses=10
  AutoEncoder (FP32): avg rank=4.48, Wins=17, Losses=15
  AutoEncoder (INT8): avg rank=5.27, Wins=12, Losses=21
  FCNN (FP32): avg rank=5.39, Wins=10, Losses=22
  FCNN (INT8): avg rank=5.45, Wins=7, Losses=21
  SVM: avg rank=8.00, Wins=0, Losses=42

Per-metric best model (mean ± sd; av

In [16]:
# =============================
# Q1: WHICH MODEL IS BEST?
# =============================
print("\n[Q1] WHICH MODEL IS BEST? (Friedman + Ranks + Post-hoc)")

# Friedman results summary (split at alpha = 0.05)
sig_friedman = friedman_df[friedman_df['p_value'] < 0.05]
nonsig_friedman = friedman_df[friedman_df['p_value'] >= 0.05]

if sig_friedman.empty:
    print("Friedman test: No significant differences between models.")
else:
    print(f"Friedman test: Significant differences detected in "
          f"{len(sig_friedman)}/{len(friedman_df)} metrics (p < 0.05).")

print("\nPer-metric significance:")
for _, row in friedman_df.iterrows():
    if row['p_value'] < 0.05:
        sig_flag = "✔"
    else:
        sig_flag = "✘"
    print(f"  {sig_flag} {row['metric']}: χ²={row['chi2']:.3f}, p={row['p_value']:.4g}")

# Global leaderboard
print("\nGlobal Leaderboard (avg rank across all metrics):")
if global_table.empty:
    print("  (no leaderboard)")
else:
    ordered = global_table.sort_values("Global Avg Rank (↓ better)")
    for _, r in ordered.iterrows():
        print(f"  {r['Model']}: rank={r['Global Avg Rank (↓ better)']:.2f}, "
              f"Wins={int(r['Wins (Holm<0.05)'])}, Losses={int(r['Losses'])}")

# Per-metric best model
print("\nBest model per metric (mean ± sd; avg rank):")
if not per_metric_summaries:
    print("  (no per-metric bests)")
else:
    for metric_df in per_metric_summaries:
        # Summaries are sorted best-first, so the first row is the winner.
        top = metric_df.iloc[0]
        print(f"  - {top['Metric']}: {top['Model']} "
              f"({top['Mean']:.4f} ± {top['SD']:.4f}; rank={top['Avg Rank']:.2f})")

# Post-hoc top pairs (directional)
print("\nTop Post-hoc Comparisons (Holm-corrected p, smallest first):")
if 'posthoc_sig' in locals() and not posthoc_sig.empty:
    show = posthoc_sig.sort_values(["p_holm", "metric"]).head(10)
    for _, row in show.iterrows():
        # Direction: sign of median_diff when available, otherwise sign of the effect size.
        has_median = 'median_diff' in row and pd.notnull(row['median_diff'])
        direction = row['median_diff'] if has_median else row.get('effect', 0)
        if direction > 0:
            better, worse = row['model1'], row['model2']
        else:
            better, worse = row['model2'], row['model1']

        if 'effect' in row and pd.notnull(row['effect']):
            eff_val = row['effect']
        else:
            eff_val = np.nan
        print(f"  {row['metric']}: {better} > {worse} (p={row['p_holm']:.2e}, effect={abs(eff_val):.3f})")
else:
    print("  (no significant post-hoc pairs)")

# ---------- Separator ----------
print("\n" + "-"*70)

# =============================
# Q2: DOES INT8 QUANTIZATION AFFECT PERFORMANCE?
# =============================
print("[Q2] DOES INT8 QUANTIZATION AFFECT PERFORMANCE?")

# Split at alpha = 0.05; rows whose p-value is NaN land in neither subset.
sig_quant = quant_df[quant_df['p_value'] < 0.05]
nonsig_quant = quant_df[quant_df['p_value'] >= 0.05]

# Summary
if sig_quant.empty:
    print("No significant quantization effects detected.")
else:
    print(f"Quantization effect detected in {len(sig_quant)}/{len(quant_df)} cases.")

# Per-architecture rows, selected by the architecture name appearing in 'pair'.
fcnn_results = quant_df[quant_df['pair'].str.contains("FCNN")]
ae_results = quant_df[quant_df['pair'].str.contains("AutoEncoder")]

# FCNN summary, then AutoEncoder summary (same layout for both)
for arch, arch_rows in (("FCNN", fcnn_results), ("AutoEncoder", ae_results)):
    print(f"\n{arch} (FP32 vs INT8):")
    if arch_rows.empty:
        print(f"  (no {arch} comparison rows)")
        continue
    for _, row in arch_rows.iterrows():
        sig_flag = "✔" if row['p_value'] < 0.05 else "✘"
        print(f"  {sig_flag} {row['metric']}: {row['test']}, "
              f"p={row['p_value']:.4g}, effect={row['effect']:.3f}")

# Overall interpretation
# print("\nInterpretation:")
# print("  - FCNN: Performance consistently drops with INT8 quantization (most or all metrics significant).")
# print("  - AutoEncoder: Mixed impact — typically Accuracy, Precision, and AUC show significant drops; "
#       "Balanced Accuracy, Macro F1, and Recall often remain non-significant.")



[Q1] WHICH MODEL IS BEST? (Friedman + Ranks + Post-hoc)
Friedman test: Significant differences detected in 6/6 metrics (p < 0.05).

Per-metric significance:
  ✔ Accuracy: χ²=206.611, p=4.563e-41
  ✔ Balanced Accuracy: χ²=181.322, p=1.024e-35
  ✔ Macro F1: χ²=181.889, p=7.774e-36
  ✔ Precision (Macro): χ²=163.567, p=5.693e-32
  ✔ ROC AUC: χ²=173.122, p=5.512e-34
  ✔ Recall (Macro): χ²=181.322, p=1.024e-35

Global Leaderboard (avg rank across all metrics):
  XGBoost: rank=1.95, Wins=35, Losses=4
  LightGBM: rank=2.64, Wins=30, Losses=7
  Random Forest: rank=2.81, Wins=31, Losses=10
  AutoEncoder (FP32): rank=4.48, Wins=17, Losses=15
  AutoEncoder (INT8): rank=5.27, Wins=12, Losses=21
  FCNN (FP32): rank=5.39, Wins=10, Losses=22
  FCNN (INT8): rank=5.45, Wins=7, Losses=21
  SVM: rank=8.00, Wins=0, Losses=42

Best model per metric (mean ± sd; avg rank):
  - Accuracy: XGBoost (0.9988 ± 0.0001; rank=1.00)
  - Balanced Accuracy: XGBoost (0.9787 ± 0.0032; rank=1.00)
  - Macro F1: Random Fores