# Confound Decomposition & Formal Hypothesis Assessment of Walk Resolution Limit

This notebook demonstrates the evaluation that synthesizes 3 iteration-2 experiments (~16,100 graphs) through 5 analyses:

1. **Discrepancy Resolution** — model-free vs MLP-proxy gap correlation
2. **Partial Correlations** — controlling for confounds with bootstrap CIs
3. **Sparsity-Stratified Analysis** — testing hypothesis within sparse/dense subsets
4. **AUC-based Predictor Comparison** — SRI vs alternatives for binary targets
5. **Formal Hypothesis Assessment** — Bayes factors and decision tree

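The evaluation uses per-graph metrics (SRI, Vandermonde condition number, SRWE-RWSE L1 distance) to assess whether the walk resolution limit hypothesis is confirmed. In this demo, Analyses 2 and 4 are recomputed on a small per-dataset subset, Analyses 1 and 5 are read from the pre-computed results of the full evaluation, and Analysis 3 is not re-run.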

In [1]:
import subprocess, sys
def _pip(*packages):
    """Quietly pip-install the given requirement specifiers with the running interpreter."""
    command = [sys.executable, '-m', 'pip', 'install', '-q']
    command.extend(packages)
    subprocess.check_call(command)

# Packages NOT pre-installed on Colab (always install everywhere)
_pip('pingouin==0.5.5')

# Core packages (pre-installed on Colab, install locally to match Colab env)
running_on_colab = 'google.colab' in sys.modules
if not running_on_colab:
    _pip(
        'numpy==2.0.2',
        'pandas==2.2.2',
        'scikit-learn==1.6.1',
        'scipy==1.16.3',
        'matplotlib==3.10.0',
        'seaborn==0.13.2',
    )


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


## Imports

In [2]:
import json
import math
import warnings
import time

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import roc_auc_score
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

## Data Loading

Load the evaluation output (pre-computed from 3 experiments, ~16,100 graphs). The data includes per-graph metrics and pre-computed analysis results.

In [3]:
GITHUB_DATA_URL = "https://raw.githubusercontent.com/AMGrobelnik/ai-invention-ace67e-the-walk-resolution-limit-a-super-resolu/main/evaluation_iter3_confound_decomp/demo/mini_demo_data.json"
import json, os

def load_data():
    """Load the demo evaluation data as a parsed JSON object.

    The remote copy at ``GITHUB_DATA_URL`` is tried first. If that fails for
    any reason, a ``mini_demo_data.json`` file in the current working
    directory is used instead.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: if the remote download failed and no local
            ``mini_demo_data.json`` exists. The remote failure is attached
            as the exception's ``__cause__``.
    """
    import urllib.request

    remote_error = None
    try:
        # A timeout keeps the cell from hanging indefinitely when offline.
        with urllib.request.urlopen(GITHUB_DATA_URL, timeout=30) as response:
            return json.loads(response.read().decode())
    except Exception as exc:
        # Deliberate best-effort: any remote failure falls through to the
        # local file, but the reason is reported instead of being hidden.
        remote_error = exc
        print(f"Remote load failed ({type(exc).__name__}: {exc}); trying local mini_demo_data.json")

    if os.path.exists("mini_demo_data.json"):
        with open("mini_demo_data.json") as f:
            return json.load(f)
    raise FileNotFoundError("Could not load mini_demo_data.json") from remote_error

In [4]:
data = load_data()
meta = data['metadata']
# Headline facts recorded by the full evaluation run.
print(f"Evaluation: {meta['evaluation_name']}")
print(f"Overall verdict: {meta['overall_verdict']}")
print(f"Total graphs (full run): {meta['n_total_graphs']}")
# What the bundled demo subset contains.
demo_sets = data['datasets']
print(f"Datasets in demo: {[entry['dataset'] for entry in demo_sets]}")
print(f"Examples per dataset: {[len(entry['examples']) for entry in demo_sets]}")

Evaluation: Confound Decomposition & Formal Hypothesis Assessment of Walk Resolution Limit
Overall verdict: confirmed
Total graphs (full run): 16100
Datasets in demo: ['Peptides-func', 'Peptides-struct', 'Synthetic-aliased-pairs', 'ZINC-subset']
Examples per dataset: [25, 25, 25, 25]


## Configuration

Tunable parameters for the analysis. Bootstrap iterations control CI precision; `MAX_EXAMPLES` limits per-dataset examples.

In [5]:
# ---- Tunable Parameters ----
# Bootstrap iteration counts trade confidence-interval precision against runtime.
BOOTSTRAP_N = 1000         # Bootstrap iterations for the raw and partial Spearman CIs (default n_boot of both helpers)
BOOTSTRAP_AUC_N = 500      # Number of bootstrap iterations for AUC confidence intervals
# Single seeded generator shared by every bootstrap loop in this notebook. Because the
# random stream is shared, the exact CI values depend on the order the analysis cells run in.
RNG = np.random.RandomState(42)
MAX_EXAMPLES = None        # Per-dataset limit (None = use all available)

## Build Per-Graph DataFrame

Reconstruct per-graph data from the evaluation output examples and compute the derived features used below: log-scaled SRI, SRI normalized by node count, and an integer aliased flag.

In [6]:
def build_df_from_eval_output(data, max_examples=None):
    """Build a per-graph DataFrame from evaluation output examples.

    Args:
        data: Parsed evaluation output. ``data["datasets"]`` is a list of
            entries with a ``"dataset"`` name and an ``"examples"`` list; each
            example carries JSON-encoded ``"input"`` and ``"output"`` strings
            plus ``predict_*`` / ``eval_*`` fields.
        max_examples: Optional per-dataset row cap. When ``None``, the
            notebook-level ``MAX_EXAMPLES`` setting is used if it is defined;
            if that is also ``None`` (or undefined) all rows are kept.

    Returns:
        DataFrame with one row per example: the raw fields plus the derived
        columns ``log_vander_k20`` and ``normalized_sri``, with ``is_aliased``
        cast to int. Empty input yields an empty frame with the same columns.
    """
    if max_examples is None:
        # Fall back to the notebook-wide setting so existing one-argument
        # calls behave exactly as before.
        max_examples = globals().get("MAX_EXAMPLES")

    base_columns = [
        "dataset", "graph_idx", "num_nodes", "sri_k20", "delta_min",
        "is_aliased", "resolution_diagnosis", "srwe_rwse_l1", "vander_cond_log10",
    ]

    rows = []
    for ds_entry in data["datasets"]:
        ds_name = ds_entry["dataset"]
        for ex in ds_entry["examples"]:
            inp = json.loads(ex["input"])
            out = json.loads(ex["output"])
            rows.append({
                "dataset": ds_name,
                "graph_idx": inp.get("graph_idx", 0),
                "num_nodes": inp.get("num_nodes", 0),
                "sri_k20": out.get("sri_k20", 0.0),
                "delta_min": out.get("delta_min", 0.0),
                "is_aliased": out.get("is_aliased", 0),
                "resolution_diagnosis": ex.get("predict_resolution_diagnosis", "unknown"),
                "srwe_rwse_l1": ex.get("eval_srwe_rwse_l1", 0.0),
                "vander_cond_log10": ex.get("eval_vander_cond_log10", 0.0),
            })

    # Naming the columns explicitly keeps the schema intact when there are no
    # examples; otherwise the derived-column code below fails with KeyError.
    df = pd.DataFrame(rows, columns=base_columns)

    if max_examples is not None:
        # groupby.head keeps the original row order within and across datasets.
        df = df.groupby("dataset", sort=False).head(max_examples).reset_index(drop=True)

    # Compute derived fields
    nn = df["num_nodes"].astype(float)
    sri = df["sri_k20"].astype(float)
    # NOTE(review): despite its name, this column is log10 of the (clipped) SRI,
    # not of a Vandermonde condition number -- confirm this is intended.
    df["log_vander_k20"] = np.log10(sri.clip(lower=1e-10))
    # Clip the node count at 1 so zero-node rows do not divide by zero.
    df["normalized_sri"] = sri / nn.clip(lower=1)
    df["is_aliased"] = df["is_aliased"].astype(int)

    print(f"DataFrame: {len(df)} rows, {len(df.columns)} columns")
    print(f"Datasets: {df['dataset'].value_counts().to_dict()}")
    print(f"Aliased fraction: {df['is_aliased'].mean():.3f}")
    return df

# Per-graph frame for the demo subset; the Analysis 2, Analysis 4 and
# visualization cells below all read from this `df`.
df = build_df_from_eval_output(data)

DataFrame: 100 rows, 11 columns
Datasets: {'Peptides-func': 25, 'Peptides-struct': 25, 'Synthetic-aliased-pairs': 25, 'ZINC-subset': 25}
Aliased fraction: 0.610


## Analysis 1: Discrepancy Resolution

Resolve discrepancy between model-free and MLP proxy gap measures using pre-computed aggregate results from exp1 metadata.

In [7]:
# Analysis 1 is not recomputed here: the numbers come straight from the evaluation output
analysis1 = data["metadata"]["analyses"]["analysis_1_discrepancy_resolution"]

ds_names = ["ZINC-subset", "Peptides-func", "Peptides-struct", "Synthetic-aliased-pairs"]
print("=== Analysis 1: Discrepancy Resolution ===")
for ds_name in ds_names:
    entry = analysis1.get(ds_name, {})
    # Missing fields are shown as "N/A" rather than raising.
    model_free = entry.get("model_free_rho", "N/A")
    mlp_proxy = entry.get("mlp_proxy_rho", "N/A")
    gap = entry.get("discrepancy_abs_rho", "N/A")
    print(f"  {ds_name}: model_free_rho={model_free}, mlp_rho={mlp_proxy}, discrepancy={gap}")

cross_corr = analysis1.get('model_free_vs_mlp_gap_correlation', 'N/A')
mlp_cv = analysis1.get('mlp_gap_coefficient_of_variation', 'N/A')
print(f"\n  Cross-dataset model_free vs mlp correlation: {cross_corr}")
print(f"  MLP gap CV: {mlp_cv}")

=== Analysis 1: Discrepancy Resolution ===
  ZINC-subset: model_free_rho=0.243298, mlp_rho=0.102401, discrepancy=0.140897
  Peptides-func: model_free_rho=0.750775, mlp_rho=-0.138508, discrepancy=0.612267
  Peptides-struct: model_free_rho=0.750775, mlp_rho=0.067221, discrepancy=0.683555
  Synthetic-aliased-pairs: model_free_rho=0.667619, mlp_rho=None, discrepancy=None

  Cross-dataset model_free vs mlp correlation: -0.866025
  MLP gap CV: 1.3938


## Analysis 2: Partial Correlations with Bootstrap CIs

Compute Spearman correlations between SRI and encoding quality proxies (SRWE-RWSE L1 distance and Vandermonde condition number), controlling for confounders (num_nodes, density). In this demo, only `num_nodes` is used as a covariate.

In [8]:
def bootstrap_spearman(x, y, n_boot=BOOTSTRAP_N):
    """Spearman rho between x and y with a percentile-bootstrap 95% CI.

    Pairs with a non-finite value in either array are dropped first. With
    fewer than 10 remaining pairs every statistic is NaN. Returns a dict with
    keys ``rho``, ``p``, ``ci_low``, ``ci_high`` and ``n``.
    """
    finite_pairs = np.isfinite(x) & np.isfinite(y)
    x = x[finite_pairs]
    y = y[finite_pairs]
    n = len(x)
    if n < 10:
        return {"rho": np.nan, "p": np.nan, "ci_low": np.nan, "ci_high": np.nan, "n": n}

    rho, p = stats.spearmanr(x, y)

    # Resample pairs with replacement using the shared module-level RNG.
    resampled = np.empty(n_boot)
    for b in range(n_boot):
        pick = RNG.randint(0, n, n)
        try:
            rho_b, _ = stats.spearmanr(x[pick], y[pick])
            resampled[b] = rho_b
        except Exception:
            resampled[b] = np.nan

    # Undefined resamples (NaN) do not contribute to the interval.
    resampled = resampled[np.isfinite(resampled)]
    if len(resampled) > 0:
        ci_low = float(np.percentile(resampled, 2.5))
        ci_high = float(np.percentile(resampled, 97.5))
    else:
        ci_low = np.nan
        ci_high = np.nan
    return {"rho": float(rho), "p": float(p), "ci_low": ci_low, "ci_high": ci_high, "n": n}


def partial_spearman(df_sub, x_col, y_col, covariates, n_boot=BOOTSTRAP_N):
    """Partial Spearman correlation of x_col and y_col given covariates, with a bootstrap CI.

    Rows with a missing value in any involved column are dropped. With fewer
    than 10 complete rows, or if the point estimate cannot be computed, every
    statistic is NaN. Returns a dict with keys ``rho``, ``p``, ``ci_low``,
    ``ci_high`` and ``n``.
    """
    complete = df_sub[[x_col, y_col] + covariates].dropna()
    n = len(complete)

    def _undefined():
        return {"rho": np.nan, "p": np.nan, "ci_low": np.nan, "ci_high": np.nan, "n": n}

    if n < 10:
        return _undefined()

    # Point estimate on the full set of complete rows.
    try:
        table = pg.partial_corr(data=complete, x=x_col, y=y_col,
                                covar=covariates, method="spearman")
        rho = float(table["r"].values[0])
        p_val = float(table["p-val"].values[0])
    except Exception:
        return _undefined()

    # Percentile bootstrap over rows, drawing from the shared module-level RNG.
    resampled = np.empty(n_boot)
    for b in range(n_boot):
        pick = RNG.randint(0, n, n)
        try:
            frame_b = complete.iloc[pick].reset_index(drop=True)
            table_b = pg.partial_corr(data=frame_b, x=x_col, y=y_col,
                                      covar=covariates, method="spearman")
            resampled[b] = float(table_b["r"].values[0])
        except Exception:
            resampled[b] = np.nan

    resampled = resampled[np.isfinite(resampled)]
    if len(resampled) > 0:
        ci_low = float(np.percentile(resampled, 2.5))
        ci_high = float(np.percentile(resampled, 97.5))
    else:
        ci_low = np.nan
        ci_high = np.nan
    return {"rho": rho, "p": p_val, "ci_low": ci_low, "ci_high": ci_high, "n": n}


def _fmt_rho(val):
    """Format rho value for display."""
    if isinstance(val, float) and np.isnan(val):
        return "NaN"
    return f"{val:.4f}"


# Analysis 2 on the demo subset: raw and size-controlled Spearman per dataset and proxy
print("=== Analysis 2: Partial Correlations (demo subset) ===")
t0 = time.time()
analysis2_demo = {}
proxy_cols = ["srwe_rwse_l1", "vander_cond_log10"]
numeric_cols = ["sri_k20"] + proxy_cols + ["num_nodes"]

for ds_name in sorted(df["dataset"].unique()):
    ds_df = df[df["dataset"] == ds_name].copy()
    # Coerce the analysis columns to numeric; unparseable entries become NaN.
    for col in numeric_cols:
        if col in ds_df.columns:
            ds_df[col] = pd.to_numeric(ds_df[col], errors="coerce")

    sri_values = ds_df["sri_k20"].values.astype(float)
    ds_results = {}
    for gap_col in proxy_cols:
        # Raw Spearman first, then the partial correlation controlling for num_nodes
        # (this order also fixes how the shared RNG stream is consumed).
        raw = bootstrap_spearman(sri_values, ds_df[gap_col].values.astype(float))
        partial = partial_spearman(ds_df, "sri_k20", gap_col, ["num_nodes"], n_boot=BOOTSTRAP_N)

        print(f"  {ds_name} [{gap_col}]: raw_rho={_fmt_rho(raw['rho'])}, "
              f"partial_rho={_fmt_rho(partial['rho'])}, n={raw['n']}")
        ds_results[gap_col] = {"raw": raw, "ctrl_num_nodes": partial}
    analysis2_demo[ds_name] = ds_results

print(f"  Time: {time.time() - t0:.1f}s")

=== Analysis 2: Partial Correlations (demo subset) ===


  Peptides-func [srwe_rwse_l1]: raw_rho=-0.3917, partial_rho=0.1815, n=25


  Peptides-func [vander_cond_log10]: raw_rho=0.3932, partial_rho=-0.4612, n=25


  Peptides-struct [srwe_rwse_l1]: raw_rho=-0.3917, partial_rho=0.1815, n=25


  Peptides-struct [vander_cond_log10]: raw_rho=0.3932, partial_rho=-0.4612, n=25


  Synthetic-aliased-pairs [srwe_rwse_l1]: raw_rho=0.5718, partial_rho=0.0806, n=25


  Synthetic-aliased-pairs [vander_cond_log10]: raw_rho=0.3327, partial_rho=0.3514, n=25


  ZINC-subset [srwe_rwse_l1]: raw_rho=0.6577, partial_rho=0.5570, n=25


  ZINC-subset [vander_cond_log10]: raw_rho=0.3728, partial_rho=-0.2594, n=25
  Time: 9.0s


## Analysis 4: AUC-based Predictor Comparison

Compare SRI against alternative predictors using AUC for binary classification targets (is_aliased, high Vandermonde condition). This demo evaluates the `is_aliased` target only.

In [9]:
def compute_auc_with_bootstrap(y_true, scores, n_boot=BOOTSTRAP_AUC_N):
    """ROC AUC of scores against y_true with a percentile-bootstrap 95% CI.

    The AUC is evaluated for both the scores and their negation; the larger
    one is reported, with ``direction`` recording which orientation won. The
    same orientation is used for every bootstrap resample. Returns a dict with
    keys ``auc``, ``ci_low``, ``ci_high``, ``n`` and ``direction``; the numeric
    fields are NaN when fewer than 10 finite pairs or only one class remain.
    """
    finite_pairs = np.isfinite(y_true) & np.isfinite(scores)
    y_true = y_true[finite_pairs].astype(float)
    scores = scores[finite_pairs].astype(float)
    n = len(y_true)

    undefined = {"auc": np.nan, "ci_low": np.nan, "ci_high": np.nan, "n": n, "direction": "N/A"}
    if n < 10 or len(np.unique(y_true)) < 2:
        return undefined

    try:
        auc_pos = roc_auc_score(y_true, scores)
        auc_neg = roc_auc_score(y_true, -scores)
    except ValueError:
        return undefined

    # Keep whichever orientation separates the classes better (ties keep the original sign).
    flipped = auc_neg > auc_pos
    auc = auc_neg if flipped else auc_pos
    direction = "negative" if flipped else "positive"
    use_scores = -scores if flipped else scores

    # Start from NaN so single-class or failed resamples simply stay NaN.
    resampled = np.full(n_boot, np.nan)
    for b in range(n_boot):
        pick = RNG.randint(0, n, n)
        y_b, s_b = y_true[pick], use_scores[pick]
        if len(np.unique(y_b)) >= 2:
            try:
                resampled[b] = roc_auc_score(y_b, s_b)
            except ValueError:
                resampled[b] = np.nan

    resampled = resampled[np.isfinite(resampled)]
    if len(resampled) > 0:
        ci_low = float(np.percentile(resampled, 2.5))
        ci_high = float(np.percentile(resampled, 97.5))
    else:
        ci_low = np.nan
        ci_high = np.nan
    return {"auc": float(auc), "ci_low": ci_low, "ci_high": ci_high, "n": n, "direction": direction}


# Analysis 4 on the demo subset: how well does each predictor rank aliased graphs?
print("=== Analysis 4: Predictor Comparison (demo subset) ===")
t0 = time.time()
# Display name -> DataFrame column used as the score
predictor_specs = {
    "SRI": "sri_k20",
    "log_Vandermonde_kappa": "log_vander_k20",
    "delta_min": "delta_min",
    "normalized_SRI": "normalized_sri",
    "num_nodes": "num_nodes",
    "vander_cond_log10": "vander_cond_log10",
}

analysis4_demo = {"is_aliased": {}}
for ds_name in sorted(df["dataset"].unique()):
    ds_df = df[df["dataset"] == ds_name]
    y_true = ds_df["is_aliased"].values.astype(float)

    # AUC is undefined when only one class is present in the dataset.
    n_classes = len(np.unique(y_true[np.isfinite(y_true)]))
    if n_classes < 2:
        print(f"  {ds_name}: Only one class, skipping")
        analysis4_demo["is_aliased"][ds_name] = {pred: {"auc": np.nan} for pred in predictor_specs}
        continue

    ds_pred_results = {}
    for pred_name, pred_col in predictor_specs.items():
        # A predictor needs its column and at least 10 non-null values.
        usable = pred_col in ds_df.columns and ds_df[pred_col].notna().sum() >= 10
        if usable:
            scores = ds_df[pred_col].values.astype(float)
            auc_res = compute_auc_with_bootstrap(y_true, scores, n_boot=BOOTSTRAP_AUC_N)
            if not np.isnan(auc_res["auc"]):
                print(f"  {ds_name} | {pred_name}: AUC={auc_res['auc']:.4f} dir={auc_res['direction']}")
        else:
            auc_res = {"auc": np.nan}
        ds_pred_results[pred_name] = auc_res
    analysis4_demo["is_aliased"][ds_name] = ds_pred_results

print(f"  Time: {time.time() - t0:.1f}s")

=== Analysis 4: Predictor Comparison (demo subset) ===
  Peptides-func | SRI: AUC=1.0000 dir=negative


  Peptides-func | log_Vandermonde_kappa: AUC=1.0000 dir=negative
  Peptides-func | delta_min: AUC=1.0000 dir=negative


  Peptides-func | normalized_SRI: AUC=1.0000 dir=negative
  Peptides-func | num_nodes: AUC=1.0000 dir=positive


  Peptides-func | vander_cond_log10: AUC=1.0000 dir=negative
  Peptides-struct | SRI: AUC=1.0000 dir=negative


  Peptides-struct | log_Vandermonde_kappa: AUC=1.0000 dir=negative
  Peptides-struct | delta_min: AUC=1.0000 dir=negative


  Peptides-struct | normalized_SRI: AUC=1.0000 dir=negative
  Peptides-struct | num_nodes: AUC=1.0000 dir=positive


  Peptides-struct | vander_cond_log10: AUC=1.0000 dir=negative
  Synthetic-aliased-pairs: Only one class, skipping
  ZINC-subset | SRI: AUC=1.0000 dir=negative


  ZINC-subset | log_Vandermonde_kappa: AUC=1.0000 dir=negative


  ZINC-subset | delta_min: AUC=1.0000 dir=negative


  ZINC-subset | normalized_SRI: AUC=0.9936 dir=negative


  ZINC-subset | num_nodes: AUC=0.8462 dir=positive
  ZINC-subset | vander_cond_log10: AUC=0.6795 dir=negative
  Time: 2.8s


## Analysis 5: Formal Hypothesis Assessment

Assess the walk resolution limit hypothesis using three criteria:
- **C1**: |rho(SRI, gap)| > 0.5 in model-free analysis
- **C2**: AUC > 0.65 for SRI predicting aliased status
- **C3**: SRWE gap reduction >= 50%

In [10]:
# Analysis 5 is not recomputed in this demo: the criteria and verdict are read
# from the pre-computed results of the full evaluation. `analysis5` is also
# read again by the verdict-summary figure in the visualization cell.
analysis5 = data["metadata"]["analyses"]["analysis_5_formal_assessment"]

def interpret_bf(bf):
    """Map a Bayes factor to a qualitative evidence label.

    Args:
        bf: Bayes factor as a real number, or ``None``.

    Returns:
        ``"N/A"`` when ``bf`` is ``None``, NaN or infinite; otherwise one of
        ``"extreme_for_H1"`` (> 100), ``"very_strong_for_H1"`` (> 30),
        ``"strong_for_H1"`` (> 10), ``"moderate_for_H1"`` (> 3),
        ``"anecdotal_for_H1"`` (> 1), ``"anecdotal_for_H0"`` (> 1/3),
        ``"moderate_for_H0"`` (> 1/10) or ``"strong_for_H0"`` (everything else).
    """
    # np.isfinite also catches NaN/inf held in non-builtin numeric types such
    # as np.float32. An isinstance(bf, float) guard misses those, so a NaN
    # would fail every comparison below and be labelled "strong_for_H0".
    if bf is None or not np.isfinite(bf):
        return "N/A"

    # Ordered from the strongest evidence for H1 downwards; first match wins.
    bands = (
        (100, "extreme_for_H1"),
        (30, "very_strong_for_H1"),
        (10, "strong_for_H1"),
        (3, "moderate_for_H1"),
        (1, "anecdotal_for_H1"),
        (1/3, "anecdotal_for_H0"),
        (1/10, "moderate_for_H0"),
    )
    for lower_bound, label in bands:
        if bf > lower_bound:
            return label
    return "strong_for_H0"

print("=== Analysis 5: Formal Hypothesis Assessment ===")
c1 = analysis5["criteria"]["criterion_1_correlation"]
c2 = analysis5["criteria"]["criterion_2_auc"]
c3 = analysis5["criteria"]["criterion_3_srwe"]

print("\nCriterion 1 — Correlation (|rho| > 0.5):")
for ds_name in ds_names:
    d = c1.get(ds_name, {})
    # Format only real numbers: applying ":.4f" to a missing or null entry
    # (or to an "N/A" fallback string) would raise instead of printing N/A.
    rho = d.get('model_free_raw_rho')
    rho_text = f"{rho:.4f}" if isinstance(rho, (int, float)) else "N/A"
    print(f"  {ds_name}: model_free_rho={rho_text}, "
          f"pass={d.get('model_free_pass', False)}")

print(f"\nCriterion 2 — AUC (> 0.65):")
for ds_name in ds_names:
    d = c2.get(ds_name, {})
    print(f"  {ds_name}: SRI AUC={d.get('sri_auc', 'N/A')}, pass={d.get('pass', False)}")

print(f"\nCriterion 3 — SRWE gap reduction:")
print(f"  RWSE MAE={c3.get('rwse_mae', 'N/A')}, LapPE MAE={c3.get('lape_mae', 'N/A')}, "
      f"SRWE MAE={c3.get('srwe_mae', 'N/A')}")
# A missing key keeps the 0 default; only a null or non-numeric value prints N/A.
gap_pct = c3.get('srwe_gap_reduction_pct', 0)
gap_text = f"{gap_pct:.1f}%" if isinstance(gap_pct, (int, float)) else "N/A"
print(f"  Gap reduction: {gap_text}, pass={c3.get('pass', False)}")

print(f"\n{'='*50}")
print(f"OVERALL VERDICT: {analysis5.get('overall_verdict', 'N/A').upper()}")
# `vd` is read again by the verdict-summary figure in the visualization cell.
vd = analysis5.get("verdict_details", {})
print(f"  C1 raw passes: {vd.get('c1_raw_passes', 'N/A')}")
print(f"  C2 AUC passes: {vd.get('c2_auc_passes', 'N/A')}")
print(f"  C3 SRWE pass: {vd.get('c3_srwe_pass', 'N/A')}")

=== Analysis 5: Formal Hypothesis Assessment ===

Criterion 1 — Correlation (|rho| > 0.5):
  ZINC-subset: model_free_rho=0.2433, pass=False
  Peptides-func: model_free_rho=0.7508, pass=True
  Peptides-struct: model_free_rho=0.7508, pass=True
  Synthetic-aliased-pairs: model_free_rho=0.6676, pass=True

Criterion 2 — AUC (> 0.65):
  ZINC-subset: SRI AUC=1.0, pass=True
  Peptides-func: SRI AUC=1.0, pass=True
  Peptides-struct: SRI AUC=1.0, pass=True
  Synthetic-aliased-pairs: SRI AUC=1.0, pass=True

Criterion 3 — SRWE gap reduction:
  RWSE MAE=1.213264, LapPE MAE=1.346538, SRWE MAE=1.273452
  Gap reduction: 54.8%, pass=True

OVERALL VERDICT: CONFIRMED
  C1 raw passes: 3/4
  C2 AUC passes: 4/4
  C3 SRWE pass: True


## Visualization

Generate publication-quality figures: SRI vs proxy scatter plots, predictor comparison (AUC bar chart), and verdict summary.

In [11]:
sns.set_style("whitegrid")
plt.rcParams.update({"font.size": 10, "axes.titlesize": 12, "figure.dpi": 100})

# NOTE(review): the imports cell selects the non-interactive "Agg" backend, under
# which plt.show() presumably does not render figures inline -- confirm intended.
datasets = sorted(df["dataset"].unique())

# ---- Figure 1: SRI vs proxy scatter (Vandermonde condition) ----
# Pre-computed correlations from the full analysis; the lookup does not depend
# on the dataset, so it is done once rather than per panel.
a2_full = data["metadata"]["analyses"]["analysis_2_partial_correlations"]
n_ds = min(len(datasets), 4)
# plt.subplots(1, 0) raises, so skip the figure when there are no datasets
# (the same guard the predictor-comparison figure uses).
if n_ds > 0:
    fig, axes = plt.subplots(1, n_ds, figsize=(4 * n_ds, 4), squeeze=False)
    for i, ds_name in enumerate(datasets[:4]):
        ax = axes[0, i]
        ds_df = df[df["dataset"] == ds_name].dropna(subset=["sri_k20", "vander_cond_log10"])
        if len(ds_df) == 0:
            ax.text(0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes)
            ax.set_title(ds_name)
            continue
        ax.scatter(ds_df["sri_k20"].astype(float), ds_df["vander_cond_log10"].astype(float),
                  alpha=0.6, s=20, c="darkorange")
        ax.set_xlabel("SRI (K=20)")
        ax.set_ylabel("log\u2081\u2080(Vandermonde \u03ba)")

        # Annotate with the full-analysis rho only when it is a real, non-NaN number;
        # a missing, null or non-numeric entry leaves the annotation blank.
        rho = a2_full.get(ds_name, {}).get("vander_cond_log10", {}).get("raw", {}).get("rho", np.nan)
        has_rho = isinstance(rho, (int, float)) and not np.isnan(rho)
        rho_str = f"\u03c1={rho:.3f}" if has_rho else ""
        ax.set_title(f"{ds_name}\n{rho_str}")

    plt.suptitle("SRI vs Vandermonde Condition Number", fontsize=12, y=1.02)
    plt.tight_layout()
    plt.show()

# ---- Figure 2: Predictor Comparison AUC bar chart (using full analysis results) ----
a4_full = data["metadata"]["analyses"]["analysis_4_predictor_comparison"]["is_aliased"]
valid_ds = [ds for ds in datasets if ds in a4_full]
n_ds = min(len(valid_ds), 4)
if n_ds > 0:
    fig, axes = plt.subplots(1, n_ds, figsize=(5 * n_ds, 5), squeeze=False)
    for i, ds_name in enumerate(valid_ds[:4]):
        ax = axes[0, i]
        preds = a4_full[ds_name]
        names, aucs, errs_lo, errs_hi = [], [], [], []
        for pred_name, pred_res in sorted(preds.items()):
            auc = pred_res.get("auc")
            if auc is None or (isinstance(auc, float) and np.isnan(auc)):
                continue
            names.append(pred_name)
            aucs.append(auc)
            # A missing or null bound collapses onto the point estimate (zero-length bar).
            ci_lo = pred_res.get("ci_low", auc)
            ci_hi = pred_res.get("ci_high", auc)
            ci_lo = auc if ci_lo is None else ci_lo
            ci_hi = auc if ci_hi is None else ci_hi
            # Error-bar lengths must be non-negative for matplotlib; a bootstrap
            # interval that does not bracket the point AUC is clamped to 0.
            errs_lo.append(max(0.0, auc - ci_lo))
            errs_hi.append(max(0.0, ci_hi - auc))

        if len(names) > 0:
            # Highlight SRI (red) and the size baseline num_nodes (blue); others grey.
            c = ["#e74c3c" if n == "SRI" else "#3498db" if n == "num_nodes" else "#95a5a6" for n in names]
            y_pos = np.arange(len(names))
            ax.barh(y_pos, aucs, xerr=[errs_lo, errs_hi], color=c, capsize=3, alpha=0.8)
            ax.set_yticks(y_pos)
            ax.set_yticklabels(names, fontsize=8)
            # Reference lines: chance level (0.5) and the 0.65 threshold of Criterion 2.
            ax.axvline(0.5, color="gray", linestyle="--", alpha=0.5)
            ax.axvline(0.65, color="red", linestyle="--", alpha=0.3, label="AUC=0.65")
            ax.set_xlabel("AUC")
            ax.set_title(f"{ds_name}\n(target: is_aliased)")
            ax.set_xlim(0.3, 1.05)

    plt.suptitle("Predictor Comparison: AUC for 'is_aliased' (full evaluation)", fontsize=12)
    plt.tight_layout()
    plt.show()

# ---- Figure 3: Verdict Summary ----
fig, ax = plt.subplots(figsize=(10, 5))
ax.axis("off")
verdict = analysis5.get("overall_verdict", "N/A")
c3 = analysis5.get("criteria", {}).get("criterion_3_srwe", {})

# One entry per rendered line; empty strings produce the blank separator lines.
summary_lines = [
    f"OVERALL VERDICT: {verdict.upper()}",
    "",
    f"Criterion 1 (|\u03c1| > 0.5, model-free): {vd.get('c1_raw_passes', 'N/A')}",
    f"Criterion 1 (|\u03c1| > 0.5, size-ctrl):  {vd.get('c1_size_ctrl_passes', 'N/A')}",
    f"Criterion 2 (AUC > 0.65):             {vd.get('c2_auc_passes', 'N/A')}",
    f"Criterion 3 (SRWE gap \u2265 50%):         {vd.get('c3_srwe_pass', 'N/A')}",
    "",
    f"SRWE gap reduction: {c3.get('srwe_gap_reduction_pct', 0):.1f}%",
    f"(RWSE={c3.get('rwse_mae', 0):.4f}, LapPE={c3.get('lape_mae', 0):.4f}, SRWE={c3.get('srwe_mae', 0):.4f})",
]
text = "\n".join(summary_lines)

# Box colour encodes the verdict; unrecognised verdicts fall back to grey.
verdict_palette = {
    "confirmed": "#27ae60",
    "partially_confirmed": "#f39c12",
    "inconclusive": "#7f8c8d",
    "disconfirmed": "#e74c3c",
}
color = verdict_palette.get(verdict, "#7f8c8d")
box_style = dict(boxstyle="round,pad=0.5", facecolor=color, alpha=0.3)
ax.text(0.5, 0.5, text, transform=ax.transAxes, fontsize=11, ha="center", va="center",
        bbox=box_style, family="monospace")
plt.suptitle("Formal Hypothesis Assessment", fontsize=14)
plt.tight_layout()
plt.show()

# ---- Summary metrics table ----
# Aggregate metrics stored with the full evaluation, listed alphabetically by name.
print("\n=== Aggregate Metrics (from full evaluation) ===")
ma = data["metrics_agg"]
for key in sorted(ma):
    print(f"  {key}: {ma[key]}")


=== Aggregate Metrics (from full evaluation) ===
  c3_srwe_pass: 1.0
  mean_model_free_spearman_rho: 0.6031170823852569
  mean_partial_spearman_rho_srwe: 0.13797694923977913
  mean_partial_spearman_rho_vander: 0.07059574522802284
  mean_raw_spearman_rho_srwe: -0.064872214270833
  mean_raw_spearman_rho_vander: 0.353536589250702
  mean_sri_auc_is_aliased: 1.0
  mlp_gap_coefficient_of_variation: 1.3937998150779485
  model_free_vs_mlp_gap_correlation: -0.8660254037844387
  n_datasets_c1_model_free_pass: 3.0
  n_datasets_c2_auc_pass: 4.0
  srwe_gap_reduction_pct: 54.83890331197377
  verdict_score: 4.0
