# Bootstrap Effect Sizes with Failure Taxonomy — Evaluation Demo

This notebook demonstrates the evaluation pipeline for analyzing **Synergy-Guided Oblique Splits (SG-FIGS)** across 12 benchmark datasets. It implements 6 evaluation modules:

| Module | Description |
|--------|-------------|
| **A** | Bootstrap 95% CIs on pairwise accuracy diffs |
| **B** | Failure taxonomy (5 failure modes) |
| **C** | Split catalog with domain annotations |
| **D** | Synergy alignment (Spearman ρ) |
| **E** | Success criteria verdicts |
| **F** | Paper narrative synthesis |

- **Part 1** runs a quick demo on a 4-dataset subset with reduced bootstrap resamples.
- **Part 2** runs the full evaluation on all 12 datasets with original parameters.

In [None]:
import json
import math
from itertools import combinations

import numpy as np
from scipy import stats as scipy_stats
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [None]:
# Both demo payloads are served from the same raw.githubusercontent.com
# directory; only the file name differs.
_DEMO_DATA_BASE_URL = (
    "https://raw.githubusercontent.com/AMGrobelnik/"
    "ai-invention-ac2586-synergy-guided-oblique-splits-using-part/"
    "main/bootstrap_taxon/demo"
)
GITHUB_FULL_DATA_URL = f"{_DEMO_DATA_BASE_URL}/full_demo_data.json"
GITHUB_MINI_DATA_URL = f"{_DEMO_DATA_BASE_URL}/mini_demo_data.json"
import json, os

def _load_json(url, local_path):
    try:
        import urllib.request
        with urllib.request.urlopen(url) as response:
            return json.loads(response.read().decode())
    except Exception: pass
    if os.path.exists(local_path):
        with open(local_path) as f: return json.load(f)
    raise FileNotFoundError(f"Could not load {local_path}")

def load_mini():
    """Load the mini demo data.

    Tries ``GITHUB_MINI_DATA_URL`` first and falls back to
    ``mini_demo_data.json`` resolved against the current working directory.
    """
    return _load_json(url=GITHUB_MINI_DATA_URL, local_path="mini_demo_data.json")


def load_full():
    """Load the full demo data.

    Tries ``GITHUB_FULL_DATA_URL`` first and falls back to
    ``full_demo_data.json`` resolved against the current working directory.
    """
    return _load_json(url=GITHUB_FULL_DATA_URL, local_path="full_demo_data.json")

## Constants and Configuration

Methods compared, pairwise comparisons, and domain annotations for oblique split interpretation.

In [None]:
# ── Constants ────────────────────────────────────────────────────────────────
# Labels of the methods compared in this notebook.
METHODS = [
    "FIGS",
    "RO-FIGS",
    "SG-FIGS-10",
    "SG-FIGS-25",
    "SG-FIGS-50",
    "GradientBoosting",
]
TARGET_MAX_RULES = 10  # NOTE(review): not referenced by the modules visible here — confirm usage
RNG_SEED = 42  # seed for the bootstrap random generator in Module A

# The 6 key pairwise comparisons from the artifact proposal.
# Each entry is (method_a, method_b); Module A reports method_a minus method_b.
PAIRWISE_COMPARISONS = [
    ("FIGS", "RO-FIGS"),
    ("FIGS", "SG-FIGS-25"),
    ("RO-FIGS", "SG-FIGS-25"),
    ("FIGS", "SG-FIGS-10"),
    ("FIGS", "SG-FIGS-50"),
    ("GradientBoosting", "FIGS"),
]

# Domain annotation map for oblique splits, keyed by dataset name.
DOMAIN_ANNOTATIONS = {
    "diabetes": dict(
        features_annotation="glucose-BMI-age metabolic risk triad",
        domain="clinical diabetes prediction",
        interpretation=(
            "plas (plasma glucose), mass (BMI), and age form a clinically known "
            "metabolic syndrome triad for diabetes risk assessment"
        ),
    ),
    "heart_statlog": dict(
        features_annotation="cardiovascular anatomy indicators",
        domain="cardiac disease diagnosis",
        interpretation=(
            "number_of_major_vessels and thal (thalassemia type) are key cardiovascular "
            "anatomy indicators used in clinical cardiology"
        ),
    ),
    "breast_cancer": dict(
        features_annotation="tumor morphology measures",
        domain="breast cancer malignancy classification",
        interpretation=(
            "worst_radius, worst_smoothness, and worst_concave_points capture tumor "
            "morphology — larger, rougher tumors with more concavities are hallmarks "
            "of malignancy"
        ),
    ),
}

## Evaluation Module Functions

Six modules copied from `eval.py` with minimal changes (removed file I/O, replaced logging with print).

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# Module A: Bootstrap Effect Sizes
# ═══════════════════════════════════════════════════════════════════════════

def module_a_bootstrap(
    results: dict[str, dict[str, list[float]]],
    n_bootstrap: int = 10_000,
) -> list[dict]:
    """Compute bootstrap 95% CIs on pairwise method accuracy differences.

    For every (method_a, method_b) pair in ``PAIRWISE_COMPARISONS`` the
    per-dataset difference ``mean(acc_a) - mean(acc_b)`` is computed, and the
    mean of those differences is bootstrapped by resampling datasets with
    replacement. Datasets missing either method are left out of that
    comparison; comparisons with fewer than two usable datasets are skipped.

    Args:
        results: ``results[dataset][method]`` is a list of accuracies.
        n_bootstrap: Number of bootstrap resamples (must be >= 1).

    Returns:
        One dict per evaluated comparison with the point estimate, the
        percentile CI bounds, whether the CI includes zero, the per-dataset
        differences keyed by dataset name, and the bootstrap mean/std.

    Raises:
        ValueError: If ``n_bootstrap`` is smaller than 1.
    """
    if n_bootstrap < 1:
        raise ValueError(f"n_bootstrap must be >= 1, got {n_bootstrap}")

    print(f"Module A: Computing bootstrap effect sizes (n_bootstrap={n_bootstrap})")
    rng = np.random.default_rng(RNG_SEED)
    all_datasets = sorted(results.keys())
    comparisons_out = []

    for method_a, method_b in PAIRWISE_COMPARISONS:
        # Keep each difference attached to its dataset name. Zipping the
        # filtered differences against the full dataset list would mislabel
        # them whenever a dataset is skipped.
        diffs_by_dataset: dict[str, float] = {}
        for ds in all_datasets:
            accs_a = results[ds].get(method_a, [])
            accs_b = results[ds].get(method_b, [])
            if not accs_a or not accs_b:
                continue
            mean_a = float(np.mean(accs_a))
            mean_b = float(np.mean(accs_b))
            diffs_by_dataset[ds] = mean_a - mean_b

        if len(diffs_by_dataset) < 2:
            print(
                f"  Skipping {method_a} vs {method_b}: only {len(diffs_by_dataset)} datasets"
            )
            continue

        diffs_arr = np.array(list(diffs_by_dataset.values()))
        point_estimate = float(np.mean(diffs_arr))

        # Bootstrap resampling. Draws are made one resample at a time so the
        # sequence of random numbers is the same as in earlier runs.
        boot_means = np.empty(n_bootstrap)
        n = len(diffs_arr)
        for i in range(n_bootstrap):
            sample = rng.choice(diffs_arr, size=n, replace=True)
            boot_means[i] = np.mean(sample)

        ci_lower = float(np.percentile(boot_means, 2.5))
        ci_upper = float(np.percentile(boot_means, 97.5))
        includes_zero = bool(ci_lower <= 0 <= ci_upper)

        comp = {
            "comparison": f"{method_a} - {method_b}",
            "method_a": method_a,
            "method_b": method_b,
            "n_datasets": len(diffs_by_dataset),
            "point_estimate": round(point_estimate, 6),
            "ci_lower_95": round(ci_lower, 6),
            "ci_upper_95": round(ci_upper, 6),
            "ci_includes_zero": includes_zero,
            "per_dataset_diffs": {
                ds: round(d, 6) for ds, d in diffs_by_dataset.items()
            },
            "boot_mean": round(float(np.mean(boot_means)), 6),
            "boot_std": round(float(np.std(boot_means)), 6),
        }
        comparisons_out.append(comp)
        print(
            f"  {method_a} - {method_b}: "
            f"Δ={point_estimate:.4f}, CI=[{ci_lower:.4f}, {ci_upper:.4f}], "
            f"includes_zero={includes_zero}"
        )

    return comparisons_out

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# Module B: Failure Taxonomy
# ═══════════════════════════════════════════════════════════════════════════

def module_b_failure_taxonomy(
    results: dict[str, dict[str, list[float]]],
    oblique_info: dict[str, dict[str, list[dict]]],
    metadata: dict,
    *,
    oblique_incompatible_pp: float = 5.0,
    synergy_margin_pp: float = 3.0,
) -> list[dict]:
    """Classify each dataset into a failure mode using sequential decision rules.

    Rules are checked in this order and the first match wins:

    1. ``graph_too_sparse``: every SG-FIGS-25 entry in ``oblique_info`` has
       ``oblique_fraction == 0`` (also when there are no entries at all).
    2. ``oblique_incompatible``: FIGS beats RO-FIGS by more than
       ``oblique_incompatible_pp`` percentage points.
    3. ``synergy_harmful``: SG-FIGS-25 trails RO-FIGS by more than
       ``synergy_margin_pp`` percentage points.
    4. ``synergy_neutral``: SG-FIGS-25 is within ``synergy_margin_pp`` of RO-FIGS.
    5. ``synergy_beneficial``: everything else.

    Args:
        results: ``results[dataset][method]`` is a list of accuracies. A
            method with no accuracies is treated as mean accuracy 0.0.
        oblique_info: ``oblique_info[dataset][method]`` is a list of dicts
            read for ``oblique_fraction`` and ``n_synergy_edges``. Datasets
            absent from this mapping are treated as having no entries.
        metadata: Read for ``synergy_graph_stability[dataset]["mean_jaccard"]``.
        oblique_incompatible_pp: Threshold (pp) for rule 2.
        synergy_margin_pp: Threshold (pp) for rules 3 and 4.

    Returns:
        One row dict per dataset, in sorted dataset order.
    """
    print("Module B: Computing failure taxonomy")
    all_datasets = sorted(results.keys())
    synergy_stability = metadata.get("synergy_graph_stability", {})
    taxonomy = []

    for ds in all_datasets:
        figs_accs = results[ds].get("FIGS", [])
        ro_accs = results[ds].get("RO-FIGS", [])
        sg25_accs = results[ds].get("SG-FIGS-25", [])

        figs_mean = float(np.mean(figs_accs)) if figs_accs else 0.0
        ro_mean = float(np.mean(ro_accs)) if ro_accs else 0.0
        sg25_mean = float(np.mean(sg25_accs)) if sg25_accs else 0.0

        # A dataset may have results but no oblique info; treat that the same
        # as an empty entry list instead of failing with KeyError.
        sg25_oblique = oblique_info.get(ds, {}).get("SG-FIGS-25", [])
        # all() over an empty list is True, which covers the "no entries" case.
        all_oblique_zero = all(
            e.get("oblique_fraction", 0.0) == 0.0 for e in sg25_oblique
        )

        # Get synergy stability
        stab = synergy_stability.get(ds, {})
        jaccard = stab.get("mean_jaccard", 0.0)

        # n_synergy_edges at the 25% threshold: first positive value seen in any entry
        n_synergy_edges = next(
            (
                entry["n_synergy_edges"]
                for entry in sg25_oblique
                if entry.get("n_synergy_edges", 0) > 0
            ),
            0,
        )

        # Sequential decision rules (differences in percentage points)
        diff_figs_ro = (figs_mean - ro_mean) * 100
        diff_sg25_ro = (sg25_mean - ro_mean) * 100

        if all_oblique_zero:
            failure_mode = "graph_too_sparse"
        elif diff_figs_ro > oblique_incompatible_pp:
            failure_mode = "oblique_incompatible"
        elif diff_sg25_ro < -synergy_margin_pp:
            failure_mode = "synergy_harmful"
        elif abs(diff_sg25_ro) <= synergy_margin_pp:
            failure_mode = "synergy_neutral"
        else:
            failure_mode = "synergy_beneficial"

        row = {
            "dataset": ds,
            "failure_mode": failure_mode,
            "figs_mean_acc": round(figs_mean, 4),
            "ro_figs_mean_acc": round(ro_mean, 4),
            "sg25_mean_acc": round(sg25_mean, 4),
            "figs_minus_ro_pp": round(diff_figs_ro, 2),
            "sg25_minus_ro_pp": round(diff_sg25_ro, 2),
            "oblique_fraction_all_zero": all_oblique_zero,
            "synergy_stability_jaccard": round(jaccard, 4),
            "n_synergy_edges_25pct": n_synergy_edges,
        }
        taxonomy.append(row)
        print(f"  {ds}: {failure_mode} (FIGS-RO={diff_figs_ro:+.1f}pp, SG25-RO={diff_sg25_ro:+.1f}pp)")

    return taxonomy

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# Module C: Split Catalog
# ═══════════════════════════════════════════════════════════════════════════

def module_c_split_catalog(metadata: dict) -> dict:
    """Build a catalog of inspected splits, annotated with domain information.

    Reads ``metadata["qualitative_split_inspection"]``, a mapping from a combo
    key (e.g. ``"diabetes_SG-FIGS-10"``) to a list of split dicts. A split is
    counted as oblique when ``is_oblique`` is truthy or ``type == "oblique"``;
    every other split is counted as axis-aligned. Oblique splits get a
    detailed entry that includes the ``DOMAIN_ANNOTATIONS`` text for the
    dataset, or empty strings when the dataset has no annotation.

    Returns:
        Summary dict with overall counts, the oblique activation rate and the
        per-combo ``catalog`` list.
    """
    print("Module C: Building split catalog")
    inspections = metadata.get("qualitative_split_inspection", {})

    catalog = []
    n_oblique_total = 0
    n_axis_total = 0

    for combo_key, splits in inspections.items():
        # Keys look like "<dataset>_SG-FIGS-<threshold>"; split on the last marker.
        head, marker, tail = combo_key.rpartition("_SG-FIGS-")
        if marker:
            ds_name, method_name = head, f"SG-FIGS-{tail}"
        else:
            ds_name, method_name = combo_key, "unknown"

        oblique_entries = []
        n_axis_here = 0

        for split in splits:
            is_oblique = split.get("is_oblique", False) or split.get("type") == "oblique"
            if not is_oblique:
                n_axis_total += 1
                n_axis_here += 1
                continue

            n_oblique_total += 1
            features = split.get("features", [])
            weights = split.get("weights", [])
            annotation = DOMAIN_ANNOTATIONS.get(ds_name, {})

            oblique_entries.append({
                "combo_key": combo_key,
                "dataset": ds_name,
                "method": method_name,
                "tree": split.get("tree", -1),
                "split_index": split.get("split_index", -1),
                "depth": split.get("depth", -1),
                "features": features,
                "weights": [round(w, 6) for w in weights],
                "abs_weights": [round(abs(w), 6) for w in weights],
                "bias": round(split.get("bias", 0.0), 6),
                "threshold": round(split.get("threshold", 0.0), 6),
                "impurity_reduction": round(split.get("impurity_reduction", 0.0), 4),
                "pairwise_synergies": split.get("pairwise_synergies", []),
                "rule_str": split.get("rule_str", ""),
                "n_features": len(features),
                "domain_annotation": annotation.get("features_annotation", ""),
                "domain": annotation.get("domain", ""),
                "domain_interpretation": annotation.get("interpretation", ""),
            })

        catalog.append({
            "combo_key": combo_key,
            "dataset": ds_name,
            "method": method_name,
            "n_oblique_splits": len(oblique_entries),
            "n_axis_aligned_splits": n_axis_here,
            "oblique_splits": oblique_entries,
        })

    n_splits_total = n_oblique_total + n_axis_total
    if n_splits_total > 0:
        activation_rate = n_oblique_total / n_splits_total
    else:
        activation_rate = 0.0

    print(
        f"  {n_oblique_total} oblique + {n_axis_total} axis-aligned = "
        f"{n_splits_total} total splits, activation rate = "
        f"{activation_rate:.1%}"
    )
    return {
        "n_inspection_combos": len(inspections),
        "total_oblique_splits": n_oblique_total,
        "total_axis_aligned_splits": n_axis_total,
        "total_splits": n_splits_total,
        "oblique_activation_rate": round(activation_rate, 4),
        "catalog": catalog,
    }

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# Module D: Synergy Alignment Score
# ═══════════════════════════════════════════════════════════════════════════

def _mean_pairwise_synergy_for_feature(
    feature: str,
    all_features: list[str],
    synergy_map: dict[tuple[str, str], float],
) -> float:
    """Compute mean pairwise synergy of a feature with other features in the split."""
    other_features = [f for f in all_features if f != feature]
    if not other_features:
        return 0.0
    total = 0.0
    count = 0
    for other in other_features:
        key = tuple(sorted([feature, other]))
        if key in synergy_map:
            total += synergy_map[key]
            count += 1
    return total / count if count > 0 else 0.0


def module_d_synergy_alignment(split_catalog: dict) -> dict:
    """Score how well oblique-split weights line up with pairwise synergy.

    For each oblique split in ``split_catalog["catalog"]`` this computes the
    mean pairwise synergy of every feature, then:

    * with 3 or more features, the Spearman rank correlation between
      ``abs_weights`` and those mean synergies (a NaN result is recorded as
      rho 0.0 / p 1.0);
    * with fewer than 3 features, no correlation; the raw
      (|weight|, mean synergy) pairs are reported instead.

    Each entry also records whether the two largest-|weight| features are
    exactly the pair with the highest synergy.

    Returns:
        Dict with every alignment entry, the counts, the mean rho over
        computable entries (0.0 when there are none) and the individual rhos.
    """
    print("Module D: Computing synergy alignment scores")
    alignment_results = []

    every_oblique_split = (
        oblique_split
        for combo_entry in split_catalog["catalog"]
        for oblique_split in combo_entry["oblique_splits"]
    )

    for oblique_split in every_oblique_split:
        features = oblique_split["features"]
        abs_weights = oblique_split["abs_weights"]
        pairwise_synergies = oblique_split["pairwise_synergies"]
        n_features = len(features)

        # Order-independent pair -> synergy lookup (later duplicates win).
        synergy_map = {
            tuple(sorted(ps["pair"])): ps["synergy"] for ps in pairwise_synergies
        }
        mean_synergies = [
            _mean_pairwise_synergy_for_feature(feat, features, synergy_map)
            for feat in features
        ]

        # Do the two heaviest features form the highest-synergy pair?
        top2_matches_highest_synergy = False
        if n_features >= 2:
            by_weight = sorted(
                range(n_features), key=lambda idx: abs_weights[idx], reverse=True
            )
            heaviest_two = {features[by_weight[0]], features[by_weight[1]]}
            if pairwise_synergies:
                strongest = max(pairwise_synergies, key=lambda ps: ps["synergy"])
                top2_matches_highest_synergy = heaviest_two == set(strongest["pair"])

        if n_features >= 3:
            rho, p_value = scipy_stats.spearmanr(abs_weights, mean_synergies)
            if math.isnan(rho):
                rho, p_value = 0.0, 1.0
            outcome = {
                "spearman_rho": round(float(rho), 4),
                "spearman_p_value": round(float(p_value), 4),
                "computable": True,
            }
            rho_str = f"ρ={outcome['spearman_rho']}"
        else:
            # Too few features for a rank correlation — report raw alignment
            outcome = {
                "spearman_rho": None,
                "spearman_p_value": None,
                "computable": False,
                "raw_weight_synergy_pairs": list(zip(abs_weights, mean_synergies)),
            }
            rho_str = "N/A (2 features)"

        alignment_results.append({
            "combo_key": oblique_split["combo_key"],
            "dataset": oblique_split["dataset"],
            "method": oblique_split["method"],
            "n_features": n_features,
            "features": features,
            "abs_weights": abs_weights,
            "mean_pairwise_synergies": [round(s, 6) for s in mean_synergies],
            **outcome,
            "top2_matches_highest_synergy": top2_matches_highest_synergy,
        })
        print(
            f"  {oblique_split['combo_key']}: {rho_str}, "
            f"top2_match={top2_matches_highest_synergy}"
        )

    # Aggregate over the entries where a correlation could be computed
    computable = [entry for entry in alignment_results if entry["computable"]]
    if computable:
        mean_rho = float(np.mean([entry["spearman_rho"] for entry in computable]))
    else:
        mean_rho = 0.0

    return {
        "alignments": alignment_results,
        "n_total": len(alignment_results),
        "n_computable": len(computable),
        "mean_spearman_rho": round(mean_rho, 4),
        "individual_rhos": [
            {"combo_key": entry["combo_key"], "rho": entry["spearman_rho"]}
            for entry in computable
        ],
    }

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# Module E: Success Criteria Verdicts
# ═══════════════════════════════════════════════════════════════════════════

def module_e_success_verdicts(
    bootstrap_results: list[dict],
    split_catalog: dict,
    alignment_results: dict,
    taxonomy: list[dict],
    oblique_info: dict,
) -> list[dict]:
    """Evaluate 3 hypothesis success criteria.

    Criteria, in output order:

    1. Accuracy parity with fewer splits — from the FIGS vs SG-FIGS-25
       bootstrap comparison. DISCONFIRMED when FIGS is ahead and the CI
       excludes zero, PARTIALLY_CONFIRMED when the CI includes zero,
       CONFIRMED otherwise. If that comparison is absent from
       ``bootstrap_results`` the criterion is reported as DISCONFIRMED
       (not demonstrated) rather than CONFIRMED.
    2. Higher interpretability score — from the oblique activation rate and
       the mean Spearman rho.
    3. Domain-meaningful splits on 3+ datasets — number of datasets that have
       an oblique split with a non-empty ``domain_annotation``.

    Args:
        bootstrap_results: Output of Module A.
        split_catalog: Output of Module C.
        alignment_results: Output of Module D.
        taxonomy: Output of Module B (accepted for interface symmetry; not read).
        oblique_info: ``oblique_info[dataset][method]`` lists of dicts, read
            for the SG-FIGS-25 ``oblique_fraction`` values.

    Returns:
        Three verdict dicts with ``criterion``, ``verdict``, ``evidence`` and
        ``key_metrics``.
    """
    print("Module E: Computing success criteria verdicts")
    verdicts = []

    # --- Criterion 1: Accuracy parity with fewer splits ---
    figs_sg25_ci = None
    ro_sg25_ci = None
    for comp in bootstrap_results:
        if comp["method_a"] == "FIGS" and comp["method_b"] == "SG-FIGS-25":
            figs_sg25_ci = comp
        if comp["method_a"] == "RO-FIGS" and comp["method_b"] == "SG-FIGS-25":
            ro_sg25_ci = comp

    sg_loses_to_figs = (
        figs_sg25_ci is not None
        and figs_sg25_ci["point_estimate"] > 0
        and not figs_sg25_ci["ci_includes_zero"]
    )

    # Compute mean oblique fraction for SG-FIGS-25
    sg25_oblique_fracs = [
        entry.get("oblique_fraction", 0.0)
        for methods in oblique_info.values()
        for entry in methods.get("SG-FIGS-25", [])
    ]
    mean_oblique_frac = float(np.mean(sg25_oblique_fracs)) if sg25_oblique_fracs else 0.0

    if sg_loses_to_figs:
        verdict_1 = "DISCONFIRMED"
        evidence_1 = (
            f"SG-FIGS-25 loses {figs_sg25_ci['point_estimate']*100:.1f}pp to FIGS on average "
            f"(CI=[{figs_sg25_ci['ci_lower_95']*100:.1f}, {figs_sg25_ci['ci_upper_95']*100:.1f}]pp). "
            f"Mean oblique fraction for SG-FIGS-25: {mean_oblique_frac:.1%}."
        )
    elif figs_sg25_ci is None:
        # Without the comparison there is nothing to support parity; do not
        # let this case fall through to CONFIRMED.
        verdict_1 = "DISCONFIRMED"
        evidence_1 = (
            "No FIGS vs SG-FIGS-25 bootstrap comparison is available, "
            "so accuracy parity could not be demonstrated."
        )
    elif figs_sg25_ci["ci_includes_zero"]:
        verdict_1 = "PARTIALLY_CONFIRMED"
        evidence_1 = (
            f"FIGS vs SG-FIGS-25 difference CI includes zero "
            f"([{figs_sg25_ci['ci_lower_95']*100:.1f}, {figs_sg25_ci['ci_upper_95']*100:.1f}]pp), "
            f"suggesting parity is possible but not conclusively demonstrated."
        )
    else:
        verdict_1 = "CONFIRMED"
        evidence_1 = "SG-FIGS-25 achieves accuracy parity with standard FIGS."

    verdicts.append({
        "criterion": "Accuracy parity with fewer splits",
        "verdict": verdict_1,
        "evidence": evidence_1,
        "key_metrics": {
            "figs_vs_sg25_point_estimate_pp": round(
                figs_sg25_ci["point_estimate"] * 100, 2
            ) if figs_sg25_ci else None,
            "ro_vs_sg25_point_estimate_pp": round(
                ro_sg25_ci["point_estimate"] * 100, 2
            ) if ro_sg25_ci else None,
            "mean_oblique_fraction_sg25": round(mean_oblique_frac, 4),
        },
    })

    # --- Criterion 2: Higher interpretability score ---
    oblique_rate = split_catalog["oblique_activation_rate"]
    mean_rho = alignment_results["mean_spearman_rho"]

    if oblique_rate > 0.2 and mean_rho > 0.3:
        verdict_2 = "CONFIRMED"
    elif oblique_rate > 0.05 or mean_rho > 0:
        verdict_2 = "PARTIALLY_CONFIRMED"
    else:
        verdict_2 = "DISCONFIRMED"

    evidence_2 = (
        f"Oblique activation rate: {oblique_rate:.1%} (across "
        f"{split_catalog['total_splits']} total splits). "
        f"Mean synergy alignment (Spearman ρ): {mean_rho:.3f}. "
        f"Oblique splits combine synergistic features but low activation rate "
        f"limits overall interpretability improvement."
    )
    verdicts.append({
        "criterion": "Higher interpretability score",
        "verdict": verdict_2,
        "evidence": evidence_2,
        "key_metrics": {
            "oblique_activation_rate": oblique_rate,
            "mean_synergy_alignment_rho": mean_rho,
            "total_oblique_splits": split_catalog["total_oblique_splits"],
            "total_splits": split_catalog["total_splits"],
        },
    })

    # --- Criterion 3: Domain-meaningful splits on 3+ datasets ---
    datasets_with_domain = {
        osplit["dataset"]
        for combo in split_catalog["catalog"]
        for osplit in combo["oblique_splits"]
        if osplit["domain_annotation"]
    }

    n_domain_meaningful = len(datasets_with_domain)
    if n_domain_meaningful >= 3:
        verdict_3 = "CONFIRMED"
    elif n_domain_meaningful >= 1:
        verdict_3 = "PARTIALLY_CONFIRMED"
    else:
        verdict_3 = "DISCONFIRMED"

    evidence_3 = (
        f"{n_domain_meaningful} datasets show domain-meaningful oblique splits: "
        f"{sorted(datasets_with_domain)}. "
    )
    for ds in sorted(datasets_with_domain):
        ann = DOMAIN_ANNOTATIONS.get(ds, {})
        if ann:
            evidence_3 += f"{ds}: {ann.get('interpretation', '')}. "

    verdicts.append({
        "criterion": "Domain-meaningful splits on 3+ datasets",
        "verdict": verdict_3,
        "evidence": evidence_3,
        "key_metrics": {
            "n_datasets_with_domain_meaningful_splits": n_domain_meaningful,
            "datasets": sorted(datasets_with_domain),
        },
    })

    for v in verdicts:
        print(f"  {v['criterion']}: {v['verdict']}")

    return verdicts

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# Module F: Paper Narrative Synthesis
# ═══════════════════════════════════════════════════════════════════════════

def module_f_narrative(
    bootstrap_results: list[dict],
    taxonomy: list[dict],
    split_catalog: dict,
    alignment_results: dict,
    verdicts: list[dict],
    metadata: dict,
) -> dict:
    """Assemble the paper narrative: ranked findings, lessons, failure-mode counts.

    Values interpolated from the inputs are the number of datasets, the
    ``graph_too_sparse`` count, the mean Spearman rho, the total runtime, the
    Jaccard stability range and the oblique-split totals. The remaining
    sentences are fixed template text; ``bootstrap_results`` and ``verdicts``
    are accepted but not read here.

    Returns:
        Dict with ``key_findings`` (ranked 1..7), ``lessons_learned`` and
        ``failure_mode_distribution``.
    """
    print("Module F: Synthesizing paper narrative")

    # Tally datasets per failure mode, keeping first-seen order.
    mode_counts: dict[str, int] = {}
    for mode in (row["failure_mode"] for row in taxonomy):
        mode_counts[mode] = 1 + mode_counts.get(mode, 0)

    n_datasets = len(taxonomy)
    total_time = metadata.get("total_runtime_seconds", 0)

    # Range of synergy-graph stability across datasets (0/0 when unavailable).
    stability = metadata.get("synergy_graph_stability", {})
    jaccards = [entry.get("mean_jaccard", 0) for entry in stability.values()]
    jaccard_min, jaccard_max = (min(jaccards), max(jaccards)) if jaccards else (0, 0)

    # (finding, detail) pairs listed in order of importance; rank = position.
    ranked_findings = [
        (
            "SG-FIGS does not improve accuracy over standard FIGS",
            "Bootstrap CIs show SG-FIGS-25 loses ~5-10pp vs FIGS across "
            f"{n_datasets} datasets. The accuracy parity criterion is DISCONFIRMED.",
        ),
        (
            "Oblique splits produce domain-meaningful feature combinations",
            "diabetes (plas+mass+age = metabolic triad), heart (vessels+thal = "
            "cardiovascular anatomy), breast_cancer (worst_radius+smoothness+"
            "concave_points = tumor morphology) — all clinically meaningful.",
        ),
        (
            "Synergy graph sparsity is the primary failure mode",
            f"{mode_counts.get('graph_too_sparse', 0)}/{n_datasets} datasets have zero "
            f"oblique splits at 25% threshold due to sparse synergy graphs.",
        ),
        (
            "Synergy alignment with Ridge weights is mixed",
            f"Mean Spearman ρ = {alignment_results['mean_spearman_rho']:.3f}. "
            f"Positive for breast_cancer, negative for heart/diabetes — "
            f"synergy identifies related features but not optimal linear combos.",
        ),
        (
            "PID computation is fast and scalable",
            f"Total pipeline runtime: {total_time:.0f}s across {n_datasets} datasets. "
            f"Even sonar (1770 pairs) completes in ~17s.",
        ),
        (
            "Synergy graph stability varies widely across datasets",
            f"Jaccard stability ranges from {jaccard_min:.2f} to {jaccard_max:.2f}. "
            f"banknote (1.0) is perfectly stable, sonar (0.25) highly unstable.",
        ),
        (
            "Oblique activation rate is low at ~10%",
            f"{split_catalog['total_oblique_splits']} oblique splits out of "
            f"{split_catalog['total_splits']} total = "
            f"{split_catalog['oblique_activation_rate']:.1%} activation rate.",
        ),
    ]
    key_findings = [
        {"rank": rank, "finding": finding, "detail": detail}
        for rank, (finding, detail) in enumerate(ranked_findings, start=1)
    ]

    # Lessons learned, as (lesson, detail) pairs
    lesson_pairs = [
        (
            "Feature synergy ≠ predictive complementarity for linear projections",
            "PID synergy captures information-theoretic interactions but Ridge "
            "regression needs a different kind of feature relationship.",
        ),
        (
            "Sparse synergy graphs need adaptive thresholding",
            "Fixed percentile thresholds (10%, 25%, 50%) fail for low-dimensional "
            "datasets with few feature pairs.",
        ),
        (
            "Negative results are publishable when paired with diagnostic analysis",
            "The failure taxonomy and domain-meaningful split discovery transform "
            "a negative accuracy result into actionable insights.",
        ),
    ]
    lessons = [{"lesson": lesson, "detail": detail} for lesson, detail in lesson_pairs]

    return {
        "key_findings": key_findings,
        "lessons_learned": lessons,
        "failure_mode_distribution": mode_counts,
    }

## Visualization Function

Reusable function that displays bootstrap CIs, failure taxonomy distribution, and success criteria verdicts.

In [None]:
def _print_summary_tables(
    bootstrap_results: list[dict],
    taxonomy: list[dict],
    split_catalog: dict,
    alignment_results: dict,
    verdicts: list[dict],
    narrative: dict,
    title_prefix: str,
) -> None:
    """Print the plain-text summary, one section per evaluation module (A–F)."""
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"{title_prefix}EVALUATION RESULTS SUMMARY")
    print(banner)

    # Module A: one row per pairwise comparison.
    print("\n--- Module A: Bootstrap Effect Sizes ---")
    header = f"{'Comparison':<30} {'Δ':>8} {'CI Lower':>10} {'CI Upper':>10} {'Zero?':>6}"
    print(header)
    # Size the rule from the header so the two cannot drift apart.
    print("-" * len(header))
    for comp in bootstrap_results:
        print(
            f"{comp['comparison']:<30} {comp['point_estimate']:>8.4f} "
            f"{comp['ci_lower_95']:>10.4f} {comp['ci_upper_95']:>10.4f} "
            f"{'YES' if comp['ci_includes_zero'] else 'NO':>6}"
        )

    # Module B: one row per dataset.
    print("\n--- Module B: Failure Taxonomy ---")
    header = f"{'Dataset':<18} {'Mode':<22} {'FIGS':>6} {'RO':>6} {'SG25':>6}"
    print(header)
    print("-" * len(header))
    for entry in taxonomy:
        print(
            f"{entry['dataset']:<18} {entry['failure_mode']:<22} "
            f"{entry['figs_mean_acc']:>6.3f} {entry['ro_figs_mean_acc']:>6.3f} "
            f"{entry['sg25_mean_acc']:>6.3f}"
        )

    # Module C: aggregate split counts.
    print("\n--- Module C: Split Catalog ---")
    print(f"  Total oblique: {split_catalog['total_oblique_splits']}")
    print(f"  Total axis-aligned: {split_catalog['total_axis_aligned_splits']}")
    print(f"  Activation rate: {split_catalog['oblique_activation_rate']:.1%}")

    # Module D: the per-combination list is optional; the mean is required.
    print("\n--- Module D: Synergy Alignment ---")
    print(f"  Mean Spearman ρ: {alignment_results['mean_spearman_rho']:.4f}")
    for rho_entry in alignment_results.get("individual_rhos", []):
        print(f"    {rho_entry['combo_key']}: ρ={rho_entry['rho']}")

    # Module E: unknown verdict strings fall back to "?".
    print("\n--- Module E: Success Criteria ---")
    verdict_symbols = {"CONFIRMED": "✓", "PARTIALLY_CONFIRMED": "~", "DISCONFIRMED": "✗"}
    for item in verdicts:
        print(f"  {verdict_symbols.get(item['verdict'], '?')} {item['criterion']}: {item['verdict']}")

    # Module F: ranked headline findings only (details are not printed).
    print("\n--- Module F: Key Findings ---")
    for finding in narrative["key_findings"]:
        print(f"  #{finding['rank']}: {finding['finding']}")


def _plot_bootstrap_cis(ax, bootstrap_results: list[dict]) -> None:
    """Draw point estimates as horizontal bars with 95% CI whiskers on ``ax``."""
    excludes_zero_color, includes_zero_color = "#d32f2f", "#4caf50"

    labels = [comp["comparison"] for comp in bootstrap_results]
    # Values are scaled by 100 to match the percentage-point axis label.
    points = [comp["point_estimate"] * 100 for comp in bootstrap_results]
    lowers = [comp["ci_lower_95"] * 100 for comp in bootstrap_results]
    uppers = [comp["ci_upper_95"] * 100 for comp in bootstrap_results]
    rows = list(range(len(labels)))
    colors = [
        includes_zero_color if comp["ci_includes_zero"] else excludes_zero_color
        for comp in bootstrap_results
    ]

    ax.barh(rows, points, color=colors, alpha=0.7, height=0.6)
    for row, (low, high) in enumerate(zip(lowers, uppers)):
        # Horizontal whisker spanning the CI, then a short cap at each end.
        ax.plot([low, high], [row, row], color="black", linewidth=2)
        for edge in (low, high):
            ax.plot([edge, edge], [row - 0.15, row + 0.15], color="black", linewidth=2)
    ax.axvline(x=0, color="gray", linestyle="--", linewidth=1)
    ax.set_yticks(rows)
    ax.set_yticklabels(labels, fontsize=8)
    ax.set_xlabel("Accuracy Difference (pp)")
    ax.set_title("Bootstrap 95% CIs")
    # Without a legend the bar colours cannot be decoded from the figure alone.
    ax.legend(
        handles=[
            mpatches.Patch(color=excludes_zero_color, alpha=0.7, label="CI excludes 0"),
            mpatches.Patch(color=includes_zero_color, alpha=0.7, label="CI includes 0"),
        ],
        fontsize=7,
        loc="best",
    )


def _plot_failure_modes(ax, mode_counts: dict) -> None:
    """Draw a bar per failure mode (mapping of mode name -> dataset count) on ``ax``."""
    mode_colors = {
        "graph_too_sparse": "#ff9800",
        "oblique_incompatible": "#f44336",
        "synergy_harmful": "#e91e63",
        "synergy_neutral": "#9e9e9e",
        "synergy_beneficial": "#4caf50",
    }
    mode_labels = list(mode_counts.keys())
    mode_values = list(mode_counts.values())
    positions = range(len(mode_labels))

    # Modes missing from the palette get a neutral blue.
    bar_colors = [mode_colors.get(mode, "#2196f3") for mode in mode_labels]
    ax.bar(positions, mode_values, color=bar_colors, alpha=0.8)
    ax.set_xticks(positions)
    # Break labels at underscores so long mode names fit under their bars.
    ax.set_xticklabels([mode.replace("_", "\n") for mode in mode_labels], fontsize=7)
    ax.set_ylabel("Number of Datasets")
    ax.set_title("Failure Mode Distribution")
    for position, count in enumerate(mode_values):
        ax.text(position, count + 0.1, str(count), ha="center", fontweight="bold")


def _plot_verdicts(ax, verdicts: list[dict]) -> None:
    """Draw one horizontal bar per success criterion, scored by its verdict, on ``ax``."""
    verdict_scores = {"CONFIRMED": 1.0, "PARTIALLY_CONFIRMED": 0.5, "DISCONFIRMED": 0.0}
    verdict_colors = {
        "CONFIRMED": "#4caf50",
        "PARTIALLY_CONFIRMED": "#ff9800",
        "DISCONFIRMED": "#f44336",
    }
    criteria = [item["criterion"] for item in verdicts]
    # Unrecognised verdict strings score 0 and are drawn grey.
    scores = [verdict_scores.get(item["verdict"], 0) for item in verdicts]
    bar_colors = [verdict_colors.get(item["verdict"], "#9e9e9e") for item in verdicts]
    rows = range(len(criteria))

    ax.barh(rows, scores, color=bar_colors, alpha=0.8, height=0.6)
    ax.set_yticks(rows)
    # Tick labels are truncated to 30 characters to keep the panel readable.
    ax.set_yticklabels([name[:30] for name in criteria], fontsize=8)
    ax.set_xlim(-0.1, 1.1)
    ax.set_xlabel("Verdict Score")
    ax.set_title("Success Criteria Verdicts")
    for row, item in enumerate(verdicts):
        ax.text(scores[row] + 0.02, row, item["verdict"], va="center", fontsize=7)


def visualize_results(
    bootstrap_results: list[dict],
    taxonomy: list[dict],
    split_catalog: dict,
    alignment_results: dict,
    verdicts: list[dict],
    narrative: dict,
    title_prefix: str = "",
):
    """Print per-module summary tables and show a three-panel summary figure.

    Args:
        bootstrap_results: One dict per comparison; reads ``comparison``,
            ``point_estimate``, ``ci_lower_95``, ``ci_upper_95`` and
            ``ci_includes_zero``.
        taxonomy: One dict per dataset; reads ``dataset``, ``failure_mode``,
            ``figs_mean_acc``, ``ro_figs_mean_acc`` and ``sg25_mean_acc``.
        split_catalog: Reads ``total_oblique_splits``,
            ``total_axis_aligned_splits`` and ``oblique_activation_rate``.
        alignment_results: Reads ``mean_spearman_rho`` and, if present,
            ``individual_rhos`` (dicts with ``combo_key`` and ``rho``).
        verdicts: One dict per criterion; reads ``criterion`` and ``verdict``.
        narrative: Reads ``key_findings`` (dicts with ``rank`` and ``finding``)
            and ``failure_mode_distribution`` (mode name -> count).
        title_prefix: Text prepended to the printed banner and the figure title.

    Returns:
        None. Output is written to stdout and rendered via ``plt.show()``.

    Raises:
        KeyError: If any of the required keys listed above is missing.
    """
    _print_summary_tables(
        bootstrap_results,
        taxonomy,
        split_catalog,
        alignment_results,
        verdicts,
        narrative,
        title_prefix,
    )

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    fig.suptitle(f"{title_prefix}Bootstrap Effect Sizes & Failure Taxonomy", fontsize=14)

    _plot_bootstrap_cis(axes[0], bootstrap_results)
    _plot_failure_modes(axes[1], narrative["failure_mode_distribution"])
    _plot_verdicts(axes[2], verdicts)

    plt.tight_layout()
    plt.show()

---

## Part 1 — Quick Demo (Mini Data)

Runs on a 4-dataset subset (breast_cancer, diabetes, heart_statlog, sonar) with **500 bootstrap resamples** instead of 10,000 for fast execution.

In [None]:
# Fetch the mini demo payload and expose the three sections used by the modules below.
data = load_mini()
results = data["results"]
oblique_info = data["oblique_info"]
metadata = data["metadata"]

# Sanity print: dataset names, then the union of method names across all datasets.
print(f"Loaded {len(results)} datasets: {sorted(results)}")
print(f"Methods: {sorted({method for per_dataset in results.values() for method in per_dataset})}")

### Module A: Bootstrap Effect Sizes (Quick — 500 resamples)

In [None]:
# Reduced resample count for the quick demo (the original configuration uses 10,000).
N_BOOTSTRAP = 500
bootstrap_results = module_a_bootstrap(
    results,
    n_bootstrap=N_BOOTSTRAP,
)

### Modules B–F: Failure Taxonomy, Split Catalog, Synergy Alignment, Verdicts, Narrative

In [None]:
# Modules B–F run in order; each later module consumes outputs of earlier ones.

# B — failure taxonomy, built from the raw results plus oblique/metadata info.
taxonomy = module_b_failure_taxonomy(results=results, oblique_info=oblique_info, metadata=metadata)

# C — split catalog, built from metadata only.
split_catalog = module_c_split_catalog(metadata=metadata)

# D — synergy alignment, computed from the split catalog produced by C.
alignment_results = module_d_synergy_alignment(split_catalog=split_catalog)

# E — success-criteria verdicts, combining the outputs of A–D.
verdicts = module_e_success_verdicts(
    bootstrap_results=bootstrap_results, split_catalog=split_catalog,
    alignment_results=alignment_results, taxonomy=taxonomy, oblique_info=oblique_info,
)

# F — narrative synthesis over everything above, including the verdicts from E.
narrative = module_f_narrative(
    bootstrap_results=bootstrap_results, taxonomy=taxonomy, split_catalog=split_catalog,
    alignment_results=alignment_results, verdicts=verdicts, metadata=metadata,
)

### Quick Demo Results

In [None]:
# Positional arguments follow the visualize_results signature order:
# bootstrap_results, taxonomy, split_catalog, alignment_results, verdicts, narrative.
visualize_results(
    bootstrap_results, taxonomy, split_catalog,
    alignment_results, verdicts, narrative,
    title_prefix="[Quick Demo] ",
)

---

## Part 2 — Full Run (Original Parameters)

Runs on all **12 datasets** with **10,000 bootstrap resamples** (the original configuration from `eval.py`).

In [None]:
# Fetch the full demo payload; this rebinds the same names the quick demo used.
data = load_full()
results = data["results"]
oblique_info = data["oblique_info"]
metadata = data["metadata"]

# Sanity print: dataset names, then the union of method names across all datasets.
print(f"Loaded {len(results)} datasets: {sorted(results)}")
print(f"Methods: {sorted({method for per_dataset in results.values() for method in per_dataset})}")

### Module A: Bootstrap Effect Sizes (Full — 10,000 resamples)

In [None]:
# Original configuration: 10,000 resamples (overrides the quick-demo value of 500).
N_BOOTSTRAP = 10_000
bootstrap_results = module_a_bootstrap(
    results,
    n_bootstrap=N_BOOTSTRAP,
)

### Modules B–F: Full Run

In [None]:
# Modules B–F run in order; each later module consumes outputs of earlier ones.

# B — failure taxonomy, built from the raw results plus oblique/metadata info.
taxonomy = module_b_failure_taxonomy(results=results, oblique_info=oblique_info, metadata=metadata)

# C — split catalog, built from metadata only.
split_catalog = module_c_split_catalog(metadata=metadata)

# D — synergy alignment, computed from the split catalog produced by C.
alignment_results = module_d_synergy_alignment(split_catalog=split_catalog)

# E — success-criteria verdicts, combining the outputs of A–D.
verdicts = module_e_success_verdicts(
    bootstrap_results=bootstrap_results, split_catalog=split_catalog,
    alignment_results=alignment_results, taxonomy=taxonomy, oblique_info=oblique_info,
)

# F — narrative synthesis over everything above, including the verdicts from E.
narrative = module_f_narrative(
    bootstrap_results=bootstrap_results, taxonomy=taxonomy, split_catalog=split_catalog,
    alignment_results=alignment_results, verdicts=verdicts, metadata=metadata,
)

### Full Run Results

In [None]:
# Positional arguments follow the visualize_results signature order:
# bootstrap_results, taxonomy, split_catalog, alignment_results, verdicts, narrative.
visualize_results(
    bootstrap_results, taxonomy, split_catalog,
    alignment_results, verdicts, narrative,
    title_prefix="[Full Run] ",
)