# SG-FIGS Comprehensive Statistical Evaluation — Demo

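This notebook demonstrates the **seven-block statistical evaluation** of SG-FIGS experiment results across 12 datasets and 6 methods. Blocks 1, 2, 5 and 6 are recomputed inline below; Blocks 3, 4 and 7 are not recomputed here — only the pre-computed aggregate metrics stored in `metrics_agg` are displayed.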

**Evaluation Blocks:**
1. Per-dataset accuracy/AUC tables with Bonferroni-corrected paired t-tests (15 pairs)
2. Critical difference diagram data (Friedman test + Nemenyi CD with clique identification)
3. Spearman correlations between SG-FIGS-25 advantage and dataset properties
4. Oblique split activation analysis with degeneration detection
5. Accuracy-at-matched-complexity curves across max_rules={5,10,15}
6. Ablation decomposition (oblique penalty vs synergy effect)
7. Positive case study narratives

**Part 1** runs a quick demo on a curated subset (3 datasets).  
**Part 2** runs the full analysis on all 12 datasets with original parameters.

In [None]:
"""Imports — copied from eval.py with minimal additions for notebook context."""

import json
import math
import sys
from collections import defaultdict
from itertools import combinations

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({"font.size": 11})

In [None]:
"""Data loading helpers — GitHub URL with local fallback for Colab compatibility."""

# Remote copies of the demo data; _load_json tries these before the local files.
GITHUB_FULL_DATA_URL = "https://raw.githubusercontent.com/AMGrobelnik/ai-invention-ac2586-synergy-guided-oblique-splits-using-part/main/sg_figs_eval/demo/full_demo_data.json"
GITHUB_MINI_DATA_URL = "https://raw.githubusercontent.com/AMGrobelnik/ai-invention-ac2586-synergy-guided-oblique-splits-using-part/main/sg_figs_eval/demo/mini_demo_data.json"
# json and os are used by the loader helpers defined in this cell.
import json, os

def _load_json(url, local_path, timeout=30.0):
    """Load a JSON document from ``url``, falling back to ``local_path``.

    The download is best-effort: any failure (no network, HTTP error, invalid
    URL, malformed payload) falls through to the local file instead of
    aborting. The failure is remembered so it can be reported if the local
    file is missing as well.

    Args:
        url: Remote location of the JSON file.
        local_path: Path of a local copy used when the download fails.
        timeout: Seconds a blocking network operation may take before the
            download is abandoned.

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: If the download failed and ``local_path`` does not
            exist. The download error is attached as ``__cause__``.
    """
    import urllib.request

    remote_error = None
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return json.loads(response.read().decode())
    except Exception as exc:
        # Deliberately broad: whatever went wrong remotely, try the local copy.
        remote_error = exc

    if os.path.exists(local_path):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(local_path, encoding="utf-8") as f:
            return json.load(f)

    raise FileNotFoundError(
        f"Could not load {local_path} "
        f"(download from {url} failed: {remote_error!r})"
    ) from remote_error

def load_mini():
    """Return the mini demo data (GitHub copy first, local file as fallback)."""
    return _load_json(url=GITHUB_MINI_DATA_URL, local_path="mini_demo_data.json")

def load_full():
    """Return the full demo data (GitHub copy first, local file as fallback)."""
    return _load_json(url=GITHUB_FULL_DATA_URL, local_path="full_demo_data.json")

---
## Part 1 — Quick Demo (Mini Data: 3 Datasets)

In [None]:
# Fetch the mini demo payload and report how much it contains.
data = load_mini()
dataset_blocks = data["datasets"]
n_examples = sum(len(entry["examples"]) for entry in dataset_blocks)
print("Loaded mini data: {} datasets, {} examples".format(len(dataset_blocks), n_examples))
print("Datasets: {}".format([entry["dataset"] for entry in dataset_blocks]))
print("Aggregate metrics: {} keys".format(len(data["metrics_agg"])))

### Constants and Helper Functions

Evaluation constants and statistical helper functions copied from the original script.

In [None]:
# ---------------------------------------------------------------------------
# Constants (from eval.py)
# ---------------------------------------------------------------------------
# Significance level before multiple-comparison correction.
ALPHA = 0.05
# Folds looked up per (dataset, method, max_rules) combination.
N_FOLDS = 5
# Rule budgets at which accuracy is compared.
MAX_RULES_VALUES = [5, 10, 15]
# Methods under comparison; the order fixes table columns and plot colours.
METHODS = [
    "FIGS",
    "RO-FIGS",
    "SG-FIGS-10",
    "SG-FIGS-25",
    "SG-FIGS-50",
    "GradientBoosting",
]
N_METHODS = len(METHODS)
# Number of unordered method pairs, used as the Bonferroni divisor.
N_PAIRS = math.comb(N_METHODS, 2)  # 15
BONFERRONI_ALPHA = ALPHA / N_PAIRS


# ---------------------------------------------------------------------------
# Helpers (from eval.py)
# ---------------------------------------------------------------------------

def _safe_float(val):
    """Coerce ``val`` to ``float``, mapping ``None`` to NaN.

    Any other value is passed to ``float()`` unchanged, so a NaN input stays
    NaN and a non-numeric value raises whatever ``float()`` raises.
    """
    return float("nan") if val is None else float(val)


def _bonferroni_sig(p: float) -> str:
    """Map a raw p-value to a significance marker under Bonferroni correction.

    Each nominal level (0.001, 0.01, ALPHA) is divided by ``N_PAIRS``; the
    strictest level that ``p`` falls below decides the marker. Values that
    pass no level, including NaN, are reported as ``"ns"``.
    """
    cutoffs = (
        (0.001 / N_PAIRS, "***"),
        (0.01 / N_PAIRS, "**"),
        (BONFERRONI_ALPHA, "*"),
    )
    for cutoff, marker in cutoffs:
        if p < cutoff:
            return marker
    return "ns"


def _paired_ttest(a: list[float], b: list[float]) -> tuple[float, float]:
    """Two-sided paired t-test; returns ``(t_stat, p_value)``.

    Pairs in which either value is missing (NaN) or non-finite are dropped
    before testing, mirroring the NaN-aware means used for the summaries.

    Falls back to ``(0.0, 1.0)`` when the test is undefined: fewer than two
    complete pairs, or zero variance of the paired differences.

    Args:
        a: Scores of the first method, one per fold.
        b: Scores of the second method, aligned with ``a``.

    Returns:
        The t statistic and the two-sided p-value as Python floats.
    """
    a_arr = np.asarray(a, dtype=np.float64)
    b_arr = np.asarray(b, dtype=np.float64)
    diff = a_arr - b_arr

    # Pairwise deletion: a single NaN fold must not turn the whole test into NaN.
    complete = np.isfinite(diff)
    if int(complete.sum()) < 2:
        return 0.0, 1.0
    a_arr, b_arr, diff = a_arr[complete], b_arr[complete], diff[complete]

    if np.std(diff, ddof=1) == 0:
        return 0.0, 1.0
    t_stat, p_val = stats.ttest_rel(a_arr, b_arr)
    return float(t_stat), float(p_val)


def _rank_methods_per_dataset(
    method_means: dict[str, float],
) -> dict[str, float]:
    """Assign ranks to methods by descending score (rank 1 = highest).

    Scores that ``np.isclose`` considers equal to the first score of a run
    form a tie group, and every member receives the average of the ranks the
    group occupies.
    """
    ordered = sorted(method_means.items(), key=lambda item: -item[1])
    total = len(ordered)
    ranks: dict[str, float] = {}

    start = 0
    while start < total:
        leader_score = ordered[start][1]
        # Extend the group while scores stay close to the group's first score.
        stop = start + 1
        while stop < total and np.isclose(ordered[stop][1], leader_score):
            stop += 1
        shared_rank = float(np.mean(np.arange(start + 1, stop + 1)))
        for name, _ in ordered[start:stop]:
            ranks[name] = shared_rank
        start = stop
    return ranks


# Upper critical values of the Studentized range statistic with infinite
# degrees of freedom, keyed by significance level and number of groups
# (standard tables). Dividing by sqrt(2) gives the Nemenyi critical value.
_STUDENTIZED_RANGE_INF_DF = {
    0.05: {
        2: 2.772, 3: 3.314, 4: 3.633, 5: 3.858, 6: 4.030,
        7: 4.170, 8: 4.286, 9: 4.387, 10: 4.474,
    },
    0.10: {
        2: 2.326, 3: 2.902, 4: 3.240, 5: 3.478, 6: 3.661,
        7: 3.808, 8: 3.931, 9: 4.037, 10: 4.129,
    },
}


def _nemenyi_q_alpha(k: int, alpha: float = 0.05) -> float:
    """Critical value for the Nemenyi test (Studentized range / sqrt(2)).

    Values are tabulated for ``k`` = 2..10 compared methods at ``alpha`` 0.05
    and 0.10. For k=6, alpha=0.05 the result is ``4.030 / sqrt(2)``
    (approx 2.850).

    Args:
        k: Number of methods being compared.
        alpha: Family-wise significance level.

    Returns:
        The critical value q_alpha.

    Raises:
        ValueError: If no tabulated value exists for ``(k, alpha)``.
    """
    for table_alpha, by_k in _STUDENTIZED_RANGE_INF_DF.items():
        # isclose: tolerate an alpha produced by arithmetic (e.g. 1 - 0.95).
        if math.isclose(alpha, table_alpha) and k in by_k:
            return by_k[k] / math.sqrt(2)
    raise ValueError(f"Nemenyi q not hardcoded for k={k}, alpha={alpha}")


# Confirm the cell ran and echo the settings the later blocks rely on.
print(
    "Constants and helpers loaded.",
    f"Methods: {METHODS}",
    f"N_PAIRS (Bonferroni): {N_PAIRS}, Bonferroni alpha: {BONFERRONI_ALPHA:.4f}",
    sep="\n",
)

### Reconstruct Structured Data

Parse the eval output into the nested structure `{dataset: {method: {max_rules: {fold: {metric: val}}}}}` that the analysis blocks expect. This adapts the original `load_iter4_data()` to work with the eval output format.

In [None]:
def restructure_eval_data(raw_data: dict) -> dict:
    """Rebuild the nested per-fold structure from the flat eval examples.

    Every example carries two JSON strings: ``input`` (providing ``method``,
    ``fold`` and ``max_rules``) and ``output`` (providing the fold metrics).
    Metrics absent from ``output`` default to 0; the two ``eval_*`` fields are
    read from the example itself and default to 0.0.

    Returns: {dataset: {method: {max_rules: {fold: {accuracy, auc, ...}}}}}
    """
    float_fields = ("accuracy", "auc", "oblique_fraction")
    eval_fields = ("eval_oblique_fraction", "eval_degeneration_flag")

    structured = {}
    for ds_block in raw_data["datasets"]:
        ds_name = ds_block["dataset"]
        # Each block starts from an empty mapping, so a repeated dataset name
        # replaces whatever an earlier block stored under it.
        per_method = {}
        structured[ds_name] = per_method

        for example in ds_block["examples"]:
            spec = json.loads(example["input"])
            method, fold, max_rules = spec["method"], spec["fold"], spec["max_rules"]
            outcome = json.loads(example["output"])

            record = {name: _safe_float(outcome.get(name, 0)) for name in float_fields}
            record["n_oblique"] = outcome.get("n_oblique", 0)
            # Eval-level metrics live on the example, not inside its output.
            for name in eval_fields:
                record[name] = example.get(name, 0.0)

            per_method.setdefault(method, {}).setdefault(max_rules, {})[fold] = record
    return structured

# Convert the loaded examples into the nested form the analysis blocks read.
structured_data = restructure_eval_data(data)
datasets = list(structured_data)
print("Restructured data: {} datasets".format(len(datasets)))
for ds in datasets:
    methods = list(structured_data[ds])
    print("  {}: {} methods".format(ds, len(methods)))

### Block 1: Per-Dataset Accuracy/AUC Tables

Computes mean±std accuracy and AUC per method per dataset, with Bonferroni-corrected paired t-tests across all 15 method pairs.

In [None]:
# ---------------------------------------------------------------------------
# Block 1: Per-Dataset Accuracy/AUC Tables (from eval.py)
# ---------------------------------------------------------------------------

def block1_per_dataset_accuracy(
    data: dict,
    datasets: list[str],
    max_rules: int = 10,
) -> dict:
    """Summarise per-dataset accuracy/AUC and compare every method pair.

    For each dataset, at the requested ``max_rules``, this computes the
    NaN-aware mean and sample std of accuracy and AUC per method, a paired
    t-test on fold accuracies for every method pair (with Bonferroni-scaled
    p-value and significance marker), method ranks, and the mean-accuracy
    difference of each method to FIGS. Folds missing from ``data`` count as
    NaN.

    Returns:
        ``{dataset: {"method_stats", "pairwise_tests", "ranks",
        "delta_vs_figs"}}``.
    """
    missing = float("nan")
    report = {}
    for ds in datasets:
        per_method = data.get(ds, {})

        # Per-fold series for every method; absent folds become NaN.
        fold_accs: dict[str, list[float]] = {}
        fold_aucs: dict[str, list[float]] = {}
        for method in METHODS:
            folds = per_method.get(method, {}).get(max_rules, {})
            fold_accs[method] = [
                folds.get(fold, {}).get("accuracy", missing) for fold in range(N_FOLDS)
            ]
            fold_aucs[method] = [
                folds.get(fold, {}).get("auc", missing) for fold in range(N_FOLDS)
            ]

        # Mean and sample standard deviation per method.
        summary = {}
        for method in METHODS:
            acc_arr = np.array(fold_accs[method])
            auc_arr = np.array(fold_aucs[method])
            summary[method] = {
                "accuracy_mean": float(np.nanmean(acc_arr)),
                "accuracy_std": float(np.nanstd(acc_arr, ddof=1)),
                "auc_mean": float(np.nanmean(auc_arr)),
                "auc_std": float(np.nanstd(auc_arr, ddof=1)),
            }

        # Paired t-test on fold accuracies for each unordered method pair.
        comparisons = {}
        for first, second in combinations(METHODS, 2):
            t_stat, p_raw = _paired_ttest(fold_accs[first], fold_accs[second])
            comparisons[f"{first}_vs_{second}"] = {
                "t_statistic": t_stat,
                "p_value_raw": p_raw,
                "p_value_corrected": min(p_raw * N_PAIRS, 1.0),
                "significant": _bonferroni_sig(p_raw),
                "mean_diff": summary[first]["accuracy_mean"] - summary[second]["accuracy_mean"],
            }

        mean_acc = {method: summary[method]["accuracy_mean"] for method in METHODS}
        ds_ranks = _rank_methods_per_dataset(mean_acc)
        baseline = summary["FIGS"]["accuracy_mean"]

        report[ds] = {
            "method_stats": summary,
            "pairwise_tests": comparisons,
            "ranks": ds_ranks,
            "delta_vs_figs": {method: mean_acc[method] - baseline for method in METHODS},
        }

    return report

block1 = block1_per_dataset_accuracy(data=structured_data, datasets=datasets, max_rules=10)

# Summary table: one row per dataset, one "mean±std" column per method.
print("Block 1: Per-Dataset Accuracy (max_rules=10)")
print("{:<15}".format("Dataset") + "".join("{:>16}".format(name) for name in METHODS))
print("-" * (15 + 16 * len(METHODS)))
for ds in datasets:
    cells = []
    for m in METHODS:
        s = block1[ds]["method_stats"][m]
        cells.append("  {:.3f}±{:.3f}".format(s["accuracy_mean"], s["accuracy_std"]))
    print("{:<15}".format(ds) + "".join(cells))

### Block 2: Critical Difference Diagram Data

Friedman test for overall method differences, plus Nemenyi CD for pairwise comparisons and clique identification.

In [None]:
# ---------------------------------------------------------------------------
# Block 2: Critical Difference Diagram Data (from eval.py)
# ---------------------------------------------------------------------------

def block2_critical_difference(
    data: dict,
    datasets: list[str],
    max_rules: int = 10,
) -> dict:
    """Friedman test, Nemenyi critical difference and rank cliques.

    Methods are ranked per dataset by NaN-aware mean fold accuracy at
    ``max_rules``. The per-dataset ranks feed the Friedman test, and the mean
    ranks are grouped into cliques: a method plus every worse-ranked method
    whose mean rank lies within the critical difference of it.

    Returns:
        Friedman statistic and p-value, mean ranks, the Nemenyi CD and
        q_alpha, the cliques, and the methods/ranks sorted by mean rank.
    """
    n_datasets = len(datasets)

    # One row per dataset, one column per method (column order follows METHODS).
    rank_matrix = np.zeros((n_datasets, N_METHODS))
    for row, ds in enumerate(datasets):
        per_method = data.get(ds, {})
        mean_acc = {}
        for method in METHODS:
            folds = per_method.get(method, {}).get(max_rules, {})
            fold_accs = [
                folds.get(fold, {}).get("accuracy", float("nan")) for fold in range(N_FOLDS)
            ]
            mean_acc[method] = float(np.nanmean(fold_accs))
        ds_ranks = _rank_methods_per_dataset(mean_acc)
        for col, method in enumerate(METHODS):
            rank_matrix[row, col] = ds_ranks[method]

    # Friedman test: one sample (column) per method.
    chi2, p_friedman = stats.friedmanchisquare(*rank_matrix.T)

    mean_ranks = {
        method: float(np.mean(column)) for method, column in zip(METHODS, rank_matrix.T)
    }

    # Nemenyi critical difference: q_alpha * sqrt(k (k + 1) / (6 N)).
    q_alpha = _nemenyi_q_alpha(k=N_METHODS, alpha=0.05)
    cd = q_alpha * math.sqrt(N_METHODS * (N_METHODS + 1) / (6 * n_datasets))

    # Walk methods from best to worst mean rank and collect maximal groups.
    ordered = sorted(mean_ranks.items(), key=lambda item: item[1])
    cliques = []
    for pos, (anchor, anchor_rank) in enumerate(ordered):
        members = [anchor]
        members.extend(name for name, rank in ordered[pos + 1:] if rank - anchor_rank < cd)
        if len(members) < 2:
            continue
        # Skip groups already contained in a clique found earlier.
        if any(set(members).issubset(set(found)) for found in cliques):
            continue
        cliques.append(members)

    return {
        "friedman_chi_sq": float(chi2),
        "friedman_p": float(p_friedman),
        "mean_ranks": mean_ranks,
        "nemenyi_cd": cd,
        "nemenyi_q_alpha": q_alpha,
        "cd_cliques": cliques,
        "sorted_methods": [name for name, _ in ordered],
        "sorted_ranks": [float(rank) for _, rank in ordered],
    }

block2 = block2_critical_difference(data=structured_data, datasets=datasets, max_rules=10)

# Test statistics first, then methods from best to worst mean rank.
print("Friedman chi² = {:.2f}, p = {:.2e}".format(block2["friedman_chi_sq"], block2["friedman_p"]))
print("Nemenyi CD = {:.3f}".format(block2["nemenyi_cd"]))
print("\nMean Ranks:")
for m, r in sorted(block2["mean_ranks"].items(), key=lambda item: item[1]):
    print("  {:<20} {:.2f}".format(m, r))
print("\nCD Cliques: {}".format(block2["cd_cliques"]))

### Block 5: Accuracy-at-Matched-Complexity Curves

Computes mean accuracy at `max_rules={5,10,15}` for each method and tests whether SG-FIGS-25 peaks at lower complexity than FIGS.

In [None]:
# ---------------------------------------------------------------------------
# Block 5: Accuracy-at-Matched-Complexity Curves (from eval.py)
# ---------------------------------------------------------------------------

def block5_complexity_curves(
    data: dict,
    datasets: list[str],
) -> dict:
    """Accuracy-vs-complexity curves plus a peak-complexity sign test.

    Per method, fold accuracies are pooled over all datasets and averaged
    (NaN-aware) at each value in ``MAX_RULES_VALUES``. The peak is the rule
    budget with the highest mean, with ties going to the value listed first,
    and the efficiency ratio is peak accuracy divided by that budget.

    Per dataset, the peak budget of SG-FIGS-25 is compared with that of FIGS.
    The counts of "lower peak" outcomes, ties excluded, feed a two-sided
    binomial sign test stored under ``"peak_complexity_comparison"``.
    """
    missing = float("nan")

    def _mean_accuracy(dataset_names, method, mr):
        """NaN-aware mean accuracy of ``method`` at ``mr`` over the given datasets."""
        pooled = []
        for name in dataset_names:
            folds = data.get(name, {}).get(method, {}).get(mr, {})
            pooled.extend(
                folds.get(fold, {}).get("accuracy", missing) for fold in range(N_FOLDS)
            )
        return float(np.nanmean(pooled))

    results = {}
    for method in METHODS:
        curve = {mr: _mean_accuracy(datasets, method, mr) for mr in MAX_RULES_VALUES}
        best_mr = max(curve, key=curve.get)
        best_acc = curve[best_mr]
        results[method] = {
            "accuracy_at_mr5": curve[5],
            "accuracy_at_mr10": curve[10],
            "accuracy_at_mr15": curve[15],
            "peak_max_rules": best_mr,
            "peak_accuracy": best_acc,
            "efficiency_ratio": best_acc / best_mr if best_mr > 0 else 0.0,
        }

    # Sign test: which of SG-FIGS-25 / FIGS peaks at the smaller rule budget?
    tally = {"sg25": 0, "figs": 0, "ties": 0}
    for ds in datasets:
        sg25_curve, figs_curve = {}, {}
        for mr in MAX_RULES_VALUES:
            sg25_curve[mr] = _mean_accuracy([ds], "SG-FIGS-25", mr)
            figs_curve[mr] = _mean_accuracy([ds], "FIGS", mr)
        sg25_peak = max(sg25_curve, key=sg25_curve.get)
        figs_peak = max(figs_curve, key=figs_curve.get)
        if sg25_peak < figs_peak:
            tally["sg25"] += 1
        elif sg25_peak > figs_peak:
            tally["figs"] += 1
        else:
            tally["ties"] += 1

    decisive = tally["sg25"] + tally["figs"]
    sign_test_p = 1.0
    if decisive > 0:
        sign_test_p = float(stats.binomtest(tally["sg25"], decisive, 0.5).pvalue)

    results["peak_complexity_comparison"] = {
        "sg25_peaks_lower": tally["sg25"],
        "figs_peaks_lower": tally["figs"],
        "ties": tally["ties"],
        "sign_test_p": sign_test_p,
    }
    return results

block5 = block5_complexity_curves(data=structured_data, datasets=datasets)

# One row per method: accuracy at each rule budget, peak budget, efficiency.
print("Block 5: Accuracy at Matched Complexity")
print("{:<20} {:>8} {:>8} {:>8} {:>6} {:>10}".format(
    "Method", "MR=5", "MR=10", "MR=15", "Peak", "Eff.Ratio"))
print("-" * 62)
for m in METHODS:
    b = block5[m]
    print("{:<20} {:>8.4f} {:>8.4f} {:>8.4f} {:>6.0f} {:>10.4f}".format(
        m,
        b["accuracy_at_mr5"],
        b["accuracy_at_mr10"],
        b["accuracy_at_mr15"],
        b["peak_max_rules"],
        b["efficiency_ratio"],
    ))
pcc = block5["peak_complexity_comparison"]
print("\nSign test: SG25 lower={}, FIGS lower={}, ties={}, p={:.3f}".format(
    pcc["sg25_peaks_lower"], pcc["figs_peaks_lower"], pcc["ties"], pcc["sign_test_p"]))

### Block 6: Ablation Decomposition

Decomposes accuracy changes into `oblique_penalty` (FIGS - RO-FIGS) and `synergy_effect` (SG-FIGS-25 - RO-FIGS), classifying each dataset as `oblique_harmful`, `synergy_helps`, `synergy_hurts`, or `neutral`.

In [None]:
# ---------------------------------------------------------------------------
# Block 6: Ablation Decomposition (from eval.py)
# ---------------------------------------------------------------------------

def block6_ablation_decomposition(
    data: dict,
    datasets: list[str],
    max_rules: int = 10,
) -> dict:
    """Decompose accuracy changes into oblique penalty and synergy effect.

    Per dataset, using NaN-aware mean fold accuracy at ``max_rules``:

    * ``oblique_penalty`` = FIGS - RO-FIGS
    * ``synergy_effect``  = SG-FIGS-25 - RO-FIGS
    * ``total_gap``       = FIGS - SG-FIGS-25

    Each dataset gets exactly one category, checked in this order:

    1. ``synergy_hurts``   if ``synergy_effect < -0.05``
    2. ``oblique_harmful`` if ``oblique_penalty > 0.05``
    3. ``synergy_helps``   if ``synergy_effect > 0``
    4. ``neutral``         otherwise (including NaN effects)

    Args:
        data: Nested results ``{dataset: {method: {max_rules: {fold: {...}}}}}``.
        datasets: Dataset names to analyse; may be empty.
        max_rules: Rule budget at which accuracies are compared.

    Returns:
        ``{"per_dataset", "category_counts", "category_fractions"}``. For an
        empty ``datasets`` list every fraction is 0.0.
    """
    per_dataset = {}
    category_counts = {"oblique_harmful": 0, "synergy_helps": 0, "synergy_hurts": 0, "neutral": 0}

    for ds in datasets:
        ds_data = data.get(ds, {})
        means = {}
        for method in ("FIGS", "RO-FIGS", "SG-FIGS-25"):
            folds = ds_data.get(method, {}).get(max_rules, {})
            accs = [folds.get(fold, {}).get("accuracy", float("nan")) for fold in range(N_FOLDS)]
            means[method] = float(np.nanmean(accs))

        figs_acc = means["FIGS"]
        rofigs_acc = means["RO-FIGS"]
        sgfigs25_acc = means["SG-FIGS-25"]

        oblique_penalty = figs_acc - rofigs_acc
        synergy_effect = sgfigs25_acc - rofigs_acc

        # A clear synergy loss takes precedence over the oblique label; the
        # oblique label in turn takes precedence over a synergy gain.
        if synergy_effect < -0.05:
            category = "synergy_hurts"
        elif oblique_penalty > 0.05:
            category = "oblique_harmful"
        elif synergy_effect > 0:
            category = "synergy_helps"
        else:
            category = "neutral"
        category_counts[category] += 1

        per_dataset[ds] = {
            "figs_accuracy": figs_acc,
            "rofigs_accuracy": rofigs_acc,
            "sgfigs25_accuracy": sgfigs25_acc,
            "oblique_penalty": oblique_penalty,
            "synergy_effect": synergy_effect,
            "total_gap": figs_acc - sgfigs25_acc,
            "category": category,
        }

    n_datasets = len(datasets)
    if n_datasets:
        category_fractions = {name: count / n_datasets for name, count in category_counts.items()}
    else:
        # Nothing to classify: report zero shares instead of dividing by zero.
        category_fractions = {name: 0.0 for name in category_counts}

    return {
        "per_dataset": per_dataset,
        "category_counts": category_counts,
        "category_fractions": category_fractions,
    }

block6 = block6_ablation_decomposition(data=structured_data, datasets=datasets, max_rules=10)

# One row per dataset: the three mean accuracies, both effects, the category.
print("Block 6: Ablation Decomposition")
print("{:<15} {:>8} {:>8} {:>8} {:>8} {:>8} {:<18}".format(
    "Dataset", "FIGS", "RO-FIGS", "SG25", "OblPen", "SynEff", "Category"))
print("-" * 85)
for ds in datasets:
    b = block6["per_dataset"][ds]
    print("{:<15} {:>8.4f} {:>8.4f} {:>8.4f} {:>+8.4f} {:>+8.4f} {:<18}".format(
        ds,
        b["figs_accuracy"],
        b["rofigs_accuracy"],
        b["sgfigs25_accuracy"],
        b["oblique_penalty"],
        b["synergy_effect"],
        b["category"],
    ))
print("\nCategory fractions: {}".format(block6["category_fractions"]))

### Aggregate Metrics Summary

Display key aggregate metrics from the pre-computed `metrics_agg` dictionary.

In [None]:
# Show the aggregate metrics that ship pre-computed inside the data file.
metrics = data["metrics_agg"]

banner = "=" * 60
print(banner)
print("AGGREGATE METRICS SUMMARY")
print(banner)

print("\nFriedman test: chi²={:.2f}, p={:.2e}".format(
    metrics["eval_friedman_chi_sq"], metrics["eval_friedman_p"]))
print("Nemenyi CD: {:.3f}".format(metrics["eval_nemenyi_cd"]))

print("\nMethod Rankings (from full eval):")
print("  {:<20} {:>10} {:>15}".format("Method", "Mean Rank", "Grand Mean Acc"))
print("  " + "-" * 47)
for m in METHODS:
    # Metric keys use underscores where method names use hyphens.
    safe = m.replace("-", "_")
    rank = metrics["eval_mean_rank_" + safe]
    acc = metrics["eval_grand_mean_accuracy_" + safe]
    print("  {:<20} {:>10.2f} {:>15.4f}".format(m, rank, acc))

print("\nAblation fractions:")
for cat in ("oblique_harmful", "synergy_helps", "synergy_hurts", "neutral"):
    frac = metrics["eval_ablation_frac_" + cat]
    print("  {}: {:.1%}".format(cat, frac))

print("\nPositive case studies: {}".format(int(metrics["eval_n_positive_case_studies"])))

### Visualization

Three-panel figure: (1) Mean accuracy per method per dataset, (2) Accuracy-at-complexity curves, (3) Ablation decomposition.

In [None]:
def visualize_results(block1_res, block5_res, block6_res, datasets_list, title_prefix=""):
    """Draw the three-panel summary figure for one set of block results.

    Panels, left to right:
      1. Mean accuracy per method per dataset (grouped bar chart)
      2. Mean accuracy per method across MAX_RULES_VALUES (line plot)
      3. Oblique penalty and synergy effect per dataset (side-by-side bars)

    ``title_prefix`` is prepended to every panel title. The figure is shown
    with ``plt.show()`` and nothing is returned.
    """
    palette = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b"]
    short_labels = ["FIGS", "RO-FIGS", "SG10", "SG25", "SG50", "GB"]

    fig, (ax_bars, ax_curves, ax_ablation) = plt.subplots(1, 3, figsize=(18, 5))

    # --- Panel 1: one bar group per dataset, one bar per method ---
    positions = np.arange(len(datasets_list))
    bar_width = 0.12
    for idx, method in enumerate(METHODS):
        heights = [
            block1_res[name]["method_stats"][method]["accuracy_mean"] for name in datasets_list
        ]
        # Subtracting 2.5 widths centres the bars on the dataset tick.
        ax_bars.bar(
            positions + idx * bar_width - bar_width * 2.5,
            heights,
            bar_width,
            label=short_labels[idx],
            color=palette[idx],
        )
    ax_bars.set_xticks(positions)
    ax_bars.set_xticklabels(datasets_list, rotation=45, ha="right", fontsize=9)
    ax_bars.set_ylabel("Mean Accuracy")
    ax_bars.set_title(f"{title_prefix}Accuracy by Method & Dataset")
    ax_bars.legend(fontsize=7, ncol=2)
    ax_bars.set_ylim(0.4, 1.05)

    # --- Panel 2: accuracy against rule budget, one line per method ---
    budgets = MAX_RULES_VALUES
    for idx, method in enumerate(METHODS):
        curve = [block5_res[method][f"accuracy_at_mr{budget}"] for budget in budgets]
        ax_curves.plot(
            budgets, curve, "o-", label=short_labels[idx], color=palette[idx], linewidth=2
        )
    ax_curves.set_xlabel("max_rules")
    ax_curves.set_ylabel("Mean Accuracy")
    ax_curves.set_title(f"{title_prefix}Accuracy vs Complexity")
    ax_curves.legend(fontsize=7)
    ax_curves.set_xticks(budgets)

    # --- Panel 3: the two ablation effects next to each other per dataset ---
    ablation = block6_res["per_dataset"]
    penalties = [ablation[name]["oblique_penalty"] for name in datasets_list]
    effects = [ablation[name]["synergy_effect"] for name in datasets_list]
    positions = np.arange(len(datasets_list))
    ax_ablation.bar(positions - 0.15, penalties, 0.3, label="Oblique Penalty", color="#1f77b4")
    ax_ablation.bar(positions + 0.15, effects, 0.3, label="Synergy Effect", color="#2ca02c")
    ax_ablation.axhline(y=0, color="black", linewidth=0.5)
    ax_ablation.set_xticks(positions)
    ax_ablation.set_xticklabels(datasets_list, rotation=45, ha="right", fontsize=9)
    ax_ablation.set_ylabel("Accuracy Difference")
    ax_ablation.set_title(f"{title_prefix}Ablation Decomposition")
    ax_ablation.legend(fontsize=8)

    plt.tight_layout()
    plt.show()

# Part 1 figure: plot the mini-data results computed in the cells above.
visualize_results(
    block1_res=block1,
    block5_res=block5,
    block6_res=block6,
    datasets_list=datasets,
    title_prefix="[Mini] ",
)

---
## Part 2 — Full Run (Original Parameters)

Loads the full dataset (all 12 datasets, 360 examples) and re-runs all analysis blocks with original parameters.

In [None]:
# Fetch the full payload; `data` now refers to it instead of the mini data.
data = load_full()
dataset_blocks = data["datasets"]
n_examples = sum(len(entry["examples"]) for entry in dataset_blocks)
print("Loaded full data: {} datasets, {} examples".format(len(dataset_blocks), n_examples))
print("Datasets: {}".format([entry["dataset"] for entry in dataset_blocks]))

In [None]:
# Rebuild the nested structure from the full data; later cells reuse it.
structured_data = restructure_eval_data(data)
datasets = list(structured_data)

# Block 1 on every loaded dataset at max_rules=10.
block1 = block1_per_dataset_accuracy(data=structured_data, datasets=datasets, max_rules=10)

print("Block 1: Per-Dataset Accuracy (max_rules=10) — Full Run")
print("{:<15}".format("Dataset") + "".join("{:>16}".format(name) for name in METHODS))
print("-" * (15 + 16 * len(METHODS)))
for ds in datasets:
    cells = []
    for m in METHODS:
        s = block1[ds]["method_stats"][m]
        cells.append("  {:.3f}±{:.3f}".format(s["accuracy_mean"], s["accuracy_std"]))
    print("{:<15}".format(ds) + "".join(cells))

In [None]:
# Block 2 on the full data: Friedman test, Nemenyi CD and rank cliques.
block2 = block2_critical_difference(data=structured_data, datasets=datasets, max_rules=10)

print("Friedman chi² = {:.2f}, p = {:.2e}".format(block2["friedman_chi_sq"], block2["friedman_p"]))
print("Nemenyi CD = {:.3f}".format(block2["nemenyi_cd"]))
print("\nMean Ranks:")
for m, r in sorted(block2["mean_ranks"].items(), key=lambda item: item[1]):
    print("  {:<20} {:.2f}".format(m, r))
print("\nCD Cliques: {}".format(block2["cd_cliques"]))

In [None]:
# Block 5 on the full data: accuracy per rule budget and the peak sign test.
block5 = block5_complexity_curves(data=structured_data, datasets=datasets)

print("Block 5: Accuracy at Matched Complexity — Full Run")
print("{:<20} {:>8} {:>8} {:>8} {:>6} {:>10}".format(
    "Method", "MR=5", "MR=10", "MR=15", "Peak", "Eff.Ratio"))
print("-" * 62)
for m in METHODS:
    b = block5[m]
    print("{:<20} {:>8.4f} {:>8.4f} {:>8.4f} {:>6.0f} {:>10.4f}".format(
        m,
        b["accuracy_at_mr5"],
        b["accuracy_at_mr10"],
        b["accuracy_at_mr15"],
        b["peak_max_rules"],
        b["efficiency_ratio"],
    ))
pcc = block5["peak_complexity_comparison"]
print("\nSign test: SG25 lower={}, FIGS lower={}, ties={}, p={:.3f}".format(
    pcc["sg25_peaks_lower"], pcc["figs_peaks_lower"], pcc["ties"], pcc["sign_test_p"]))

In [None]:
# Block 6 on the full data: per-dataset effects and category shares.
block6 = block6_ablation_decomposition(data=structured_data, datasets=datasets, max_rules=10)

print("Block 6: Ablation Decomposition — Full Run")
print("{:<15} {:>8} {:>8} {:>8} {:>8} {:>8} {:<18}".format(
    "Dataset", "FIGS", "RO-FIGS", "SG25", "OblPen", "SynEff", "Category"))
print("-" * 85)
for ds in datasets:
    b = block6["per_dataset"][ds]
    print("{:<15} {:>8.4f} {:>8.4f} {:>8.4f} {:>+8.4f} {:>+8.4f} {:<18}".format(
        ds,
        b["figs_accuracy"],
        b["rofigs_accuracy"],
        b["sgfigs25_accuracy"],
        b["oblique_penalty"],
        b["synergy_effect"],
        b["category"],
    ))
print("\nCategory fractions: {}".format(block6["category_fractions"]))

In [None]:
# Show the aggregate metrics that ship pre-computed inside the full data file.
metrics = data["metrics_agg"]

banner = "=" * 60
print(banner)
print("FULL AGGREGATE METRICS SUMMARY (12 datasets)")
print(banner)

print("\nFriedman test: chi²={:.2f}, p={:.2e}".format(
    metrics["eval_friedman_chi_sq"], metrics["eval_friedman_p"]))
print("Nemenyi CD: {:.3f}".format(metrics["eval_nemenyi_cd"]))

print("\nMethod Rankings:")
print("  {:<20} {:>10} {:>15}".format("Method", "Mean Rank", "Grand Mean Acc"))
print("  " + "-" * 47)
for m in METHODS:
    # Metric keys use underscores where method names use hyphens.
    safe = m.replace("-", "_")
    rank = metrics["eval_mean_rank_" + safe]
    acc = metrics["eval_grand_mean_accuracy_" + safe]
    print("  {:<20} {:>10.2f} {:>15.4f}".format(m, rank, acc))

print("\nAblation fractions:")
for cat in ("oblique_harmful", "synergy_helps", "synergy_hurts", "neutral"):
    frac = metrics["eval_ablation_frac_" + cat]
    print("  {}: {:.1%}".format(cat, frac))

print("\nPositive case studies: {}".format(int(metrics["eval_n_positive_case_studies"])))

In [None]:
# Part 2 figure: same plotting function, now fed the full-data results.
visualize_results(
    block1_res=block1,
    block5_res=block5,
    block6_res=block6,
    datasets_list=datasets,
    title_prefix="[Full] ",
)