# Nonlinearity Reduction Experiment Analysis

This notebook aggregates and analyzes the results produced by the experimental framework.

Features:
- Loads `history.json` / `meta.json` from multiple runs in `runs/`
- Extracts % linear neurons (from run naming or gating stats)
- Plots accuracy vs. % linear neurons (Tasks 1 & 2)
- Visualizes gating alpha distributions over epochs (Approach 2)
- Compares latency vs. accuracy (efficiency trade-off)
- Computes Pareto frontier (accuracy vs. latency)
- Aggregates per-layer gating stats (who becomes linear?)
- Provides hooks for activation stats integration (post-pruning)

Adapt / extend for later tasks (ImageNet, Transformers, Tabular).

In [None]:
# Environment & Imports
import os, json, re, math, glob
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any
from collections import defaultdict

%matplotlib inline
sns.set_theme(context="notebook", style="whitegrid")

RUNS_DIR = Path("runs")  # adjust if needed

def percent(x):
    """Render a fraction as a percentage string with two decimal places.

    For example, ``percent(0.5)`` gives ``"50.00%"``.
    """
    # Scale to percent first, then apply fixed-point formatting.
    return format(100 * x, ".2f") + "%"

## 1. Load All Runs

Expect each run directory to contain:
- `history.json`: list of epoch dicts
- `meta.json`: metadata (param counts, latency, etc.)

We infer linear ratio:
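1. From the run name: the first `_`-separated token that is an integer in 0–100 (e.g. `mnist_fixed_50` → 0.50)
2. For gating runs, from the final epoch's gating stats: 1 − mean of the per-layer `alpha_mean` values (a proxy for the fraction of linear neurons); when available, this estimate is used instead of the name-based one in the summary table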

For gating runs, we also extract alpha stats per epoch (stored in history).

In [None]:
def infer_ratio_from_name(name: str):
    """Infer the linear-neuron ratio encoded in a run name.

    The name is split on ``_`` and the first token that is a decimal integer
    in the range 0-100 is interpreted as a percentage
    (e.g. ``mnist_fixed_50`` -> ``0.5``).

    Args:
        name: Run directory name.

    Returns:
        The ratio as a float in ``[0.0, 1.0]``, or ``None`` when no token
        qualifies.
    """
    for token in name.split('_'):
        # Use ``isdecimal`` rather than ``isdigit``: ``isdigit`` is also true
        # for characters such as superscripts ("²"), which ``int()`` rejects
        # with ValueError.
        if not token.isdecimal():
            continue
        val = int(token)
        if 0 <= val <= 100:
            return val / 100.0
    return None

def load_runs(runs_dir=RUNS_DIR):
    """Load every run directory under ``runs_dir`` into a list of dicts.

    A sub-directory counts as a run when it contains both ``history.json``
    and ``meta.json``. Directories missing either file are skipped silently;
    directories whose files cannot be read or parsed are skipped with a
    warning so that dropped runs do not go unnoticed.

    Args:
        runs_dir: Directory containing one sub-directory per run
            (``str`` or ``Path``).

    Returns:
        A list of dicts, in sorted directory order, with keys ``name``,
        ``history``, ``meta``, ``config``, ``approach``, ``ratio_name``
        (ratio parsed from the run name, or ``None``) and
        ``gating_linear_ratio`` (``1 - mean(alpha_mean)`` over the final
        epoch's gating stats for gating runs, otherwise ``None``).
    """
    import warnings  # local import so this cell does not depend on a new top-level import

    runs = []
    # Sort so run order (and therefore previews and plot legends) is
    # reproducible; directory listing order depends on the filesystem.
    for path in sorted(Path(runs_dir).glob('*')):
        if not path.is_dir():
            continue
        hist_path = path / 'history.json'
        meta_path = path / 'meta.json'
        if not hist_path.exists() or not meta_path.exists():
            continue
        try:
            history = json.loads(hist_path.read_text())
            meta = json.loads(meta_path.read_text())
        except (ValueError, OSError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            warnings.warn(f"Skipping run '{path.name}': could not load JSON ({exc})")
            continue
        run_name = path.name
        # NOTE(review): assumes meta.json holds a JSON object and history.json
        # a list of per-epoch objects - confirm against the training code.
        cfg = meta.get('config') or {}
        approach = cfg.get('approach', 'unknown')
        # Base ratio inference from the run name
        ratio = infer_ratio_from_name(run_name)
        # For gating runs, derive the final effective *linear* fraction from
        # the last epoch's gating stats.
        gating_final = None
        if approach == 'gating' and history:
            gstats = history[-1].get('gating') or []
            alphas = [
                layer_stat['alpha_mean']
                for layer_stat in gstats
                if layer_stat.get('alpha_mean') is not None
            ]
            if alphas:
                # Mean alpha is used as a proxy: linear-ish fraction = 1 - mean(alpha)
                gating_final = 1 - np.mean(alphas)
        runs.append({
            'name': run_name,
            'history': history,
            'meta': meta,
            'config': cfg,
            'approach': approach,
            'ratio_name': ratio,
            'gating_linear_ratio': gating_final,
        })
    return runs

# Load every run found under the default runs directory and report the count.
runs = load_runs()
print(f"Loaded {len(runs)} runs.")
# Last expression of the cell: the notebook displays the first two run records.
runs[:2]  # preview

## 2. Build Summary Table

Extract final-epoch metrics (val/train accuracy, val/train loss), plus latency, parameter count, approximate linear FLOPs, and memory from the run metadata.

In [None]:
def summarize_runs(runs):
    """Build a one-row-per-run summary table from loaded run records.

    Runs with an empty history are skipped. Metrics come from the final
    history entry; latency, parameter count, FLOPs and memory come from the
    run's ``meta`` dict. For gating runs with a gating-derived ratio, that
    ratio replaces the name-derived one in ``linear_ratio_est``.

    Args:
        runs: Iterable of run dicts with keys ``name``, ``history``,
            ``meta``, ``approach``, ``ratio_name`` and
            ``gating_linear_ratio``.

    Returns:
        A ``pandas.DataFrame`` that always carries the full set of summary
        columns, even when no run qualifies, so downstream
        ``sort_values`` / ``dropna(subset=...)`` calls do not raise KeyError.
    """
    columns = [
        'run', 'approach', 'linear_ratio_est', 'val_acc', 'train_acc',
        'val_loss', 'train_loss', 'epochs', 'mean_latency_s',
        'p50_latency_s', 'p90_latency_s', 'params', 'flops_linear_only',
        'memory_mb',
    ]
    rows = []
    for r in runs:
        if not r['history']:
            continue
        final = r['history'][-1]
        meta = r['meta']
        # ``or {}`` also covers keys that are present but null in meta.json.
        latency = meta.get('latency') or {}
        param_counts = meta.get('param_counts') or {}
        approx_ratio = r['ratio_name']
        if r['approach'] == 'gating' and r['gating_linear_ratio'] is not None:
            approx_ratio = r['gating_linear_ratio']
        rows.append({
            'run': r['name'],
            'approach': r['approach'],
            'linear_ratio_est': approx_ratio,
            'val_acc': final.get('val_acc'),
            'train_acc': final.get('train_acc'),
            'val_loss': final.get('val_loss'),
            'train_loss': final.get('train_loss'),
            'epochs': final.get('epoch'),
            'mean_latency_s': latency.get('mean_latency_s'),
            'p50_latency_s': latency.get('p50_latency_s'),
            'p90_latency_s': latency.get('p90_latency_s'),
            'params': param_counts.get('trainable_params'),
            'flops_linear_only': meta.get('approx_linear_flops'),
            'memory_mb': meta.get('memory_mb'),
        })
    return pd.DataFrame(rows, columns=columns)

# Build the one-row-per-run summary table used by the cells that follow.
summary_df = summarize_runs(runs)
# Last expression of the cell: preview the rows with the lowest estimated linear ratio.
summary_df.sort_values('linear_ratio_est').head()

## 3. Accuracy vs Linear Ratio

Plotted for fixed approach + gating (gating uses effective ratio estimate).

In [None]:
def plot_accuracy_vs_ratio(df):
    """Plot validation accuracy against the estimated linear ratio.

    Rows missing ``linear_ratio_est`` or ``val_acc`` are dropped first; the
    remaining points are drawn as one marked line per ``approach``.

    Args:
        df: Summary table with ``linear_ratio_est``, ``val_acc`` and
            ``approach`` columns.
    """
    plottable = df.dropna(subset=['linear_ratio_est', 'val_acc']).copy()
    fig, ax = plt.subplots(figsize=(7, 5))
    sns.lineplot(
        data=plottable,
        x='linear_ratio_est',
        y='val_acc',
        hue='approach',
        style='approach',
        marker='o',
        ax=ax,
    )
    ax.set_xlabel('Estimated Linear Ratio')
    ax.set_ylabel('Validation Accuracy')
    ax.set_title('Accuracy vs Linear Neuron Ratio')
    ax.legend()
    fig.tight_layout()
    plt.show()

# Render the accuracy-vs-linear-ratio figure for all summarized runs.
plot_accuracy_vs_ratio(summary_df)

## 4. Latency vs Accuracy (Pareto Frontier)

Examines efficiency trade-off.

In [None]:
def compute_pareto(df, acc_col='val_acc', lat_col='mean_latency_s'):
    """Return the rows on the accuracy/latency Pareto frontier.

    A row is on the frontier when no other row has lower-or-equal latency
    together with higher-or-equal accuracy (and is strictly better in one).
    Rows with a missing value in either column are ignored.

    Args:
        df: Table containing ``acc_col`` and ``lat_col``.
        acc_col: Column to maximize.
        lat_col: Column to minimize.

    Returns:
        A DataFrame holding the frontier rows, ordered by increasing latency,
        with the original index labels, columns and dtypes. It is empty (but
        keeps the columns) when no row qualifies.
    """
    # Break latency ties by descending accuracy so that, among equally fast
    # runs, only the most accurate one can enter the frontier.
    d = df.dropna(subset=[acc_col, lat_col]).sort_values(
        [lat_col, acc_col], ascending=[True, False]
    )
    keep = []
    best_acc = -np.inf
    for pos, acc in enumerate(d[acc_col].to_numpy()):
        # Strictly better than every faster (or equally fast) run seen so far.
        if acc > best_acc:
            keep.append(pos)
            best_acc = acc
    # Positional selection preserves dtypes and is safe with duplicate index labels.
    return d.iloc[keep]

def plot_latency_tradeoff(df):
    """Scatter validation accuracy against mean latency and overlay the Pareto frontier.

    Rows missing ``val_acc`` or ``mean_latency_s`` are dropped. Point color
    and size both encode ``linear_ratio_est``; the frontier returned by
    ``compute_pareto`` is drawn as a red line when it is non-empty.

    Args:
        df: Summary table with ``val_acc``, ``mean_latency_s`` and
            ``linear_ratio_est`` columns.
    """
    points = df.dropna(subset=['val_acc', 'mean_latency_s']).copy()
    fig, ax = plt.subplots(figsize=(7, 5))
    sns.scatterplot(
        data=points,
        x='mean_latency_s',
        y='val_acc',
        hue='linear_ratio_est',
        size='linear_ratio_est',
        palette='viridis',
        sizes=(30, 180),
        ax=ax,
    )
    frontier = compute_pareto(points)
    if not frontier.empty:
        ax.plot(
            frontier['mean_latency_s'],
            frontier['val_acc'],
            color='red',
            linewidth=2,
            label='Pareto Frontier',
        )
    ax.set_xlabel('Mean Inference Latency (s)')
    ax.set_ylabel('Validation Accuracy')
    ax.set_title('Latency vs Accuracy Trade-off')
    ax.legend()
    fig.tight_layout()
    plt.show()

# Render the latency-vs-accuracy figure (with Pareto frontier) for all summarized runs.
plot_latency_tradeoff(summary_df)

## 5. Over-Time Training Curves

Compare convergence behavior for selected runs.

In [None]:
def get_run_history_df(run):
    """Convert one run's per-epoch history into a long-format DataFrame.

    Each history entry becomes a row holding its ``epoch`` (required), the
    accuracy/loss metrics (``None`` when an entry lacks them) and the run's
    name and approach.

    Args:
        run: Run dict with ``history``, ``name`` and ``approach`` keys.

    Returns:
        A DataFrame with one row per epoch entry.
    """
    metric_keys = ('val_acc', 'train_acc', 'val_loss', 'train_loss')
    records = [
        {
            'epoch': entry['epoch'],
            **{key: entry.get(key) for key in metric_keys},
            'run': run['name'],
            'approach': run['approach'],
        }
        for entry in run['history']
    ]
    return pd.DataFrame(records)

# Stack the per-epoch histories of all runs that have one into a single long table.
history_frames = [get_run_history_df(r) for r in runs if r['history']]
if history_frames:
    histories = pd.concat(history_frames, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame with the expected columns so `histories` is always defined.
    histories = pd.DataFrame(
        columns=['epoch', 'val_acc', 'train_acc', 'val_loss', 'train_loss', 'run', 'approach']
    )

if histories.empty:
    print('No run histories found; skipping training-curve plots.')
else:
    plt.figure(figsize=(8,5))
    sns.lineplot(data=histories, x='epoch', y='val_acc', hue='run')
    plt.title('Validation Accuracy Over Epochs (All Runs)')
    plt.xlabel('Epoch')
    plt.ylabel('Val Acc')
    plt.tight_layout()
    plt.show()

# Optional: filter clutter by choosing top N or pattern
# (the pattern is passed to Series.str.contains, which treats it as a regular expression by default)
subset_pattern = None  # e.g. 'mnist_fixed'
if subset_pattern and not histories.empty:
    filt = histories[histories['run'].str.contains(subset_pattern)]
    plt.figure(figsize=(8,5))
    sns.lineplot(data=filt, x='epoch', y='val_acc', hue='run')
    plt.title(f'Validation Accuracy Over Epochs ({subset_pattern})')
    plt.tight_layout()
    plt.show()

## 6. Gating Alpha Dynamics (Approach 2)

Shows how alphas evolve toward linear (low alpha) vs nonlinear (high alpha).

In [None]:
def extract_gating_time_series(runs):
    """Flatten per-epoch, per-layer gating statistics of gating runs into a table.

    Only runs whose ``approach`` is ``'gating'`` are considered, and only
    layer entries that carry ``alpha_mean``. The other statistics
    (``alpha_median``, ``alpha_lt_0.1``, ``alpha_gt_0.9``) are optional and
    become missing values when absent. A layer entry without a ``layer`` key
    is labelled with its position in the epoch's gating list.

    Args:
        runs: Iterable of run dicts with ``name``, ``approach`` and
            ``history`` keys.

    Returns:
        A DataFrame with columns ``run``, ``epoch``, ``layer``,
        ``alpha_mean``, ``alpha_median``, ``alpha_lt_0.1`` and
        ``alpha_gt_0.9``; empty (but with those columns) when there is
        nothing to report.
    """
    columns = [
        'run', 'epoch', 'layer', 'alpha_mean', 'alpha_median',
        'alpha_lt_0.1', 'alpha_gt_0.9',
    ]
    recs = []
    for r in runs:
        if r['approach'] != 'gating':
            continue
        for ep in r['history']:
            # ``or []`` also covers a ``gating`` key that is present but null.
            for idx, layer_stat in enumerate(ep.get('gating') or []):
                if 'alpha_mean' not in layer_stat:
                    continue
                recs.append({
                    'run': r['name'],
                    'epoch': ep['epoch'],
                    'layer': layer_stat.get('layer', idx),
                    'alpha_mean': layer_stat['alpha_mean'],
                    'alpha_median': layer_stat.get('alpha_median'),
                    'alpha_lt_0.1': layer_stat.get('alpha_lt_0.1'),
                    'alpha_gt_0.9': layer_stat.get('alpha_gt_0.9'),
                })
    return pd.DataFrame(recs, columns=columns)

gating_df = extract_gating_time_series(runs)
if gating_df.empty:
    print('No gating runs found for alpha dynamics.')
else:
    # Figure 1: mean gate value per layer over epochs (color = layer, line style/marker = run).
    plt.figure(figsize=(9, 5))
    sns.lineplot(
        data=gating_df,
        x='epoch',
        y='alpha_mean',
        hue='layer',
        style='run',
        markers=True,
        dashes=False,
    )
    plt.ylabel('Alpha Mean (Nonlinearity Weight)')
    plt.title('Gating Alpha Mean per Layer over Epochs')
    plt.tight_layout()
    plt.show()

    # Figure 2: the recorded `alpha_lt_0.1` statistic per layer over epochs.
    plt.figure(figsize=(9, 5))
    sns.lineplot(
        data=gating_df,
        x='epoch',
        y='alpha_lt_0.1',
        hue='layer',
        style='run',
    )
    plt.ylabel('Fraction')
    plt.title('Fraction of Nearly-Linear Neurons (alpha < 0.1)')
    plt.tight_layout()
    plt.show()

## 7. Generalization Gap vs Ratio

Compute (train_acc - val_acc) for potential overfitting differences.

In [None]:
# Generalization gap per run: final train accuracy minus final validation accuracy.
gap_df = summary_df.assign(gen_gap=summary_df['train_acc'] - summary_df['val_acc'])
plt.figure(figsize=(7, 5))
# Only runs with a known linear ratio can be placed on the x-axis.
sns.barplot(
    data=gap_df[gap_df['linear_ratio_est'].notna()],
    x='linear_ratio_est',
    y='gen_gap',
    hue='approach',
)
plt.xlabel('Linear Ratio (Estimated)')
plt.ylabel('Train Acc - Val Acc')
plt.title('Generalization Gap vs Linear Ratio')
plt.tight_layout()
plt.show()
# Last expression of the cell: the runs with the largest gap.
gap_df.sort_values(by='gen_gap', ascending=False).head()

## 8. Parameter & FLOP Normalization

Assess whether accuracy differences correlate with approximate linear-layer FLOPs (proxy) & param counts.

In [None]:
# Keep only runs that report both a validation accuracy and a FLOP estimate.
flop_df = summary_df[
    summary_df[['val_acc', 'flops_linear_only']].notna().all(axis=1)
].copy()
plt.figure(figsize=(7, 5))
sns.scatterplot(
    data=flop_df,
    x='flops_linear_only',
    y='val_acc',
    hue='approach',
    size='linear_ratio_est',
)
plt.xlabel('Approx Linear Layer FLOPs (Theoretical)')
plt.ylabel('Val Acc')
plt.title('Accuracy vs Approx Linear FLOPs')
plt.tight_layout()
plt.show()
# Last expression of the cell: the most accurate runs with their FLOP estimates.
flop_df.sort_values(by='val_acc', ascending=False)[
    ['run', 'flops_linear_only', 'val_acc', 'linear_ratio_est']
].head()

## 9. Efficiency Composite Metric

Example: Accuracy / Latency, to rank runs on a simple scalar efficiency score.

In [None]:
# Efficiency score: validation accuracy divided by mean latency expressed in milliseconds.
eff_df = summary_df.dropna(subset=['val_acc', 'mean_latency_s']).assign(
    acc_per_ms=lambda frame: frame['val_acc'] / (frame['mean_latency_s'] * 1000)
)
# Last expression of the cell: the ten most efficient runs.
eff_df.sort_values(by='acc_per_ms', ascending=False).head(10)

## 10. Save Aggregated CSV

Useful for external reporting.

In [None]:
# Write the per-run summary table to a CSV file (relative path, index column omitted).
summary_path = Path('analysis_summary.csv')
summary_df.to_csv(summary_path, index=False)
# Print the resolved absolute path so the file is easy to locate.
print(f'Saved summary to {summary_path.resolve()}')

## 11. Hooks for Activation Stats / Pruning Analysis (Extend Later)

If you saved activation stats (e.g., before/after pruning) in a JSON file per run, load and compare here.

In [None]:
# Example placeholder for activation stats integration
def load_activation_stats(run_dir: Path):
    """Read ``activation_stats.json`` from a run directory, if present.

    Args:
        run_dir: Directory of a single run.

    Returns:
        The parsed JSON content, or ``None`` when the file does not exist or
        does not contain valid JSON.
    """
    stats_file = run_dir / 'activation_stats.json'
    if not stats_file.exists():
        return None
    try:
        parsed = json.loads(stats_file.read_text())
    except json.JSONDecodeError:
        # Best effort: an unparsable file is treated like a missing one.
        parsed = None
    return parsed

# Collect one flat record per (run, layer) from whatever activation stats exist.
activation_records = []
for r in runs:
    stats = load_activation_stats(RUNS_DIR / r['name'])
    if not stats:
        continue
    # Flatten layer stats example (depends on your actual format):
    # assumes each value in `stats` is a mapping of stat name -> value.
    activation_records.extend(
        {'run': r['name'], 'layer': layer, **st}
        for layer, st in stats.items()
    )

if not activation_records:
    print('No activation stats found (expected if pruning not run).')
else:
    act_df = pd.DataFrame(activation_records)
    display(act_df.head())

## 12. Findings Template (Fill Manually)

| Aspect | Observation | Notes |
|--------|------------|-------|
| Min nonlinear % (MNIST) |  |  |
| Min nonlinear % (CIFAR-10 head) |  |  |
| Gating convergence pattern |  |  |
| Latency gain at 50% linear |  |  |
| Generalization gap trend |  |  |
| Layer sensitivity (if layerwise) |  |  |
| Pruning vs fixed baseline |  |  |

Use this section to consolidate structured insights for report writing.

## 13. Next Extensions
- Add violin plots for per-layer alpha distributions
- Integrate energy metrics (if recorded)
- Add adversarial robustness evaluation overlay
- Compare early vs late layers (requires saving layer indices mapping)
- Multi-dataset aggregation (tabular vs vision vs NLP)

Feel free to adapt and push improvements.