# KDM — SOM Hyperparameter Tuning
### Grid Size · Iterations · Samples

Based on **Section 3.2.3** of Septier et al. — the paper tunes three hyperparameters in sequence:
1. **Grid size** (number of nodes) — scanned first, with iterations held at `FIXED_N_ITER` and all samples used  
2. **Number of iterations** — scanned after optimal grid found  
3. **Number of training samples** — scanned last

Two scoring metrics guide the search:
- **TE** (Topological Error) → measures topology preservation → target ≈ 0  
- **QE** (Quantisation Error) → measures map resolution → target small  

Additionally, **classification performance** (balanced accuracy) is measured to find the grid size that best separates crisis vs non-crisis events.

---

## 0 · Configuration

In [None]:
# ╔══════════════════════════════════════════════════════╗
# ║              HYPERPARAMETER TUNING CONFIG            ║
# ╚══════════════════════════════════════════════════════╝
# Every name defined here is read by later cells; edit values, not names.

INPUT_CSV    = "your_catalogue.csv"   # ← your catalogue
CSV_SEP      = ","
FEATURE_COLS = ["N+", "T+", "R+", "dm+"]   # columns fed to the SOM (matched case-insensitively by the classifier)
OUTPUT_DIR   = "kdm_tuning"                # created on import; receives PNG plots and CSV tables

# ── Search grids ─────────────────────────────────────────
GRID_SIZES   = [2, 3, 4, 5, 6, 8, 10]     # grid side lengths to test (2→4 nodes … 10→100)
ITER_VALUES  = [1_000, 5_000, 10_000, 25_000, 50_000, 100_000]   # iterations to test
SAMPLE_FRACS = [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]  # fraction of data as training samples

# ── Fixed values while tuning the other param ────────────
FIXED_GRID_SIZE   = 4        # used by the run_one() sanity check; Phases 2-3 use the grid chosen in Phase 1
FIXED_N_ITER      = 50_000   # used while tuning grid size (Phase 1)
FIXED_SAMPLE_FRAC = 1.0      # informational: Phases 1-2 pass n_samples=None, i.e. train on all samples

# ── Repeatability ─────────────────────────────────────────
N_REPEATS    = 3             # repeat each config N times (variance estimate)
RANDOM_SEEDS = [42, 7, 123]  # one seed per repeat

# The scan loops iterate over RANDOM_SEEDS[:N_REPEATS]; slicing silently
# truncates, so a mismatch would quietly run fewer repeats than requested.
if not 1 <= N_REPEATS <= len(RANDOM_SEEDS):
    raise ValueError(
        f"N_REPEATS={N_REPEATS} must be between 1 and len(RANDOM_SEEDS)={len(RANDOM_SEEDS)}"
    )

# ── Convergence check (ratio stability) ──────────────────
# Paper: crisis/non-crisis ratio should not vary > 5% across repeats.
# The scans compare the standard deviation of the crisis ratio to this value.
RATIO_TOLERANCE = 0.05

print("✓ Config loaded")
print(f"  Grid sizes   : {GRID_SIZES}")
print(f"  Iterations   : {ITER_VALUES}")
print(f"  Sample fracs : {SAMPLE_FRACS}")
print(f"  Repeats      : {N_REPEATS}")

## 1 · Imports

In [None]:
import os, time, warnings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import RobustScaler

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides NumPy numerical warnings (overflow, empty-slice means).
warnings.filterwarnings("ignore")

# Plots and CSV tables are written here; an existing folder is reused.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Shared matplotlib styling applied to every figure in the notebook.
for _rc_key, _rc_value in (
    ("figure.dpi", 130),
    ("axes.spines.top", False),
    ("axes.spines.right", False),
    ("axes.grid", True),
    ("grid.alpha", 0.3),
):
    plt.rcParams[_rc_key] = _rc_value
print("✓ Libraries ready")

## 2 · SOM & Classifier (compact versions for fast tuning)

In [None]:
class SOM:
    """Minimal self-organising map on a square ``G × G`` grid, for tuning runs.

    Nodes are stored flat: node ``k`` has weight vector ``W[k]`` and grid
    position ``_xy[k]``. Training is online (one randomly drawn sample per
    iteration) with an exponentially decaying learning rate and Gaussian
    neighbourhood width.

    Parameters
    ----------
    grid_size : int
        Side length ``G`` of the grid; the map has ``G * G`` nodes.
    n_features : int
        Dimensionality of the input vectors.
    n_iterations : int
        Number of single-sample update steps performed by :meth:`fit`.
    learning_rate : float
        Initial learning rate.
    random_state : int
        Seed for the internal ``numpy`` random generator (weight
        initialisation, subsampling and sample draws).
    """

    def __init__(self, grid_size, n_features, n_iterations,
                 learning_rate=0.5, random_state=42):
        self.G  = grid_size
        self.nf = n_features
        self.ni = n_iterations
        self.lr0 = learning_rate
        self.s0  = grid_size / 2.0          # initial neighbourhood width
        self.rng = np.random.default_rng(random_state)
        self.W   = self.rng.uniform(0, 1, (grid_size * grid_size, n_features)).astype(np.float32)
        # Default meshgrid indexing is "xy", so _xy[k] == (k % G, k // G).
        xx, yy   = np.meshgrid(np.arange(grid_size), np.arange(grid_size))
        self._xy = np.column_stack([xx.ravel(), yy.ravel()])

    def _sqdist(self, x):
        """Return the squared Euclidean distance from ``x`` to every node."""
        d = self.W - x
        return np.einsum("ij,ij->i", d, d)

    def _bmu(self, x):
        """Return the flat index of the best-matching unit (nearest node) for ``x``."""
        return int(np.argmin(self._sqdist(x)))

    def fit(self, X, n_samples=None):
        """Train the map in place and return ``self``.

        Parameters
        ----------
        X : array-like of shape (n_events, n_features)
            Training data; cast to float32.
        n_samples : int or None
            If given and smaller than ``len(X)``, training draws only from a
            random subset of that many rows (chosen without replacement).
        """
        X   = X.astype(np.float32)
        n   = len(X)
        # The 1e-9 keeps the denominator non-zero when s0 == 1 (2×2 grid);
        # sigma then stays at ~1 for the whole run.
        tau = self.ni / np.log(self.s0 + 1e-9)
        # Subsample if n_samples specified
        if n_samples is not None and n_samples < n:
            idx = self.rng.choice(n, n_samples, replace=False)
            X   = X[idx]
            n   = n_samples
        for t in range(1, self.ni + 1):
            sigma = self.s0  * np.exp(-t / tau)
            lr    = self.lr0 * np.exp(-t / self.ni)
            x     = X[self.rng.integers(0, n)]
            bmu   = self._bmu(x)
            # Centre the neighbourhood on the BMU's own stored coordinates.
            # Using divmod(bmu, G) here would yield (row, col) while _xy holds
            # (col, row), which centres the kernel on the transposed node.
            d2 = np.sum((self._xy - self._xy[bmu]) ** 2, axis=1)
            h  = np.exp(-d2 / (2 * max(sigma, 1e-4) ** 2)).reshape(-1, 1)
            self.W += lr * h * (x - self.W)
        return self

    def map_flat(self, X):
        """Return the flat BMU index for every row of ``X`` as an int array."""
        X = X.astype(np.float32)
        out = np.empty(len(X), dtype=int)
        for i, x in enumerate(X):
            out[i] = self._bmu(x)
        return out

    def topological_error(self, X, n=1500):
        """Fraction of samples whose two nearest nodes are not grid neighbours.

        Only the first ``n`` rows of ``X`` are evaluated (not a random draw).
        Two nodes count as neighbours when their row and column indices each
        differ by at most 1 (diagonals included).
        """
        X = X[:n].astype(np.float32)
        err = 0
        for x in X:
            top2 = np.argsort(self._sqdist(x))[:2]
            b1   = np.array(divmod(top2[0], self.G))
            b2   = np.array(divmod(top2[1], self.G))
            if np.max(np.abs(b1 - b2)) > 1:
                err += 1
        return err / len(X)

    def quantisation_error(self, X, n=1500):
        """Mean Euclidean distance from each sample to its BMU.

        Only the first ``n`` rows of ``X`` are evaluated (not a random draw).
        """
        X = X[:n].astype(np.float32)
        tot = 0.0
        for x in X:
            tot += float(np.sqrt(self._sqdist(x).min()))
        return tot / len(X)


def classify_som(som, X_scaled, n_clusters=None):
    """Cluster the trained SOM's nodes and label every event crisis / non-crisis.

    Steps:
      1. Ward agglomerative clustering of the node weight vectors ``som.W``.
      2. Each event inherits the cluster of its best-matching unit.
      3. One centroid per cluster: the mean of the events mapped to it, or
         the mean of the cluster's node weights when no event maps there.
      4. Two scores per cluster, ``A`` and ``B``, built from the centroid's
         relative distance to the extreme centroid values of the T+, R+ and
         N+ features plus a ``|1 - dm+|`` term. A cluster is labelled
         crisis (1) when ``A >= B``.

    Parameters
    ----------
    som : SOM
        Trained map exposing ``G``, ``W`` and ``map_flat``.
    X_scaled : ndarray of shape (n_events, n_features)
        Scaled features, columns ordered like the module-level ``FEATURE_COLS``.
    n_clusters : int or None
        Number of node clusters; defaults to ``max(2, G*G // 2)``.

    Returns
    -------
    (ev_labels, crisis_ratio)
        Per-event 0/1 labels and their mean (fraction labelled crisis).
    """
    G   = som.G
    nc  = n_clusters if n_clusters else max(2, (G * G) // 2)
    agg = AgglomerativeClustering(n_clusters=nc, linkage="ward")
    node_cl = agg.fit_predict(som.W)
    bmu_flat = som.map_flat(X_scaled)
    ev_cl    = node_cl[bmu_flat]
    cl_ids   = np.unique(node_cl)

    centroids = np.array([
        X_scaled[ev_cl == cid].mean(axis=0) if (ev_cl == cid).sum() > 0
        else som.W[node_cl == cid].mean(axis=0)
        for cid in cl_ids
    ])

    # Identify feature roles by case-insensitive substring match on the column names.
    feat_names = FEATURE_COLS

    def fi(keys):
        for i, n in enumerate(feat_names):
            if any(k.lower() in n.lower() for k in keys):
                return i
        return None

    iT = fi(["t+"])
    iR = fi(["r+"])
    iN = fi(["n+"])
    idm = fi(["dm+"])

    Ak = np.zeros(len(cl_ids))
    Bk = np.zeros(len(cl_ids))
    for k, Ck in enumerate(centroids):
        a = b = 0.0
        if iT is not None:
            T = centroids[:, iT]
            a += abs(np.min(T) - Ck[iT]) / (abs(np.min(T)) + 1e-12)
            b += abs(np.max(T) - Ck[iT]) / (abs(np.max(T)) + 1e-12)
        if iR is not None:
            R = centroids[:, iR]
            a += abs(np.min(R) - Ck[iR]) / (abs(np.min(R)) + 1e-12)
            b += abs(np.max(R) - Ck[iR]) / (abs(np.max(R)) + 1e-12)
        if iN is not None:
            N = centroids[:, iN]
            a += abs(np.max(N) - Ck[iN]) / (abs(np.max(N)) + 1e-12)
            b += abs(np.min(N) - Ck[iN]) / (abs(np.min(N)) + 1e-12)
        if idm is not None:
            a += abs(1.0 - Ck[idm])
            b -= abs(1.0 - Ck[idm])
        Ak[k] = a
        Bk[k] = b

    # exp(A) / (exp(A) + exp(B)) >= 0.5 is equivalent to A >= B. Comparing the
    # scores directly avoids exp() overflow: the ratio terms above can be very
    # large when an extreme centroid value is close to 0, and inf/inf = NaN
    # would otherwise be labelled non-crisis without any visible error.
    cl_labels = (Ak >= Bk).astype(int)
    ev_labels = cl_labels[ev_cl]

    crisis_ratio = ev_labels.mean()
    return ev_labels, crisis_ratio


def balanced_accuracy(y_true, y_pred):
    """Return the mean of sensitivity and specificity for 0/1 label arrays.

    The original notebook cites this as Eq. 17 of the paper. A 1e-9 term in
    each denominator prevents division by zero when a class is absent, in
    which case that class contributes a rate of 0.
    """
    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    called_pos, called_neg = (y_pred == 1), (y_pred == 0)

    true_pos  = (called_pos & actual_pos).sum()
    false_neg = (called_neg & actual_pos).sum()
    true_neg  = (called_neg & actual_neg).sum()
    false_pos = (called_pos & actual_neg).sum()

    sensitivity = true_pos / (true_pos + false_neg + 1e-9)
    specificity = true_neg / (true_neg + false_pos + 1e-9)
    return 0.5 * (sensitivity + specificity)

print("✓ SOM and classifier ready")

## 3 · Load Data & Scale Features

In [None]:
# Read the catalogue and confirm that every configured feature column exists.
df = pd.read_csv(INPUT_CSV, sep=CSV_SEP, low_memory=False)
print(f"Loaded: {len(df):,} events | columns: {list(df.columns)}")

missing = [col for col in FEATURE_COLS if col not in df.columns]
if len(missing) > 0:
    raise ValueError(f"Columns not found: {missing}")

# Treat ±inf as missing, then fill every gap with the column median.
X_raw = df[FEATURE_COLS].replace([np.inf, -np.inf], np.nan)
X_raw = X_raw.fillna(X_raw.median())

# Scale the features and cast to float32.
scaler   = RobustScaler()
X_scaled = scaler.fit_transform(X_raw.values).astype(np.float32)
N_TOTAL  = len(X_scaled)

# Ground truth (if available) — used for balanced accuracy
# If you have a 'label' or 'true_label' column, set this:
GT_COL = None   # e.g. "true_label"   ← change if available
if GT_COL and GT_COL in df.columns:
    y_true = df[GT_COL].values.astype(int)
else:
    # Also reached when GT_COL is set but absent from the file.
    y_true = None

if y_true is None:
    print("ℹ No ground truth column — only TE/QE/ratio metrics will be computed")
else:
    print(f"✓ Ground truth found in '{GT_COL}' — balanced accuracy will be computed")

print(f"✓ Feature matrix: {X_scaled.shape}  |  scaler: RobustScaler")

## 4 · Single-Run Helper

In [None]:
def run_one(grid_size, n_iter, n_samples, seed):
    """Train and score a single SOM configuration.

    Uses the module-level ``X_scaled`` (features) and ``y_true`` (optional
    labels). Returns a dict with keys ``te``, ``qe``, ``ratio``, ``bal_acc``
    (NaN when no ground truth is available) and ``elapsed`` (seconds, covering
    training and scoring).
    """
    started = time.time()

    som = SOM(
        grid_size=grid_size,
        n_features=X_scaled.shape[1],
        n_iterations=n_iter,
        random_state=seed,
    )
    som.fit(X_scaled, n_samples=n_samples)

    # Map quality is always measured on the full scaled matrix,
    # even when training used a subsample.
    metrics = {
        "te": som.topological_error(X_scaled),
        "qe": som.quantisation_error(X_scaled),
    }

    ev_labels, ratio = classify_som(som, X_scaled)
    metrics["ratio"] = ratio
    if y_true is None:
        metrics["bal_acc"] = np.nan
    else:
        metrics["bal_acc"] = balanced_accuracy(y_true, ev_labels)

    metrics["elapsed"] = time.time() - started
    return metrics

# One short run (5,000 iterations, all samples, seed 42) to confirm the pipeline works.
print("✓ run_one() ready — quick sanity check:")
r = run_one(FIXED_GRID_SIZE, 5000, None, 42)
print(f"  TE={r['te']:.4f}  QE={r['qe']:.4f}  crisis_ratio={r['ratio']:.3f}"
      f"  bal_acc={r['bal_acc']:.3f}  time={r['elapsed']:.1f}s")

## 5 · Phase 1 — Tune Grid Size

Fix iterations at `FIXED_N_ITER` and train on all samples, then scan grid sizes.  
Target: **local minimum of TE and QE**, and **maximum balanced accuracy** for the binary classification task.

In [None]:
print("=" * 60)
print("  PHASE 1 — Grid Size Scan")
print(f"  Fixed: n_iter={FIXED_N_ITER:,}  samples=ALL")
print("=" * 60)

# One summary row per grid size, aggregated over the repeat seeds.
results_grid = []

for gs in GRID_SIZES:
    # Train once per seed; all samples are used (n_samples=None).
    runs = [run_one(gs, FIXED_N_ITER, None, seed) for seed in RANDOM_SEEDS[:N_REPEATS]]
    row_te    = [run["te"] for run in runs]
    row_qe    = [run["qe"] for run in runs]
    row_ratio = [run["ratio"] for run in runs]
    row_ba    = [run["bal_acc"] for run in runs]
    row_t     = [run["elapsed"] for run in runs]

    # A grid is "stable" when the crisis ratio barely moves between seeds.
    ratio_std = np.std(row_ratio)
    stable    = ratio_std <= RATIO_TOLERANCE

    results_grid.append({
        "grid_size": gs,
        "nodes": gs * gs,
        "te_mean": np.mean(row_te),
        "te_std": np.std(row_te),
        "qe_mean": np.mean(row_qe),
        "qe_std": np.std(row_qe),
        "ratio_mean": np.mean(row_ratio),
        "ratio_std": ratio_std,
        "ba_mean": np.nanmean(row_ba),   # NaN when there is no ground truth
        "ba_std": np.nanstd(row_ba),
        "stable": stable,
        "time_mean": np.mean(row_t),
    })

    if stable:
        flag = "✓ stable"
    else:
        flag = f"✗ ratio_std={ratio_std:.3f}"
    print(f"  Grid {gs}×{gs} ({gs*gs:3d} nodes) | "
          f"TE={np.mean(row_te):.4f}±{np.std(row_te):.4f}  "
          f"QE={np.mean(row_qe):.4f}±{np.std(row_qe):.4f}  "
          f"BA={np.nanmean(row_ba):.3f}  {flag}")

df_grid = pd.DataFrame(results_grid)
print("\nDone.")

In [None]:
# ── Plot Phase 1 ──────────────────────────────────────────────────────
# One panel per metric (mean ± std over seeds) against the number of nodes,
# with the best grid size starred.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle("Phase 1 — Grid Size Scan", fontsize=13, fontweight="bold")

metrics = [
    ("te_mean",  "te_std",  "Topological Error (TE)", "#e74c3c", "lower is better"),
    ("qe_mean",  "qe_std",  "Quantisation Error (QE)","#3498db", "lower is better"),
    ("ba_mean",  "ba_std",  "Balanced Accuracy",       "#2ecc71", "higher is better"),
]
for ax, (m, s, title, color, note) in zip(axes, metrics):
    ax.errorbar(df_grid["nodes"], df_grid[m], yerr=df_grid[s],
                marker="o", color=color, lw=2, capsize=4, markersize=7)
    ax.set_xlabel("Number of nodes (G²)", fontsize=11)
    ax.set_title(f"{title}\n({note})", fontsize=11, fontweight="bold")
    ax.set_xticks(df_grid["nodes"])
    ax.set_xticklabels([f"{g}×{g}\n({g*g})" for g in df_grid["grid_size"]], fontsize=8)

    # Without ground truth the balanced-accuracy column is entirely NaN;
    # idxmin/idxmax then has no valid answer and .loc would fail, so the
    # best-point marker is skipped for such a column.
    if not df_grid[m].notna().any():
        continue

    # Mark minimum / maximum
    if "lower" in note:
        best_idx = df_grid[m].idxmin()
    else:
        best_idx = df_grid[m].idxmax()
    ax.axvline(df_grid.loc[best_idx, "nodes"], color=color, ls="--", alpha=0.5)
    ax.scatter([df_grid.loc[best_idx, "nodes"]], [df_grid.loc[best_idx, m]],
               s=120, color=color, zorder=5, marker="*")

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/phase1_grid_size.png", bbox_inches="tight")
plt.show()

# Recommendation
best_te = df_grid.loc[df_grid["te_mean"].idxmin(), "grid_size"]
print(f"\n  Best TE      → grid size {best_te}×{best_te} ({best_te**2} nodes)")
if df_grid["ba_mean"].notna().any():
    best_ba = df_grid.loc[df_grid["ba_mean"].idxmax(), "grid_size"]
    print(f"  Best Bal.Acc → grid size {best_ba}×{best_ba} ({best_ba**2} nodes)")
else:
    best_ba = None
    print("  Best Bal.Acc → n/a (no ground truth labels)")
print(f"\n  Paper recommendation: 4×4 (16 nodes) for binary crisis/non-crisis task")

In [None]:
# ── Set optimal grid size ─────────────────────────────────────────────
# With ground truth: the grid with the highest mean balanced accuracy.
# Without it: the grid with the lowest mean topological error.
# NOTE(review): on a 2×2 grid every pair of nodes differs by at most 1 in row
# and column, so SOM.topological_error is always 0 there. The TE-only branch
# will therefore tend to select 2×2 whenever it is in GRID_SIZES; consider
# overriding manually.
OPTIMAL_GRID = int(
    df_grid.loc[
        df_grid["ba_mean"].idxmax() if y_true is not None else df_grid["te_mean"].idxmin(),
        "grid_size",
    ]
)

print(f"→ OPTIMAL_GRID_SIZE set to: {OPTIMAL_GRID}×{OPTIMAL_GRID} ({OPTIMAL_GRID**2} nodes)")
print("  (Override manually if desired — paper uses 4×4)")

## 6 · Phase 2 — Tune Number of Iterations

Fix grid at the optimal size found in Phase 1. Scan iteration counts.  
Target: **TE and QE plateau** (flat growth) — more iterations beyond this point add no benefit.

In [None]:
print("=" * 60)
print("  PHASE 2 — Iteration Count Scan")
print(f"  Fixed: grid={OPTIMAL_GRID}×{OPTIMAL_GRID}  samples=ALL")
print("=" * 60)

# One summary row per iteration count, aggregated over the repeat seeds.
results_iter = []

for n_iter in ITER_VALUES:
    # Grid fixed at the Phase 1 choice; all samples are used (n_samples=None).
    runs = [run_one(OPTIMAL_GRID, n_iter, None, seed) for seed in RANDOM_SEEDS[:N_REPEATS]]
    row_te    = [run["te"] for run in runs]
    row_qe    = [run["qe"] for run in runs]
    row_ratio = [run["ratio"] for run in runs]
    row_ba    = [run["bal_acc"] for run in runs]
    row_t     = [run["elapsed"] for run in runs]

    ratio_std = np.std(row_ratio)
    results_iter.append({
        "n_iter": n_iter,
        "te_mean": np.mean(row_te),
        "te_std": np.std(row_te),
        "qe_mean": np.mean(row_qe),
        "qe_std": np.std(row_qe),
        "ratio_mean": np.mean(row_ratio),
        "ratio_std": ratio_std,
        "ba_mean": np.nanmean(row_ba),   # NaN when there is no ground truth
        "ba_std": np.nanstd(row_ba),
        "time_mean": np.mean(row_t),
    })
    print(f"  iter={n_iter:>8,} | TE={np.mean(row_te):.4f}±{np.std(row_te):.4f}  "
          f"QE={np.mean(row_qe):.4f}±{np.std(row_qe):.4f}  "
          f"BA={np.nanmean(row_ba):.3f}  ratio_std={ratio_std:.4f}  "
          f"time={np.mean(row_t):.1f}s")

df_iter = pd.DataFrame(results_iter)
print("\nDone.")

In [None]:
# ── Plot Phase 2 ──────────────────────────────────────────────────────
# Four panels against the iteration count (log x-axis): TE, QE, balanced
# accuracy (mean ± std band) and the crisis-ratio spread with its tolerance.
fig, axes = plt.subplots(1, 4, figsize=(18, 4))
fig.suptitle(f"Phase 2 — Iteration Scan  (grid {OPTIMAL_GRID}×{OPTIMAL_GRID})",
             fontsize=13, fontweight="bold")

specs = [
    ("te_mean",    "te_std",    "Topological Error (TE)", "#e74c3c"),
    ("qe_mean",    "qe_std",    "Quantisation Error (QE)","#3498db"),
    ("ba_mean",    "ba_std",    "Balanced Accuracy",       "#2ecc71"),
    ("ratio_std",  None,        "Crisis Ratio Std Dev\n(target < 0.05)", "#9b59b6"),
]
for ax, (m, s, title, color) in zip(axes, specs):
    vals = df_iter[m]
    ax.plot(df_iter["n_iter"], vals, marker="o", color=color, lw=2, markersize=7)
    if s is not None:
        ax.fill_between(df_iter["n_iter"], vals - df_iter[s], vals + df_iter[s],
                        alpha=0.2, color=color)
    if m == "ratio_std":
        ax.axhline(RATIO_TOLERANCE, color="red", ls="--", lw=1.5, label=f"tolerance={RATIO_TOLERANCE}")
        ax.legend(fontsize=9)
    ax.set_xlabel("Iterations", fontsize=11)
    ax.set_title(title, fontsize=10, fontweight="bold")
    ax.set_xscale("log")

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/phase2_iterations.png", bbox_inches="tight")
plt.show()

# ── Detect plateau: where TE improvement <= 5% of total range ────────
# The first row has no predecessor, so its delta is set to +inf and can
# never count as a plateau. The comparison is inclusive so that a completely
# flat TE curve (te_range == 0, e.g. a 2×2 grid) is recognised as a plateau
# instead of falling through to the most expensive setting.
te_range  = df_iter["te_mean"].max() - df_iter["te_mean"].min()
te_deltas = df_iter["te_mean"].diff().abs().fillna(np.inf)
plateau_mask = te_deltas <= 0.05 * te_range
if plateau_mask.any():
    # idxmax on a boolean mask gives the first True row.
    OPTIMAL_ITERS = int(df_iter.loc[plateau_mask.idxmax(), "n_iter"])
    print(f"→ TE plateau detected at: {OPTIMAL_ITERS:,} iterations")
else:
    OPTIMAL_ITERS = int(df_iter["n_iter"].iloc[-1])
    print(f"→ No TE plateau detected — using the largest tested value: {OPTIMAL_ITERS:,} iterations")

# Also check ratio stability: never go below the first iteration count at
# which the crisis ratio is stable across seeds.
stable_iters = df_iter.loc[df_iter["ratio_std"] <= RATIO_TOLERANCE, "n_iter"]
if len(stable_iters) > 0:
    OPTIMAL_ITERS = max(OPTIMAL_ITERS, int(stable_iters.iloc[0]))
    print(f"  Ratio stability achieved at: {int(stable_iters.iloc[0]):,} iterations")

print(f"\n→ OPTIMAL_N_ITER set to: {OPTIMAL_ITERS:,}")

## 7 · Phase 3 — Tune Number of Training Samples

Fix grid and iterations at optimal values. Scan training sample counts.  
Target: **minimum samples** where QE stops improving — reducing training time without losing accuracy.

In [None]:
print("=" * 60)
print("  PHASE 3 — Training Sample Count Scan")
print(f"  Fixed: grid={OPTIMAL_GRID}×{OPTIMAL_GRID}  iter={OPTIMAL_ITERS:,}")
print("=" * 60)

results_samples = []
# Convert fractions to absolute counts with a floor of 100 samples, capped at
# the catalogue size. Without the cap a small catalogue (< 100 rows at the
# lowest fraction) would be reported with more samples than exist.
sample_counts   = [min(N_TOTAL, max(100, int(f * N_TOTAL))) for f in SAMPLE_FRACS]
sample_counts   = sorted(set(sample_counts))   # deduplicate

for n_samp in sample_counts:
    # Grid and iteration count are fixed at the values chosen in Phases 1-2.
    row_te=[]; row_qe=[]; row_ratio=[]; row_ba=[]; row_t=[]
    for seed in RANDOM_SEEDS[:N_REPEATS]:
        r = run_one(OPTIMAL_GRID, OPTIMAL_ITERS, n_samp, seed)
        row_te.append(r["te"]); row_qe.append(r["qe"])
        row_ratio.append(r["ratio"]); row_ba.append(r["bal_acc"])
        row_t.append(r["elapsed"])

    ratio_std = np.std(row_ratio)
    results_samples.append(dict(
        n_samples=n_samp,
        frac=n_samp/N_TOTAL,
        te_mean=np.mean(row_te),   te_std=np.std(row_te),
        qe_mean=np.mean(row_qe),   qe_std=np.std(row_qe),
        ratio_mean=np.mean(row_ratio), ratio_std=ratio_std,
        ba_mean=np.nanmean(row_ba), ba_std=np.nanstd(row_ba),   # NaN without ground truth
        time_mean=np.mean(row_t),
    ))
    print(f"  samples={n_samp:>7,} ({n_samp/N_TOTAL*100:4.0f}%) | "
          f"TE={np.mean(row_te):.4f}±{np.std(row_te):.4f}  "
          f"QE={np.mean(row_qe):.4f}±{np.std(row_qe):.4f}  "
          f"ratio_std={ratio_std:.4f}  time={np.mean(row_t):.1f}s")

df_samples = pd.DataFrame(results_samples)
print("\nDone.")

In [None]:
# ── Plot Phase 3 ──────────────────────────────────────────────────────
# Same four panels as Phase 2, plotted against the training-sample count.
fig, axes = plt.subplots(1, 4, figsize=(18, 4))
fig.suptitle(f"Phase 3 — Sample Count Scan  (grid {OPTIMAL_GRID}×{OPTIMAL_GRID}, iter {OPTIMAL_ITERS:,})",
             fontsize=12, fontweight="bold")

specs = [
    ("te_mean",   "te_std",   "Topological Error (TE)", "#e74c3c"),
    ("qe_mean",   "qe_std",   "Quantisation Error (QE)","#3498db"),
    ("ba_mean",   "ba_std",   "Balanced Accuracy",       "#2ecc71"),
    ("ratio_std", None,       "Crisis Ratio Std Dev\n(target < 0.05)", "#9b59b6"),
]
for ax, (m, s, title, color) in zip(axes, specs):
    vals = df_samples[m]
    ax.plot(df_samples["n_samples"], vals, marker="o", color=color, lw=2, markersize=7)
    if s is not None:
        # Shaded band: mean ± one standard deviation over the repeat seeds.
        lower = vals - df_samples[s]
        upper = vals + df_samples[s]
        ax.fill_between(df_samples["n_samples"], lower, upper, alpha=0.2, color=color)
    if m == "ratio_std":
        ax.axhline(RATIO_TOLERANCE, color="red", ls="--", lw=1.5, label=f"tolerance={RATIO_TOLERANCE}")
        ax.legend(fontsize=9)
    ax.set_xlabel("Training samples", fontsize=11)
    ax.set_title(title, fontsize=10, fontweight="bold")

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/phase3_samples.png", bbox_inches="tight")
plt.show()

# ── Optimal sample count: first n where ratio_std <= tolerance ────────
stable = df_samples.loc[df_samples["ratio_std"] <= RATIO_TOLERANCE, "n_samples"]
if stable.empty:
    OPTIMAL_SAMPLES = int(df_samples["n_samples"].iloc[-1])
    print(f"  ⚠ Ratio never stabilised — using all samples")
else:
    OPTIMAL_SAMPLES = int(stable.iloc[0])

# Also: find where QE plateaus (step change below 5% of the total QE range).
# The first row has no predecessor and is given a sentinel delta of 999.
qe_range  = df_samples["qe_mean"].max() - df_samples["qe_mean"].min()
qe_deltas = df_samples["qe_mean"].diff().abs().fillna(999)
qe_plateau = df_samples.loc[qe_deltas < 0.05 * qe_range, "n_samples"]
if not qe_plateau.empty:
    # Take the larger of the two criteria so both are satisfied.
    qe_stable_n = int(qe_plateau.iloc[0])
    OPTIMAL_SAMPLES = max(OPTIMAL_SAMPLES, qe_stable_n)

print(f"\n→ OPTIMAL_SAMPLES set to: {OPTIMAL_SAMPLES:,} ({100*OPTIMAL_SAMPLES/N_TOTAL:.0f}% of data)")

## 8 · Summary — Optimal Hyperparameters

In [None]:
# Print the chosen hyperparameters in a fixed-width box next to the paper's values.
# Every row is padded to the same inner width so the right border lines up
# whatever the number of digits in the values.
_BOX_INNER_WIDTH = 54


def _box_row(text):
    """Return one box row: ``text`` left-justified between the side borders."""
    return f"║{text:<{_BOX_INNER_WIDTH}}║"


_grid_desc = f"{OPTIMAL_GRID}×{OPTIMAL_GRID} = {OPTIMAL_GRID**2} nodes"

print("╔" + "═" * _BOX_INNER_WIDTH + "╗")
print(_box_row("          OPTIMAL HYPERPARAMETERS (KDM)"))
print("╠" + "═" * _BOX_INNER_WIDTH + "╣")
print(_box_row(f"  Grid size       : {_grid_desc}"))
print(_box_row(f"  N iterations    : {OPTIMAL_ITERS:,}"))
print(_box_row(f"  Training samples: {OPTIMAL_SAMPLES:,}"))
print("╠" + "═" * _BOX_INNER_WIDTH + "╣")
print(_box_row("  Paper values (for reference):"))
print(_box_row("    Grid size  : 4×4 = 16 nodes"))
print(_box_row("    Iterations : 100,000 (2,000,000 for CAT4)"))
print(_box_row("    Samples    : 7,000"))
print("╚" + "═" * _BOX_INNER_WIDTH + "╝")

print("\nUse these values in your KDM notebook (Section 0 Config):")
print(f"  GRID_SIZE    = {OPTIMAL_GRID}")
print(f"  N_ITERATIONS = {OPTIMAL_ITERS}")
print(f"  # Training samples ≈ {OPTIMAL_SAMPLES} (set in SOM.fit() via n_samples arg)")

In [None]:
# ── Combined summary plot ──────────────────────────────────────────────
# 3 × 2 grid: one row per tuning phase, with the chosen optimum as a dashed line.
fig, axes = plt.subplots(3, 2, figsize=(13, 11))
fig.suptitle("KDM — Hyperparameter Tuning Summary", fontsize=14, fontweight="bold")

# Phase 1: TE and QE against the number of nodes
phase1_panels = [("te_mean", "TE", "#e74c3c"), ("qe_mean", "QE", "#3498db")]
for ax, (m, lbl, color) in zip(axes[0], phase1_panels):
    spread = df_grid[m.replace("mean","std")]
    ax.errorbar(df_grid["nodes"], df_grid[m], yerr=spread,
                marker="o", color=color, lw=2, capsize=4, markersize=7)
    ax.axvline(OPTIMAL_GRID**2, color="k", ls="--", lw=1.5, label=f"Optimal: {OPTIMAL_GRID}²")
    ax.set_xlabel("Nodes (G²)")
    ax.set_title(f"Phase 1 — {lbl} vs Grid Size", fontweight="bold")
    ax.set_xticks(df_grid["nodes"])
    ax.set_xticklabels([f"{g}²\n={g*g}" for g in df_grid["grid_size"]], fontsize=8)
    ax.legend(fontsize=9)

# Phase 2: TE and QE against the iteration count (log axis), with ± std band
phase2_panels = [("te_mean", "TE", "#e74c3c"), ("qe_mean", "QE", "#3498db")]
for ax, (m, lbl, color) in zip(axes[1], phase2_panels):
    centre = df_iter[m]
    spread = df_iter[m.replace("mean","std")]
    ax.plot(df_iter["n_iter"], centre, marker="o", color=color, lw=2, markersize=7)
    ax.fill_between(df_iter["n_iter"], centre - spread, centre + spread,
                    alpha=0.2, color=color)
    ax.axvline(OPTIMAL_ITERS, color="k", ls="--", lw=1.5, label=f"Optimal: {OPTIMAL_ITERS:,}")
    ax.set_xlabel("Iterations (log)")
    ax.set_xscale("log")
    ax.set_title(f"Phase 2 — {lbl} vs Iterations", fontweight="bold")
    ax.legend(fontsize=9)

# Phase 3: TE and crisis-ratio spread against the training-sample count
phase3_panels = [("te_mean", "TE", "#e74c3c"), ("ratio_std", "Ratio Std", "#9b59b6")]
for ax, (m, lbl, color) in zip(axes[2], phase3_panels):
    ax.plot(df_samples["n_samples"], df_samples[m], marker="o", color=color, lw=2, markersize=7)
    ax.axvline(OPTIMAL_SAMPLES, color="k", ls="--", lw=1.5, label=f"Optimal: {OPTIMAL_SAMPLES:,}")
    if m == "ratio_std":
        ax.axhline(RATIO_TOLERANCE, color="red", ls=":", lw=1.5, label="Tolerance")
    ax.set_xlabel("Training samples")
    ax.set_title(f"Phase 3 — {lbl} vs Samples", fontweight="bold")
    ax.legend(fontsize=9)

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/summary_tuning.png", bbox_inches="tight")
plt.show()

In [None]:
# ── Save all results to CSV ────────────────────────────────────────────
# One table per tuning phase, written without the DataFrame index.
phase_tables = {
    f"{OUTPUT_DIR}/tuning_phase1_grid.csv": df_grid,
    f"{OUTPUT_DIR}/tuning_phase2_iter.csv": df_iter,
    f"{OUTPUT_DIR}/tuning_phase3_samples.csv": df_samples,
}
for csv_path, table in phase_tables.items():
    table.to_csv(csv_path, index=False)

# Single-row table holding the three selected hyperparameters.
summary = pd.DataFrame({
    "optimal_grid_size":   [OPTIMAL_GRID],
    "optimal_n_iter":      [OPTIMAL_ITERS],
    "optimal_n_samples":   [OPTIMAL_SAMPLES],
})
summary.to_csv(f"{OUTPUT_DIR}/optimal_hyperparameters.csv", index=False)

# List everything now present in the output folder (plots included).
print("Saved:")
for f in sorted(os.listdir(OUTPUT_DIR)):
    print(f"  {f}")

## Tips

| Situation | Action |
|---|---|
| Ratio never stabilises | Increase `ITER_VALUES` upper bound or add more data |
| TE stays high for all grids | Increase `FIXED_N_ITER` in Phase 1 |
| Tuning is slow | Reduce `N_REPEATS` to 1 for a quick scan, then confirm with 3 |
| No ground truth | Rely on TE/QE plateau + ratio stability — paper's primary criteria |
| Large catalogue (>100k) | Use smaller values in `SAMPLE_FRACS` (e.g. `[0.01, 0.02, 0.05, 0.1]`) and restrict `GRID_SIZES` to `[3, 4, 5]` |