# Ablation of Calibration


## Model Definition

In [2]:
# =========================
# Calibration Ablation Top Cell (Anchor + Router, NO CAL step)
# =========================
import os, math, gc
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# ---------- Globals ----------
LOG2PI  = math.log(2*math.pi)
DEVICE  = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.set_default_dtype(torch.float32)

# ---------- Repro ----------
def set_seed(seed: int = 1):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# ---------- Utils ----------
def rmse_score(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def zscore_fit(y_train):
    my = float(np.mean(y_train))
    sy = float(np.std(y_train) + 1e-8)
    return my, sy

def _topk_mask(w, k=2):
    _, topi = torch.topk(w, k, dim=-1)
    mask = torch.zeros_like(w).scatter_(-1, topi, 1.0)
    w2 = w * mask
    return w2 / (w2.sum(dim=-1, keepdim=True) + 1e-12)

def _topk_mask_smooth(w, k=2, eps=0.05):
    _, topi = torch.topk(w, k, dim=-1)
    mask = torch.zeros_like(w).scatter_(-1, topi, 1.0)
    w_top = w * mask
    w_top = w_top / (w_top.sum(dim=-1, keepdim=True) + 1e-12)
    return (1.0 - eps) * w_top + (eps / k) * mask

# ---------- Dataset Loader ----------
def load_dataset(name: str, verbose: bool = False):
    """Return (X, y) as float64; numeric clean + drop NaN rows."""
    def _clean_numeric(df: pd.DataFrame) -> pd.DataFrame:
        df = df.apply(pd.to_numeric, errors="coerce")
        df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
        return df

    name = name.lower().strip()

    if name == "housing":
        df = pd.read_csv(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
            header=None, sep=r"\s+", engine="python", comment="#", skip_blank_lines=True
        )
        if df.shape[1] == 1:
            df = df[0].astype(str).str.strip().str.split(r"\s+", expand=True)
        df = _clean_numeric(df); assert df.shape[1] >= 14
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)
        if verbose: print(f"[Housing] X.shape={X.shape} y.shape={y.shape} | y∈[{y.min():.3f},{y.max():.3f}]")
        return X, y

    if name == "concrete":
        df = pd.read_excel(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
        )
        df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)
        if verbose: print(f"[Concrete] X.shape={X.shape} y.shape={y.shape} | y∈[{y.min():.3f},{y.max():.3f}]")
        return X, y

    if name == "wine":
        df = pd.read_csv(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
            delimiter=";"
        )
        df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)
        if verbose: print(f"[WineRed] X.shape={X.shape} y.shape={y.shape} | y∈[{y.min():.3f},{y.max():.3f}]")
        return X, y

    if name == "energy":
        df = pd.read_excel(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
        )
        df = _clean_numeric(df)
        X = df.iloc[:, :-2].to_numpy(np.float64)  # drop Y1,Y2
        y = df.iloc[:, -1].to_numpy(np.float64)   # use Y2
        if verbose: print(f"[Energy] X.shape={X.shape} y.shape={y.shape} | y∈[{np.min(y):.3f},{np.max(y):.3f}]")
        return X, y

    if name == "yacht":
        df = pd.read_csv(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data",
            header=None, sep=r"\s+", engine="python", comment="#", skip_blank_lines=True
        )
        if df.shape[1] == 1:
            df = df[0].astype(str).str.strip().str.split(r"\s+", expand=True)
        df = _clean_numeric(df); assert df.shape[1] >= 7
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)
        if verbose: print(f"[Yacht] X.shape={X.shape} y.shape={y.shape} | y∈[{y.min():.3f},{y.max():.3f}]")
        return X, y

    # local datasets (optional)
    if name == "power":
        local = "power.xlsx"
        if not os.path.exists(local): raise FileNotFoundError("[power] need local 'power.xlsx'")
        df = pd.read_excel(local); df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)
        if verbose: print(f"[Power(local)] X.shape={X.shape} y.shape={y.shape}")
        return X, y

    if name == "kin8nm":
        local = "kin8nm.csv"; df = pd.read_csv(local); df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)
        return X, y

    if name == "msd":
        local = "YearPredictionMSD.txt"
        df = pd.read_csv(local, header=None); df = df.iloc[:, ::-1]; df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)
        return X, y

    if name == "protein":
        local = "protein.csv"; df = pd.read_csv(local, sep=None, engine="python"); df = _clean_numeric(df)
        y = df.iloc[:, 0].to_numpy(np.float64); X = df.iloc[:, 1:].to_numpy(np.float64)
        return X, y

    if name == "naval":
        local = "naval.txt"; df = pd.read_csv(local, sep=r"\s+", header=None, engine="python"); df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)
        return X, y

    raise ValueError("Unknown dataset.")

# ---------- MoE building blocks (Anchor + Router) ----------
class Projection(nn.Module):
    def __init__(self, d, D):
        super().__init__()
        self.w = nn.Linear(d, D, bias=True)
        nn.init.xavier_uniform_(self.w.weight); nn.init.zeros_(self.w.bias)
    def forward(self, x): return self.w(x)

class Window(nn.Module):
    def __init__(self, K, D, min_log_s=-2.5, max_log_s=1.0):
        super().__init__()
        self.c         = nn.Parameter(torch.randn(K, D))
        self.log_s     = nn.Parameter(torch.zeros(K, D))
        self.min_log_s = min_log_s; self.max_log_s = max_log_s
    def forward(self, z):
        log_s = torch.clamp(self.log_s, min=self.min_log_s, max=self.max_log_s)
        diff2 = ((z[:, None] - self.c)**2) / (2 * torch.exp(log_s)**2)
        return torch.exp(-diff2.sum(dim=-1)) + 1e-12  # [B,K]

class Router(nn.Module):
    def __init__(self, D, K):
        super().__init__()
        self.q   = nn.Linear(D, 64)
        self.k   = nn.Parameter(torch.randn(K, 64))
        self.tau = 3.0
    def forward(self, z):
        logits = (self.q(z) @ self.k.T) / math.sqrt(64)
        return F.softmax(logits / self.tau, dim=-1)

class ExpertMDN(nn.Module):
    def __init__(self, d, h, nc, sigma_min=5e-2, sigma_max=1.0, learn_mean=True):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d, h), nn.ReLU(),
            nn.Linear(h, h), nn.ReLU()
        )
        self.logits = nn.Linear(h, nc)
        self.learn_mean = learn_mean
        if learn_mean:
            self.means  = nn.Linear(h, nc)
        self.log_sc = nn.Linear(h, nc)
        self.sigma_min = float(sigma_min)
        self.sigma_max = float(sigma_max)
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight); nn.init.zeros_(m.bias)
    def forward(self, x):
        h  = self.net(x)
        pi = F.softmax(self.logits(h), dim=-1)
        mu = self.means(h) if self.learn_mean else None
        sg = torch.exp(self.log_sc(h)).clamp(self.sigma_min, self.sigma_max)
        return pi, mu, sg

class BLRMoE(nn.Module):
    """Anchor-MoE with Router; mean_mode in {'anchor','anchor+delta','free'}."""
    def __init__(self, d, D, K, hid, nc,
                 mean_mode='anchor+delta', delta_l2=3e-3,
                 w_ent_warm=+1e-3, w_ent_cool=-2e-4,
                 l2_win=1e-4, lb_coef=1e-3,
                 sigma_min=5e-2, sigma_max=1.0,
                 topk=2, smooth_eps=0.05):
        super().__init__()
        assert mean_mode in ['anchor','anchor+delta','free']
        assert 1 <= topk <= K
        self.mean_mode = mean_mode
        self.delta_l2  = float(delta_l2)
        self.proj   = Projection(d, D)
        self.win    = Window(K, D)
        self.router = Router(D, K)
        learn_mean  = (mean_mode != 'anchor')
        self.exps   = nn.ModuleList([ExpertMDN(d, hid, nc, sigma_min, sigma_max, learn_mean=learn_mean) for _ in range(K)])
        self.w_ent_warm = w_ent_warm
        self.w_ent_cool = w_ent_cool
        self.l2_win     = l2_win
        self.lb_coef    = lb_coef
        self.topk       = topk
        self.smooth_eps = smooth_eps
        self.K = K; self.nc = nc

    def _mixture_params(self, X, train=True):
        z = self.proj(X)
        w = self.win(z) * self.router(z)
        w = w / (w.sum(dim=-1, keepdim=True) + 1e-12)
        w = _topk_mask_smooth(w, k=self.topk, eps=self.smooth_eps) if train else _topk_mask(w, k=self.topk)

        B, K, C = X.size(0), self.K, self.nc
        Pi = torch.full((B, K, C), 1.0/C, device=X.device)
        Mu = torch.zeros(B, K, C, device=X.device)
        Sg = torch.ones(B, K, C, device=X.device)

        _, topi = torch.topk(w, self.topk, dim=-1)
        uniq = torch.unique(topi)
        for j in uniq.tolist():
            pi_j, mu_j, sg_j = self.exps[j](X)
            mask = (topi == j).any(dim=1).float().unsqueeze(-1)
            Pi[:, j, :] = pi_j * mask + Pi[:, j, :]*(1 - mask)
            if mu_j is not None:
                Mu[:, j, :] = mu_j * mask + Mu[:, j, :]*(1 - mask)
            Sg[:, j, :] = sg_j * mask + Sg[:, j, :]*(1 - mask)
        return w, Pi, Mu, Sg, z

    def nll(self, X, y_z, mu_anchor_z=None, epoch=1, warmup_epochs=150):
        train_flag = self.training
        w, Pi, Mu, Sg, z = self._mixture_params(X, train=train_flag)

        if self.mean_mode == 'anchor':
            assert mu_anchor_z is not None
            Mu_eff = mu_anchor_z[:, None, None]
            delta_pen = 0.0
        elif self.mean_mode == 'anchor+delta':
            assert mu_anchor_z is not None
            Mu_eff = mu_anchor_z[:, None, None] + Mu
            delta_pen = (Mu**2).mean() * self.delta_l2
        else:
            Mu_eff = Mu
            delta_pen = 0.0

        yv = y_z[:, None, None]
        logp = -0.5 * ((yv - Mu_eff)/Sg)**2 - torch.log(Sg) - 0.5*LOG2PI

        w3   = w[:, :, None]
        logw = torch.where(w3 > 0, torch.log(w3 + 1e-12), torch.full_like(w3, -1e9))
        logpi= torch.log(Pi + 1e-12)

        logmix = torch.logsumexp(logw + logpi + logp, dim=(1,2))
        nll = -logmix.mean()

        if train_flag:
            w_ent = self.w_ent_warm if epoch <= warmup_epochs else self.w_ent_cool
            p = self.router(z)
            ent = (p * torch.log(p + 1e-12)).sum(dim=1).mean()
            l2w = (self.win.log_s**2).mean()
            rho = p.mean(dim=0)
            lb_loss = ((rho - 1.0/p.size(1))**2).sum()
            nll = nll + w_ent*ent + self.l2_win*l2w + self.lb_coef*lb_loss + delta_pen
        return nll

    @torch.no_grad()
    def predict_mean_var(self, X, mu_anchor_z=None):
        w, Pi, Mu, Sg, _ = self._mixture_params(X, train=False)
        if self.mean_mode == 'anchor':
            assert mu_anchor_z is not None
            Mu_eff = mu_anchor_z[:, None, None]
        elif self.mean_mode == 'anchor+delta':
            assert mu_anchor_z is not None
            Mu_eff = mu_anchor_z[:, None, None] + Mu
        else:
            Mu_eff = Mu

        mu_e  = (Pi * Mu_eff).sum(dim=2)              # [B,K]
        m2_e  = (Pi * (Sg**2 + Mu_eff**2)).sum(dim=2) # [B,K]
        mu_z  = (w * mu_e).sum(dim=1)                 # [B]
        second= (w * m2_e).sum(dim=1)                 # [B]
        var_z = torch.clamp(second - mu_z**2, min=1e-9)
        return mu_z, var_z

# ---------- One-split training (NO CALIBRATION) ----------
def train_one_split_no_cal(
    X_all, y_all, X_te, y_te,
    standardize_x=True,
    D=2, K=8, HID=128, NC=3,
    LR=1e-3, EPOCHS=400,
    MEAN_MODE='anchor+delta',
    DELTA_L2=3e-3,
    SIGMA_MIN=5e-2, SIGMA_MAX=1.0,
    TOPK=2, SMOOTH_EPS=0.05,
    seed=1, val_frac=0.2
):
    """
    Calibration ablation: 取消后验线性校准与 CAL 切分。
    90%训练数据全部用作 TV；在 TV 内部切 TR/VA 做早停与挑 best_it。
    无任何泄漏（TEST 仅在最后使用）。
    """
    rng = np.random.RandomState(seed)

    # ---- TV = 全部训练；在 TV 内部切 TR/VA ----
    X_tv, y_tv = X_all, y_all
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_tv, y_tv, test_size=val_frac, random_state=seed
    )

    # ---- 1) Anchor：在 TR/VA 上选 best_it，然后在 TV 上重训 ----
    gbdt_big = GradientBoostingRegressor(
        n_estimators=2000, learning_rate=0.05, max_depth=3,
        subsample=1.0, random_state=seed
    ).fit(X_tr, y_tr)

    va_curve = [rmse_score(y_va, p) for p in gbdt_big.staged_predict(X_va)]
    best_it  = int(np.argmin(va_curve)) + 1

    gbdt_sub   = GradientBoostingRegressor(
        n_estimators=best_it, learning_rate=0.05, max_depth=3,
        subsample=1.0, random_state=seed
    ).fit(X_tr, y_tr)

    gbdt_final = GradientBoostingRegressor(
        n_estimators=best_it, learning_rate=0.05, max_depth=3,
        subsample=1.0, random_state=seed
    ).fit(X_tv, y_tv)

    # ---- 2) z-score 统计（Phase-1 用 TR，Phase-2 用 TV）----
    my_tr, sy_tr = zscore_fit(y_tr)
    my_tv, sy_tv = zscore_fit(y_tv)

    # ---- 3) 生成 TR/VA 的 anchor，并把 anchor 作为额外特征耦合 ----
    mu_tr_anchor = (gbdt_sub.predict(X_tr) - my_tr) / sy_tr
    mu_va_anchor = (gbdt_sub.predict(X_va) - my_tr) / sy_tr
    X_tr_aug = np.column_stack([X_tr, mu_tr_anchor])
    X_va_aug = np.column_stack([X_va, mu_va_anchor])

    # 标准化（以 TR 为基）并转 tensor
    if standardize_x:
        mx_tr = X_tr_aug.mean(0, keepdims=True)
        sx_tr = X_tr_aug.std(0, keepdims=True) + 1e-8
    else:
        mx_tr = np.zeros((1, X_tr_aug.shape[1])); sx_tr = np.ones((1, X_tr_aug.shape[1]))

    def to_tensor(x, yz=None, mx=None, sx=None):
        Xt = torch.tensor(((x - mx)/sx).astype(np.float32), device=DEVICE)
        if yz is None: return Xt
        Yt = torch.tensor(yz.astype(np.float32), device=DEVICE)
        return Xt, Yt

    y_tr_z = (y_tr - my_tr) / sy_tr
    y_va_z = (y_va - my_tr) / sy_tr

    model = BLRMoE(  # 这里用“完整模型”（含 Router）
        d=X_tr_aug.shape[1], D=D, K=K, hid=HID, nc=NC,
        mean_mode=MEAN_MODE, delta_l2=DELTA_L2,
        sigma_min=SIGMA_MIN, sigma_max=SIGMA_MAX,
        topk=TOPK, smooth_eps=SMOOTH_EPS
    ).to(DEVICE)
    opt = optim.AdamW(model.parameters(), lr=LR, weight_decay=3e-4)

    Xtr_t, ytr_t = to_tensor(X_tr_aug, y_tr_z, mx_tr, sx_tr)
    Xva_t, yva_t = to_tensor(X_va_aug, y_va_z, mx_tr, sx_tr)
    mu_tr_t = torch.tensor(mu_tr_anchor.astype(np.float32), device=DEVICE)
    mu_va_t = torch.tensor(mu_va_anchor.astype(np.float32), device=DEVICE)

    best_ep, best_vnll, best_state = 0, +1e9, None
    for ep in range(1, EPOCHS+1):
        model.train(); opt.zero_grad()
        loss = model.nll(Xtr_t, ytr_t, mu_anchor_z=mu_tr_t, epoch=ep, warmup_epochs=150)
        loss.backward(); nn.utils.clip_grad_norm_(model.parameters(), 2.0); opt.step()
        model.router.tau = max(model.router.tau * 0.995, 1.0)

        model.eval()
        with torch.no_grad():
            vnll = float(model.nll(Xva_t, yva_t, mu_anchor_z=mu_va_t).cpu().item())
        if vnll < best_vnll:
            best_vnll = vnll; best_ep = ep
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    # ---- 4) Phase-2：用 TV 复训；为 TV/TE 生成 anchor ----
    mu_tv_anchor = (gbdt_final.predict(X_tv) - my_tv) / sy_tv
    mu_te_anchor = (gbdt_final.predict(X_te) - my_tv) / sy_tv

    X_tv_aug = np.column_stack([X_tv, mu_tv_anchor])
    X_te_aug = np.column_stack([X_te, mu_te_anchor])

    if standardize_x:
        mx_tv = X_tv_aug.mean(0, keepdims=True)
        sx_tv = X_tv_aug.std(0, keepdims=True) + 1e-8
    else:
        mx_tv = np.zeros((1, X_tv_aug.shape[1])); sx_tv = np.ones((1, X_tv_aug.shape[1]))

    y_tv_z = (y_tv - my_tv) / sy_tv
    y_te_z = (y_te - my_tv) / sy_tv

    model.load_state_dict(best_state)
    model.train()
    opt = optim.AdamW(model.parameters(), lr=LR, weight_decay=3e-4)

    Xtv_t, ytv_t = to_tensor(X_tv_aug, y_tv_z, mx_tv, sx_tv)
    Xte_t        = to_tensor(X_te_aug, None,   mx_tv, sx_tv)
    mu_tv_t  = torch.tensor(mu_tv_anchor.astype(np.float32), device=DEVICE)
    mu_te_t  = torch.tensor(mu_te_anchor.astype(np.float32), device=DEVICE)

    for ep in range(1, best_ep+1):
        opt.zero_grad()
        loss = model.nll(Xtv_t, ytv_t, mu_anchor_z=mu_tv_t, epoch=ep, warmup_epochs=150)
        loss.backward(); nn.utils.clip_grad_norm_(model.parameters(), 2.0); opt.step()
        model.router.tau = max(model.router.tau * 0.995, 1.0)

    # ---- 5) TEST：无校准（直接用 TV 统计还原均值）；NLL 用 z 评估 ----
    model.eval()
    with torch.no_grad():
        yte_t = torch.tensor(y_te_z.astype(np.float32), device=DEVICE)
        test_nll = float(model.nll(Xte_t, yte_t, mu_anchor_z=mu_te_t).cpu().item())

        mu_z_te, _ = model.predict_mean_var(Xte_t, mu_anchor_z=mu_te_t)
        mu_te_orig = mu_z_te.cpu().numpy().astype(np.float64) * sy_tv + my_tv  # 无校准

    rmse = rmse_score(y_te.astype(np.float64), mu_te_orig)
    r2   = float(r2_score(y_te.astype(np.float64), mu_te_orig))
    return rmse, test_nll, best_ep, r2



## Housing


In [5]:
# === Run on Housing — Calibration Ablation (NO post-hoc CAL), leak-free ===
set_seed(1)
X, y = load_dataset("housing", verbose=True)

SEED = 1
n = X.shape[0]
rng = np.random.RandomState(SEED)

# 20 × (90/10) 外层划分
splits = []
for i in range(20):
    perm = rng.permutation(n)
    tr = perm[: round(n * 0.9)]
    te = perm[round(n * 0.9):]
    splits.append((tr, te))

# 超参（与 Anchor-MoE 主设定一致）
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS     = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS     = 2, 0.05

rmses, nlls, r2s = [], [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, r2 = train_one_split_no_cal(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}  R²={r2:.4f}")
    rmses.append(rmse); nlls.append(nll); r2s.append(r2)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 汇总
rmses = np.asarray(rmses, dtype=np.float64)
nlls  = np.asarray(nlls,  dtype=np.float64)
r2s   = np.asarray(r2s,   dtype=np.float64)
se = lambda a: a.std(ddof=1)/np.sqrt(len(a))

print("\n== Anchor-MoE (Calibration Ablation: NO CAL) on Housing ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")
print(f" R²         = {r2s.mean():.4f} ± {se(r2s):.4f}")

[Housing] X.shape=(506, 13) y.shape=(506,) | y∈[5.000,50.000]
[01/20] MoE best_ep= 14  TestRMSE(orig)=2.5180  TestNLL(z)=0.2402  R²=0.8867
[02/20] MoE best_ep= 14  TestRMSE(orig)=2.5395  TestNLL(z)=0.4070  R²=0.8973
[03/20] MoE best_ep= 10  TestRMSE(orig)=2.5489  TestNLL(z)=0.2886  R²=0.9212
[04/20] MoE best_ep= 10  TestRMSE(orig)=2.9835  TestNLL(z)=0.4587  R²=0.9087
[05/20] MoE best_ep= 12  TestRMSE(orig)=3.0009  TestNLL(z)=0.6621  R²=0.9293
[06/20] MoE best_ep= 11  TestRMSE(orig)=2.5941  TestNLL(z)=0.3232  R²=0.9172
[07/20] MoE best_ep= 10  TestRMSE(orig)=1.9909  TestNLL(z)=0.4026  R²=0.8782
[08/20] MoE best_ep= 11  TestRMSE(orig)=2.9305  TestNLL(z)=0.4421  R²=0.8788
[09/20] MoE best_ep= 13  TestRMSE(orig)=3.0064  TestNLL(z)=1.1618  R²=0.8916
[10/20] MoE best_ep=  9  TestRMSE(orig)=4.2192  TestNLL(z)=0.8531  R²=0.8068
[11/20] MoE best_ep= 10  TestRMSE(orig)=2.9192  TestNLL(z)=0.7322  R²=0.9136
[12/20] MoE best_ep= 10  TestRMSE(orig)=2.1877  TestNLL(z)=0.1980  R²=0.9301
[13/20] MoE be

## Concrete

In [3]:
# === Run on Housing — Calibration Ablation (NO post-hoc CAL), leak-free ===
set_seed(1)
X, y = load_dataset("concrete", verbose=True)

SEED = 1
n = X.shape[0]
rng = np.random.RandomState(SEED)

# 20 × (90/10) 外层划分
splits = []
for i in range(20):
    perm = rng.permutation(n)
    tr = perm[: round(n * 0.9)]
    te = perm[round(n * 0.9):]
    splits.append((tr, te))

# 超参（与 Anchor-MoE 主设定一致）
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS     = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS     = 2, 0.05

rmses, nlls, r2s = [], [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, r2 = train_one_split_no_cal(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}  R²={r2:.4f}")
    rmses.append(rmse); nlls.append(nll); r2s.append(r2)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 汇总
rmses = np.asarray(rmses, dtype=np.float64)
nlls  = np.asarray(nlls,  dtype=np.float64)
r2s   = np.asarray(r2s,   dtype=np.float64)
se = lambda a: a.std(ddof=1)/np.sqrt(len(a))

print("\n== Anchor-MoE (Calibration Ablation: NO CAL) on Housing ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")
print(f" R²         = {r2s.mean():.4f} ± {se(r2s):.4f}")

[Concrete] X.shape=(1030, 8) y.shape=(1030,) | y∈[2.332,82.599]
[01/20] MoE best_ep= 18  TestRMSE(orig)=4.7647  TestNLL(z)=0.4434  R²=0.9259
[02/20] MoE best_ep= 17  TestRMSE(orig)=3.4393  TestNLL(z)=-0.0077  R²=0.9541
[03/20] MoE best_ep= 18  TestRMSE(orig)=2.8907  TestNLL(z)=-0.1717  R²=0.9716
[04/20] MoE best_ep= 15  TestRMSE(orig)=4.4722  TestNLL(z)=0.1325  R²=0.9272
[05/20] MoE best_ep= 13  TestRMSE(orig)=4.6924  TestNLL(z)=0.2881  R²=0.9080
[06/20] MoE best_ep= 15  TestRMSE(orig)=3.6997  TestNLL(z)=-0.0508  R²=0.9481
[07/20] MoE best_ep= 16  TestRMSE(orig)=4.2595  TestNLL(z)=0.4259  R²=0.9384
[08/20] MoE best_ep= 16  TestRMSE(orig)=4.8239  TestNLL(z)=0.0243  R²=0.9116
[09/20] MoE best_ep= 16  TestRMSE(orig)=4.1273  TestNLL(z)=0.1597  R²=0.9415
[10/20] MoE best_ep= 16  TestRMSE(orig)=4.7230  TestNLL(z)=0.3897  R²=0.9230
[11/20] MoE best_ep= 15  TestRMSE(orig)=3.6446  TestNLL(z)=-0.0125  R²=0.9452
[12/20] MoE best_ep= 12  TestRMSE(orig)=4.4414  TestNLL(z)=0.1453  R²=0.9263
[13/20] 

## Energy

In [4]:
# === Run on Housing — Calibration Ablation (NO post-hoc CAL), leak-free ===
set_seed(1)
X, y = load_dataset("energy", verbose=True)

SEED = 1
n = X.shape[0]
rng = np.random.RandomState(SEED)

# 20 × (90/10) 外层划分
splits = []
for i in range(20):
    perm = rng.permutation(n)
    tr = perm[: round(n * 0.9)]
    te = perm[round(n * 0.9):]
    splits.append((tr, te))

# 超参（与 Anchor-MoE 主设定一致）
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS     = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS     = 2, 0.05

rmses, nlls, r2s = [], [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, r2 = train_one_split_no_cal(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}  R²={r2:.4f}")
    rmses.append(rmse); nlls.append(nll); r2s.append(r2)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 汇总
rmses = np.asarray(rmses, dtype=np.float64)
nlls  = np.asarray(nlls,  dtype=np.float64)
r2s   = np.asarray(r2s,   dtype=np.float64)
se = lambda a: a.std(ddof=1)/np.sqrt(len(a))

print("\n== Anchor-MoE (Calibration Ablation: NO CAL) on Housing ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")
print(f" R²         = {r2s.mean():.4f} ± {se(r2s):.4f}")

[Energy] X.shape=(768, 8) y.shape=(768,) | y∈[10.900,48.030]
[01/20] MoE best_ep= 61  TestRMSE(orig)=0.9426  TestNLL(z)=-0.9539  R²=0.9904
[02/20] MoE best_ep= 96  TestRMSE(orig)=0.9832  TestNLL(z)=-0.9238  R²=0.9906
[03/20] MoE best_ep=169  TestRMSE(orig)=0.8509  TestNLL(z)=-1.2703  R²=0.9918
[04/20] MoE best_ep=331  TestRMSE(orig)=0.9713  TestNLL(z)=-0.9051  R²=0.9868
[05/20] MoE best_ep=347  TestRMSE(orig)=1.1201  TestNLL(z)=-0.6470  R²=0.9867
[06/20] MoE best_ep=120  TestRMSE(orig)=1.0670  TestNLL(z)=-1.0686  R²=0.9860
[07/20] MoE best_ep=350  TestRMSE(orig)=1.0840  TestNLL(z)=-0.4402  R²=0.9891
[08/20] MoE best_ep=336  TestRMSE(orig)=0.7901  TestNLL(z)=-1.2204  R²=0.9937
[09/20] MoE best_ep= 33  TestRMSE(orig)=0.9056  TestNLL(z)=-0.9674  R²=0.9902
[10/20] MoE best_ep=121  TestRMSE(orig)=0.9693  TestNLL(z)=-0.9573  R²=0.9894
[11/20] MoE best_ep=381  TestRMSE(orig)=1.1105  TestNLL(z)=-0.6504  R²=0.9867
[12/20] MoE best_ep= 41  TestRMSE(orig)=0.8842  TestNLL(z)=-1.2069  R²=0.9911
[13

In [5]:
# === Run on Housing — Calibration Ablation (NO post-hoc CAL), leak-free ===
set_seed(1)
X, y = load_dataset("kin8nm", verbose=True)

SEED = 1
n = X.shape[0]
rng = np.random.RandomState(SEED)

# 20 × (90/10) 外层划分
splits = []
for i in range(20):
    perm = rng.permutation(n)
    tr = perm[: round(n * 0.9)]
    te = perm[round(n * 0.9):]
    splits.append((tr, te))

# 超参（与 Anchor-MoE 主设定一致）
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS     = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS     = 2, 0.05

rmses, nlls, r2s = [], [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, r2 = train_one_split_no_cal(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}  R²={r2:.4f}")
    rmses.append(rmse); nlls.append(nll); r2s.append(r2)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 汇总
rmses = np.asarray(rmses, dtype=np.float64)
nlls  = np.asarray(nlls,  dtype=np.float64)
r2s   = np.asarray(r2s,   dtype=np.float64)
se = lambda a: a.std(ddof=1)/np.sqrt(len(a))

print("\n== Anchor-MoE (Calibration Ablation: NO CAL) on Housing ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")
print(f" R²         = {r2s.mean():.4f} ± {se(r2s):.4f}")

[01/20] MoE best_ep=  6  TestRMSE(orig)=0.1515  TestNLL(z)=0.9876  R²=0.6823
[02/20] MoE best_ep=  7  TestRMSE(orig)=0.1475  TestNLL(z)=0.9942  R²=0.6921
[03/20] MoE best_ep=  7  TestRMSE(orig)=0.1483  TestNLL(z)=0.9738  R²=0.6708
[04/20] MoE best_ep=  7  TestRMSE(orig)=0.1422  TestNLL(z)=0.9972  R²=0.7104
[05/20] MoE best_ep=  5  TestRMSE(orig)=0.1492  TestNLL(z)=0.9190  R²=0.6363
[06/20] MoE best_ep=  7  TestRMSE(orig)=0.1474  TestNLL(z)=1.0024  R²=0.7023
[07/20] MoE best_ep=  7  TestRMSE(orig)=0.1517  TestNLL(z)=1.0784  R²=0.6434
[08/20] MoE best_ep=  6  TestRMSE(orig)=0.1421  TestNLL(z)=0.9413  R²=0.7363
[09/20] MoE best_ep=  7  TestRMSE(orig)=0.1370  TestNLL(z)=0.8412  R²=0.7247
[10/20] MoE best_ep=  6  TestRMSE(orig)=0.1500  TestNLL(z)=0.9915  R²=0.6860
[11/20] MoE best_ep=  7  TestRMSE(orig)=0.1423  TestNLL(z)=0.9567  R²=0.7242
[12/20] MoE best_ep=  7  TestRMSE(orig)=0.1473  TestNLL(z)=1.0254  R²=0.6846
[13/20] MoE best_ep=  7  TestRMSE(orig)=0.1482  TestNLL(z)=0.9192  R²=0.7016

## Naval

In [3]:
# === Run on Housing — Calibration Ablation (NO post-hoc CAL), leak-free ===
set_seed(1)
X, y = load_dataset("naval", verbose=True)

SEED = 1
n = X.shape[0]
rng = np.random.RandomState(SEED)

# 20 × (90/10) 外层划分
splits = []
for i in range(20):
    perm = rng.permutation(n)
    tr = perm[: round(n * 0.9)]
    te = perm[round(n * 0.9):]
    splits.append((tr, te))

# 超参（与 Anchor-MoE 主设定一致）
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS     = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS     = 2, 0.05

rmses, nlls, r2s = [], [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, r2 = train_one_split_no_cal(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}  R²={r2:.4f}")
    rmses.append(rmse); nlls.append(nll); r2s.append(r2)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 汇总
rmses = np.asarray(rmses, dtype=np.float64)
nlls  = np.asarray(nlls,  dtype=np.float64)
r2s   = np.asarray(r2s,   dtype=np.float64)
se = lambda a: a.std(ddof=1)/np.sqrt(len(a))

print("\n== Anchor-MoE (Calibration Ablation: NO CAL) on Housing ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")
print(f" R²         = {r2s.mean():.4f} ± {se(r2s):.4f}")

[01/20] MoE best_ep=279  TestRMSE(orig)=0.0008  TestNLL(z)=-0.9827  R²=0.9894
[02/20] MoE best_ep=149  TestRMSE(orig)=0.0005  TestNLL(z)=-1.2423  R²=0.9950
[03/20] MoE best_ep=304  TestRMSE(orig)=0.0006  TestNLL(z)=-0.9949  R²=0.9933
[04/20] MoE best_ep=229  TestRMSE(orig)=0.0006  TestNLL(z)=-1.3553  R²=0.9943
[05/20] MoE best_ep=279  TestRMSE(orig)=0.0006  TestNLL(z)=-1.0081  R²=0.9935
[06/20] MoE best_ep=146  TestRMSE(orig)=0.0006  TestNLL(z)=-1.2442  R²=0.9939
[07/20] MoE best_ep= 90  TestRMSE(orig)=0.0006  TestNLL(z)=-1.0729  R²=0.9930
[08/20] MoE best_ep=103  TestRMSE(orig)=0.0005  TestNLL(z)=-1.2429  R²=0.9949
[09/20] MoE best_ep=183  TestRMSE(orig)=0.0006  TestNLL(z)=-1.0587  R²=0.9934
[10/20] MoE best_ep=104  TestRMSE(orig)=0.0006  TestNLL(z)=-1.1893  R²=0.9932
[11/20] MoE best_ep=195  TestRMSE(orig)=0.0006  TestNLL(z)=-0.9942  R²=0.9939
[12/20] MoE best_ep=372  TestRMSE(orig)=0.0005  TestNLL(z)=-1.1910  R²=0.9945
[13/20] MoE best_ep=242  TestRMSE(orig)=0.0006  TestNLL(z)=-1.10

## Power

In [7]:
# === Run on Housing — Calibration Ablation (NO post-hoc CAL), leak-free ===
set_seed(1)
X, y = load_dataset("power", verbose=True)

SEED = 1
n = X.shape[0]
rng = np.random.RandomState(SEED)

# 20 × (90/10) 外层划分
splits = []
for i in range(20):
    perm = rng.permutation(n)
    tr = perm[: round(n * 0.9)]
    te = perm[round(n * 0.9):]
    splits.append((tr, te))

# 超参（与 Anchor-MoE 主设定一致）
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS     = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS     = 2, 0.05

rmses, nlls, r2s = [], [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, r2 = train_one_split_no_cal(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}  R²={r2:.4f}")
    rmses.append(rmse); nlls.append(nll); r2s.append(r2)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 汇总
rmses = np.asarray(rmses, dtype=np.float64)
nlls  = np.asarray(nlls,  dtype=np.float64)
r2s   = np.asarray(r2s,   dtype=np.float64)
se = lambda a: a.std(ddof=1)/np.sqrt(len(a))

print("\n== Anchor-MoE (Calibration Ablation: NO CAL) on Housing ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")
print(f" R²         = {r2s.mean():.4f} ± {se(r2s):.4f}")

[Power(local)] X.shape=(9568, 4) y.shape=(9568,)
[01/20] MoE best_ep=143  TestRMSE(orig)=3.1920  TestNLL(z)=-0.0919  R²=0.9633
[02/20] MoE best_ep= 22  TestRMSE(orig)=3.2945  TestNLL(z)=-0.0917  R²=0.9634
[03/20] MoE best_ep= 66  TestRMSE(orig)=2.9467  TestNLL(z)=-0.2605  R²=0.9713
[04/20] MoE best_ep= 18  TestRMSE(orig)=2.9708  TestNLL(z)=-0.2593  R²=0.9677
[05/20] MoE best_ep= 56  TestRMSE(orig)=3.6666  TestNLL(z)=-0.0787  R²=0.9553
[06/20] MoE best_ep= 59  TestRMSE(orig)=3.4606  TestNLL(z)=-0.0649  R²=0.9591
[07/20] MoE best_ep= 38  TestRMSE(orig)=3.0472  TestNLL(z)=-0.2186  R²=0.9674
[08/20] MoE best_ep=102  TestRMSE(orig)=2.9472  TestNLL(z)=0.0552  R²=0.9706
[09/20] MoE best_ep= 23  TestRMSE(orig)=3.3879  TestNLL(z)=-0.1962  R²=0.9615
[10/20] MoE best_ep= 20  TestRMSE(orig)=3.2815  TestNLL(z)=-0.2119  R²=0.9623
[11/20] MoE best_ep= 24  TestRMSE(orig)=2.9981  TestNLL(z)=-0.2023  R²=0.9684
[12/20] MoE best_ep= 18  TestRMSE(orig)=3.2945  TestNLL(z)=-0.0847  R²=0.9636
[13/20] MoE best

## Protein

In [8]:
# === Run on Protein — Calibration Ablation (NO post-hoc CAL), leak-free (downsample to 10k) ===
set_seed(1)
X, y = load_dataset("protein", verbose=True)   # requires local 'protein.csv'

SEED = 1
rng  = np.random.RandomState(SEED)

# Downsample to 10,000 before any splitting (leak-free)
MAX_N = 10_000
if X.shape[0] > MAX_N:
    idx = rng.choice(X.shape[0], MAX_N, replace=False)
    X = X[idx].copy()
    y = y[idx].copy()

n = X.shape[0]

# 20 × (90/10) 外层划分
splits = []
for i in range(20):
    perm = rng.permutation(n)
    tr = perm[: round(n * 0.9)]
    te = perm[round(n * 0.9):]
    splits.append((tr, te))

# 超参（与 Anchor-MoE 主设定一致）
STANDARDIZE_X = True
D, K, HID, NC  = 2, 8, 128, 3
LR, EPOCHS     = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS     = 2, 0.05

rmses, nlls, r2s = [], [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, r2 = train_one_split_no_cal(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}  R²={r2:.4f}")
    rmses.append(rmse); nlls.append(nll); r2s.append(r2)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 汇总
rmses = np.asarray(rmses, dtype=np.float64)
nlls  = np.asarray(nlls,  dtype=np.float64)
r2s   = np.asarray(r2s,   dtype=np.float64)
se = lambda a: a.std(ddof=1)/np.sqrt(len(a))

print("\n== Anchor-MoE (Calibration Ablation: NO CAL) on Protein (10k downsample) ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")
print(f" R²         = {r2s.mean():.4f} ± {se(r2s):.4f}")

[01/20] MoE best_ep=116  TestRMSE(orig)=4.4435  TestNLL(z)=1.0201  R²=0.4890
[02/20] MoE best_ep=  6  TestRMSE(orig)=4.3157  TestNLL(z)=1.0940  R²=0.5116
[03/20] MoE best_ep= 24  TestRMSE(orig)=4.2766  TestNLL(z)=0.9442  R²=0.4970
[04/20] MoE best_ep= 73  TestRMSE(orig)=4.3144  TestNLL(z)=0.7369  R²=0.4867
[05/20] MoE best_ep= 84  TestRMSE(orig)=4.4873  TestNLL(z)=0.9044  R²=0.4712
[06/20] MoE best_ep= 88  TestRMSE(orig)=4.6044  TestNLL(z)=1.0003  R²=0.4451
[07/20] MoE best_ep= 69  TestRMSE(orig)=4.2610  TestNLL(z)=0.8132  R²=0.5151
[08/20] MoE best_ep= 74  TestRMSE(orig)=4.3224  TestNLL(z)=0.8632  R²=0.5149
[09/20] MoE best_ep=  5  TestRMSE(orig)=4.3314  TestNLL(z)=1.1174  R²=0.5218
[10/20] MoE best_ep=  5  TestRMSE(orig)=4.5072  TestNLL(z)=1.0794  R²=0.4602
[11/20] MoE best_ep=  7  TestRMSE(orig)=4.3232  TestNLL(z)=1.0591  R²=0.5087
[12/20] MoE best_ep=169  TestRMSE(orig)=4.2366  TestNLL(z)=0.7541  R²=0.5140
[13/20] MoE best_ep= 78  TestRMSE(orig)=4.3122  TestNLL(z)=0.7623  R²=0.5193

## Wine

In [10]:
# === Run on Housing — Calibration Ablation (NO post-hoc CAL), leak-free ===
set_seed(1)
X, y = load_dataset("wine", verbose=True)

SEED = 1
n = X.shape[0]
rng = np.random.RandomState(SEED)

# 20 × (90/10) 外层划分
splits = []
for i in range(20):
    perm = rng.permutation(n)
    tr = perm[: round(n * 0.9)]
    te = perm[round(n * 0.9):]
    splits.append((tr, te))

# 超参（与 Anchor-MoE 主设定一致）
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS     = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS     = 2, 0.05

rmses, nlls, r2s = [], [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, r2 = train_one_split_no_cal(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}  R²={r2:.4f}")
    rmses.append(rmse); nlls.append(nll); r2s.append(r2)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 汇总
rmses = np.asarray(rmses, dtype=np.float64)
nlls  = np.asarray(nlls,  dtype=np.float64)
r2s   = np.asarray(r2s,   dtype=np.float64)
se = lambda a: a.std(ddof=1)/np.sqrt(len(a))

print("\n== Anchor-MoE (Calibration Ablation: NO CAL) on Housing ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")
print(f" R²         = {r2s.mean():.4f} ± {se(r2s):.4f}")

[WineRed] X.shape=(1599, 11) y.shape=(1599,) | y∈[3.000,8.000]
[01/20] MoE best_ep=  1  TestRMSE(orig)=0.5817  TestNLL(z)=1.0947  R²=0.5390
[02/20] MoE best_ep=  3  TestRMSE(orig)=0.5605  TestNLL(z)=1.1032  R²=0.4922
[03/20] MoE best_ep=  5  TestRMSE(orig)=0.6973  TestNLL(z)=1.4297  R²=0.4524
[04/20] MoE best_ep=  3  TestRMSE(orig)=0.6043  TestNLL(z)=1.1490  R²=0.4244
[05/20] MoE best_ep=  3  TestRMSE(orig)=0.5705  TestNLL(z)=1.1133  R²=0.4163
[06/20] MoE best_ep=  4  TestRMSE(orig)=0.6169  TestNLL(z)=1.1464  R²=0.4065
[07/20] MoE best_ep=  7  TestRMSE(orig)=0.6311  TestNLL(z)=1.3918  R²=0.4794
[08/20] MoE best_ep=  2  TestRMSE(orig)=0.6536  TestNLL(z)=1.1885  R²=0.4416
[09/20] MoE best_ep=  5  TestRMSE(orig)=0.5806  TestNLL(z)=1.1549  R²=0.4523
[10/20] MoE best_ep=  9  TestRMSE(orig)=0.6134  TestNLL(z)=1.3013  R²=0.5039
[11/20] MoE best_ep=  7  TestRMSE(orig)=0.6394  TestNLL(z)=1.5736  R²=0.4249
[12/20] MoE best_ep=  8  TestRMSE(orig)=0.6377  TestNLL(z)=1.1673  R²=0.2898
[13/20] MoE b

In [11]:
# === Run on Housing — Calibration Ablation (NO post-hoc CAL), leak-free ===
set_seed(1)
X, y = load_dataset("yacht", verbose=True)

SEED = 1
n = X.shape[0]
rng = np.random.RandomState(SEED)

# 20 × (90/10) 外层划分
splits = []
for i in range(20):
    perm = rng.permutation(n)
    tr = perm[: round(n * 0.9)]
    te = perm[round(n * 0.9):]
    splits.append((tr, te))

# 超参（与 Anchor-MoE 主设定一致）
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS     = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS     = 2, 0.05

rmses, nlls, r2s = [], [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, r2 = train_one_split_no_cal(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}  R²={r2:.4f}")
    rmses.append(rmse); nlls.append(nll); r2s.append(r2)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 汇总
rmses = np.asarray(rmses, dtype=np.float64)
nlls  = np.asarray(nlls,  dtype=np.float64)
r2s   = np.asarray(r2s,   dtype=np.float64)
se = lambda a: a.std(ddof=1)/np.sqrt(len(a))

print("\n== Anchor-MoE (Calibration Ablation: NO CAL) on Housing ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")
print(f" R²         = {r2s.mean():.4f} ± {se(r2s):.4f}")

[Yacht] X.shape=(308, 6) y.shape=(308,) | y∈[0.010,62.420]
[01/20] MoE best_ep=394  TestRMSE(orig)=0.6753  TestNLL(z)=-1.6954  R²=0.9981
[02/20] MoE best_ep=298  TestRMSE(orig)=0.4221  TestNLL(z)=-1.9212  R²=0.9991
[03/20] MoE best_ep=380  TestRMSE(orig)=0.3478  TestNLL(z)=-1.9736  R²=0.9990
[04/20] MoE best_ep=188  TestRMSE(orig)=0.5106  TestNLL(z)=-1.8341  R²=0.9992
[05/20] MoE best_ep=303  TestRMSE(orig)=0.3559  TestNLL(z)=-1.9596  R²=0.9996
[06/20] MoE best_ep=199  TestRMSE(orig)=0.4025  TestNLL(z)=-1.9257  R²=0.9988
[07/20] MoE best_ep=244  TestRMSE(orig)=0.2699  TestNLL(z)=-2.0159  R²=0.9982
[08/20] MoE best_ep= 62  TestRMSE(orig)=0.8250  TestNLL(z)=-1.5515  R²=0.9968
[09/20] MoE best_ep=188  TestRMSE(orig)=0.4697  TestNLL(z)=-1.8853  R²=0.9987
[10/20] MoE best_ep=289  TestRMSE(orig)=0.3375  TestNLL(z)=-1.9937  R²=0.9987
[11/20] MoE best_ep=267  TestRMSE(orig)=0.6499  TestNLL(z)=-1.7082  R²=0.9986
[12/20] MoE best_ep=178  TestRMSE(orig)=0.7129  TestNLL(z)=-1.7633  R²=0.9967
[13/2