# **Ablating Anchor**

## Model Definition

In [17]:
# =========================
# Notebook "definitions" cell (no side effects)
# =========================
import os, math, gc
import numpy as np
import pandas as pd
from typing import Tuple, Dict, List, Optional

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# ---------- Globals ----------
LOG2PI  = math.log(2*math.pi)
DEVICE  = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.set_default_dtype(torch.float32)

# ---------- Utilities ----------
def set_seed(seed: int = 1) -> None:
    np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def rmse_score(y_true, y_pred) -> float:
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def zscore_fit(y_train: np.ndarray) -> Tuple[float, float]:
    my = float(np.mean(y_train)); sy = float(np.std(y_train) + 1e-8)
    return my, sy

def to_tensor(x: np.ndarray, yz: Optional[np.ndarray] = None,
              mx: Optional[np.ndarray] = None, sx: Optional[np.ndarray] = None):
    if mx is None: mx = 0.0
    if sx is None: sx = 1.0
    Xt = torch.tensor(((x - mx)/sx).astype(np.float32), device=DEVICE)
    if yz is None: return Xt
    Yt = torch.tensor(yz.astype(np.float32), device=DEVICE)
    return Xt, Yt

def make_random_splits(n: int, train_frac: float = 0.9, n_splits: int = 20,
                       seed: int = 1) -> List[Tuple[np.ndarray, np.ndarray]]:
    rng = np.random.RandomState(seed)
    splits = []
    for _ in range(n_splits):
        perm = rng.permutation(n)
        tr = perm[: round(n*train_frac)]
        te = perm[round(n*train_frac):]
        splits.append((tr, te))
    return splits

def _clean_numeric(df: pd.DataFrame) -> pd.DataFrame:
    df = df.apply(pd.to_numeric, errors="coerce")
    df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
    return df

# ---------- Dataset loaders ----------
def load_dataset(name: str, verbose: bool = False, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """
    Supported public datasets (mainly numeric):
      - 'housing' (UCI Boston Housing)
      - 'concrete' (UCI Concrete compressive strength)
      - 'energy'   (ENB2012_data.xlsx, using Y2 as target)
      - 'wine'     (Wine Quality - red)
      - 'yacht'    (Yacht Hydrodynamics)
    Local-file datasets (provide paths via kwargs):
      - 'kin8nm'  -> kin8nm.csv
      - 'protein' -> protein.csv
      - 'naval'   -> naval.txt
      - 'power'   -> power.xlsx (UCI CCPP preprocessed)
      - 'msd'     -> YearPredictionMSD.txt
    """
    key = name.lower().strip()

    if key == "housing":
        df = pd.read_csv(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
            header=None, sep=r"\s+", engine="python", comment="#", skip_blank_lines=True
        )
        if df.shape[1] == 1:  # some mirrors pack all columns into one
            df = df[0].astype(str).str.strip().str.split(r"\s+", expand=True)
        df = _clean_numeric(df); assert df.shape[1] >= 14
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)

    elif key == "concrete":
        df = pd.read_excel(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
        )
        df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)

    elif key == "energy":
        df = pd.read_excel(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
        ).iloc[:, :-1]
        df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64)  # features: all but last
        y = df.iloc[:, -1].to_numpy(np.float64)   # target: last (Y2)

    elif key == "wine":
        df = pd.read_csv(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
            delimiter=";"
        )
        df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)

    elif key == "yacht":
        df = pd.read_csv(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data",
            header=None, sep=r"\s+", engine="python", comment="#", skip_blank_lines=True
        )
        if df.shape[1] == 1:
            df = df[0].astype(str).str.strip().str.split(r"\s+", expand=True)
        df = _clean_numeric(df); assert df.shape[1] >= 7
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)

    elif key == "kin8nm":
        local = kwargs.get("path", "kin8nm.csv")
        if not os.path.exists(local): raise FileNotFoundError(f"[kin8nm] missing file: {local}")
        df = pd.read_csv(local); df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64); y = df.iloc[:, -1].to_numpy(np.float64)

    elif key == "protein":
        local = kwargs.get("path", "protein.csv")
        if not os.path.exists(local): raise FileNotFoundError(f"[protein] missing file: {local}")
        df = pd.read_csv(local, sep=None, engine="python")
        df = _clean_numeric(df)
        y = df.iloc[:, 0].to_numpy(np.float64)
        X = df.iloc[:, 1:].to_numpy(np.float64)

    elif key == "naval":
        local = kwargs.get("path", "naval.txt")
        if not os.path.exists(local): raise FileNotFoundError(f"[naval] missing file: {local}")
        df = pd.read_csv(local, sep=r"\s+", header=None, engine="python")
        df = df.iloc[:, :-1]  # last column often duplicate/unused
        df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64)
        y = df.iloc[:, -1].to_numpy(np.float64)

    elif key == "power":
        local = kwargs.get("path", "power.xlsx")
        if not os.path.exists(local): raise FileNotFoundError(f"[power] missing file: {local}")
        df = pd.read_excel(local)
        df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64)
        y = df.iloc[:, -1].to_numpy(np.float64)

    elif key == "msd":
        local = kwargs.get("path", "YearPredictionMSD.txt")
        if not os.path.exists(local): raise FileNotFoundError(f"[msd] missing file: {local}")
        df = pd.read_csv(local, header=None)
        df = df.iloc[:, ::-1]  # make target last
        df = _clean_numeric(df)
        X = df.iloc[:, :-1].to_numpy(np.float64)
        y = df.iloc[:, -1].to_numpy(np.float64)

    else:
        raise ValueError("Unknown dataset name.")

    if verbose:
        print(f"[{name}] X.shape={X.shape} y.shape={y.shape} | y∈[{np.min(y):.3f},{np.max(y):.3f}]")
    return X, y

# ---------- MoE (No-Anchor) ----------
def _topk_mask(w: torch.Tensor, k: int = 2) -> torch.Tensor:
    _, topi = torch.topk(w, k, dim=-1)
    mask = torch.zeros_like(w).scatter_(-1, topi, 1.0)
    w2 = w * mask
    return w2 / (w2.sum(dim=-1, keepdim=True) + 1e-12)

def _topk_mask_smooth(w: torch.Tensor, k: int = 2, eps: float = 0.05) -> torch.Tensor:
    _, topi = torch.topk(w, k, dim=-1)
    mask = torch.zeros_like(w).scatter_(-1, topi, 1.0)
    w_top = w * mask
    w_top = w_top / (w_top.sum(dim=-1, keepdim=True) + 1e-12)
    return (1.0 - eps) * w_top + (eps / k) * mask

class Projection(nn.Module):
    def __init__(self, d: int, D: int):
        super().__init__()
        self.w = nn.Linear(d, D, bias=True)
        nn.init.xavier_uniform_(self.w.weight); nn.init.zeros_(self.w.bias)
    def forward(self, x): return self.w(x)

class Window(nn.Module):
    def __init__(self, K: int, D: int, min_log_s: float = -2.5, max_log_s: float = 1.0):
        super().__init__()
        self.c         = nn.Parameter(torch.randn(K, D))
        self.log_s     = nn.Parameter(torch.zeros(K, D))
        self.min_log_s = min_log_s; self.max_log_s = max_log_s
    def forward(self, z):
        log_s = torch.clamp(self.log_s, min=self.min_log_s, max=self.max_log_s)
        diff2 = ((z[:, None] - self.c)**2) / (2 * torch.exp(log_s)**2)
        return torch.exp(-diff2.sum(dim=-1)) + 1e-12  # [B,K]

class Router(nn.Module):
    def __init__(self, D: int, K: int):
        super().__init__()
        self.q   = nn.Linear(D, 64)
        self.k   = nn.Parameter(torch.randn(K, 64))
        self.tau = 3.0
    def forward(self, z):
        logits = (self.q(z) @ self.k.T) / math.sqrt(64)
        return F.softmax(logits / self.tau, dim=-1)

class ExpertMDN(nn.Module):
    def __init__(self, d: int, h: int, nc: int, sigma_min: float = 5e-2, sigma_max: float = 1.0):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d, h), nn.ReLU(),
            nn.Linear(h, h), nn.ReLU()
        )
        self.logits = nn.Linear(h, nc)
        self.means  = nn.Linear(h, nc)
        self.log_sc = nn.Linear(h, nc)
        self.sigma_min = float(sigma_min)
        self.sigma_max = float(sigma_max)
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight); nn.init.zeros_(m.bias)
    def forward(self, x):
        h  = self.net(x)
        pi = F.softmax(self.logits(h), dim=-1)
        mu = self.means(h)
        sg = torch.exp(self.log_sc(h)).clamp(self.sigma_min, self.sigma_max)
        return pi, mu, sg

class BLRMoE(nn.Module):
    """
    No-Anchor ablation:
      - Means come purely from experts (no anchor injection/coupling)
      - Keep API `mu_anchor_z` for compatibility, but it's ignored.
    """
    def __init__(self, d: int, D: int, K: int, hid: int, nc: int,
                 w_ent_warm: float = +1e-3, w_ent_cool: float = -2e-4,
                 l2_win: float = 1e-4, lb_coef: float = 1e-3,
                 sigma_min: float = 5e-2, sigma_max: float = 1.0,
                 topk: int = 2, smooth_eps: float = 0.05):
        super().__init__()
        self.proj   = Projection(d, D)
        self.win    = Window(K, D)
        self.router = Router(D, K)
        self.exps   = nn.ModuleList([ExpertMDN(d, hid, nc, sigma_min, sigma_max) for _ in range(K)])
        self.w_ent_warm = w_ent_warm
        self.w_ent_cool = w_ent_cool
        self.l2_win     = l2_win
        self.lb_coef    = lb_coef
        self.topk       = topk
        self.smooth_eps = smooth_eps
        self.K = K; self.nc = nc

    def _mixture_params(self, X: torch.Tensor, train: bool = True):
        z = self.proj(X)
        w = self.win(z) * self.router(z)
        w = w / (w.sum(dim=-1, keepdim=True) + 1e-12)
        w = _topk_mask_smooth(w, k=self.topk, eps=self.smooth_eps) if train else _topk_mask(w, k=self.topk)

        B, K, C = X.size(0), self.K, self.nc
        Pi = torch.full((B, K, C), 1.0/C, device=X.device)
        Mu = torch.zeros(B, K, C, device=X.device)
        Sg = torch.ones(B, K, C, device=X.device)

        _, topi = torch.topk(w, self.topk, dim=-1)
        uniq = torch.unique(topi)
        for j in uniq.tolist():
            pi_j, mu_j, sg_j = self.exps[j](X)
            mask = (topi == j).any(dim=1).float().unsqueeze(-1)
            Pi[:, j, :] = pi_j * mask + Pi[:, j, :]*(1 - mask)
            Mu[:, j, :] = mu_j * mask + Mu[:, j, :]*(1 - mask)
            Sg[:, j, :] = sg_j * mask + Sg[:, j, :]*(1 - mask)
        return w, Pi, Mu, Sg, z

    # keep signature; mu_anchor_z is ignored
    def nll(self, X: torch.Tensor, y_z: torch.Tensor, mu_anchor_z=None,
            epoch: int = 1, warmup_epochs: int = 150) -> torch.Tensor:
        train_flag = self.training
        w, Pi, Mu, Sg, z = self._mixture_params(X, train=train_flag)

        Mu_eff = Mu  # no-anchor
        yv   = y_z[:, None, None]
        logp = -0.5 * ((yv - Mu_eff)/Sg)**2 - torch.log(Sg) - 0.5*LOG2PI

        w3   = w[:, :, None]
        logw = torch.where(w3 > 0, torch.log(w3 + 1e-12), torch.full_like(w3, -1e9))
        logpi= torch.log(Pi + 1e-12)
        logmix = torch.logsumexp(logw + logpi + logp, dim=(1,2))
        nll = -logmix.mean()

        if train_flag:
            w_ent = self.w_ent_warm if epoch <= warmup_epochs else self.w_ent_cool
            p = self.router(z)
            ent = (p * torch.log(p + 1e-12)).sum(dim=1).mean()
            l2w = (self.win.log_s**2).mean()
            rho = p.mean(dim=0)
            lb_loss = ((rho - 1.0/p.size(1))**2).sum()
            nll = nll + w_ent*ent + self.l2_win*l2w + self.lb_coef*lb_loss
        return nll

    @torch.no_grad()
    def predict_mean_var(self, X: torch.Tensor, mu_anchor_z=None):
        w, Pi, Mu, Sg, _ = self._mixture_params(X, train=False)
        Mu_eff = Mu
        mu_e  = (Pi * Mu_eff).sum(dim=2)
        m2_e  = (Pi * (Sg**2 + Mu_eff**2)).sum(dim=2)
        mu_z  = (w * mu_e).sum(dim=1)
        second= (w * m2_e).sum(dim=1)
        var_z = torch.clamp(second - mu_z**2, min=1e-9)
        return mu_z, var_z

# ---------- Training/Eval (Leak-free, No-Anchor) ----------
def train_one_split_no_anchor(
    X_all: np.ndarray, y_all: np.ndarray,    # 90% train-side (outer split)
    X_te:  np.ndarray, y_te:  np.ndarray,    # 10% test-side (outer split, only for final eval)
    standardize_x: bool = True,
    D: int = 4, K: int = 8, HID: int = 128, NC: int = 3,
    LR: float = 1e-3, EPOCHS: int = 400,
    SIGMA_MIN: float = 5e-2, SIGMA_MAX: float = 1.0,
    TOPK: int = 2, SMOOTH_EPS: float = 0.05,
    seed: int = 1
) -> Tuple[float, float, int, float]:
    """
    Leak-free protocol:
      - X_all/y_all (90%) -> TV + CAL  (CAL only for linear post-cal)
      - TV -> TR/VA       (early stopping on VA NLL)
      - Phase-2: train on TV for best_ep (start from best_state)
      - Evaluate: linear post-cal fitted on CAL, then RMSE/R2 on TEST; NLL on TEST z-score
    Returns: (rmse, test_nll, best_ep, r2)
    """
    # TV/CAL split (CAL only for linear calibration)
    X_tv, X_cal, y_tv, y_cal = train_test_split(X_all, y_all, test_size=0.125, random_state=seed)
    # TR/VA split for early stopping
    X_tr, X_va, y_tr, y_va   = train_test_split(X_tv,  y_tv,  test_size=0.2,   random_state=seed)

    # Feature standardization (fit on TR only)
    if standardize_x:
        mx_tr = X_tr.mean(0, keepdims=True); sx_tr = X_tr.std(0, keepdims=True) + 1e-8
    else:
        mx_tr = np.zeros((1, X_tr.shape[1])); sx_tr = np.ones((1, X_tr.shape[1]))

    # y z-score (Phase-1 on TR stats)
    my_tr, sy_tr = zscore_fit(y_tr)
    y_tr_z = (y_tr - my_tr) / sy_tr
    y_va_z = (y_va - my_tr) / sy_tr

    # Tensors
    Xtr_t, ytr_t = to_tensor(X_tr, y_tr_z, mx_tr, sx_tr)
    Xva_t, yva_t = to_tensor(X_va, y_va_z, mx_tr, sx_tr)

    # Model
    model = BLRMoE(
        d=X_tr.shape[1], D=D, K=K, hid=HID, nc=NC,
        sigma_min=SIGMA_MIN, sigma_max=SIGMA_MAX,
        topk=TOPK, smooth_eps=SMOOTH_EPS
    ).to(DEVICE)
    opt = optim.AdamW(model.parameters(), lr=LR, weight_decay=3e-4)

    # Early stopping on VA-NLL
    best_ep, best_vnll, best_state = 0, +1e9, None
    for ep in range(1, EPOCHS+1):
        model.train()
        opt.zero_grad()
        loss = model.nll(Xtr_t, ytr_t, mu_anchor_z=None, epoch=ep, warmup_epochs=150)
        loss.backward(); nn.utils.clip_grad_norm_(model.parameters(), 2.0); opt.step()
        model.router.tau = max(model.router.tau * 0.995, 1.0)

        model.eval()
        with torch.no_grad():
            vnll = float(model.nll(Xva_t, yva_t, mu_anchor_z=None, epoch=ep, warmup_epochs=150).cpu().item())
        if vnll < best_vnll:
            best_vnll = vnll; best_ep = ep
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    # Phase-2 retrain on TV for best_ep (y z-score now uses TV stats; features still TR stats)
    my_tv, sy_tv = zscore_fit(y_tv)
    X_tv_t, y_tv_t = to_tensor(X_tv, (y_tv - my_tv)/sy_tv, mx_tr, sx_tr)

    model.load_state_dict(best_state)
    model.train()
    opt = optim.AdamW(model.parameters(), lr=LR, weight_decay=3e-4)
    for ep in range(1, best_ep+1):
        opt.zero_grad()
        loss = model.nll(X_tv_t, y_tv_t, mu_anchor_z=None, epoch=ep, warmup_epochs=150)
        loss.backward(); nn.utils.clip_grad_norm_(model.parameters(), 2.0); opt.step()
        model.router.tau = max(model.router.tau * 0.995, 1.0)

    # Evaluate: CAL-only linear calibration; TEST-only metrics
    model.eval()
    with torch.no_grad():
        Xcal_t = to_tensor(X_cal, None, mx_tr, sx_tr)
        Xte_t  = to_tensor(X_te,  None, mx_tr, sx_tr)

        # NLL on TEST (using TV stats)
        y_te_z = (y_te - my_tv)/sy_tv
        yte_t  = torch.tensor(y_te_z.astype(np.float32), device=DEVICE)
        test_nll = float(model.nll(Xte_t, yte_t, mu_anchor_z=None).cpu().item())

        # Means back to original units using TV stats
        mu_z_cal, _ = model.predict_mean_var(Xcal_t, mu_anchor_z=None)
        mu_cal_orig = mu_z_cal.cpu().numpy().astype(np.float64) * sy_tv + my_tv

        mu_z_te,  _ = model.predict_mean_var(Xte_t,  mu_anchor_z=None)
        mu_te_orig  = mu_z_te.cpu().numpy().astype(np.float64)  * sy_tv + my_tv

    # Linear post-calibration on CAL
    A = np.vstack([mu_cal_orig, np.ones_like(mu_cal_orig)]).T
    ab, *_ = np.linalg.lstsq(A, y_cal.astype(np.float64), rcond=None)
    a, b = float(ab[0]), float(ab[1])

    mu_te_cal = a * mu_te_orig + b
    rmse = rmse_score(y_te.astype(np.float64), mu_te_cal)
    r2   = float(r2_score(y_te.astype(np.float64), mu_te_cal))
    return rmse, test_nll, best_ep, r2

def run_no_anchor_experiment(
    X: np.ndarray, y: np.ndarray,
    n_splits: int = 20, outer_train_frac: float = 0.9, seed: int = 1,
    standardize_x: bool = True,
    D: int = 4, K: int = 8, HID: int = 128, NC: int = 3,
    LR: float = 1e-3, EPOCHS: int = 400,
    SIGMA_MIN: float = 5e-2, SIGMA_MAX: float = 1.0,
    TOPK: int = 2, SMOOTH_EPS: float = 0.05
) -> Dict[str, np.ndarray]:
    """
    Convenience wrapper: run multiple outer splits (leak-free) and return arrays of metrics.
    No printing inside; you can print/aggregate in your own cell.
    """
    splits = make_random_splits(len(X), train_frac=outer_train_frac, n_splits=n_splits, seed=seed)
    rmses, nlls, r2s, best_eps = [], [], [], []
    for itr, (tr_idx, te_idx) in enumerate(splits, 1):
        rmse, nll, best_ep, r2 = train_one_split_no_anchor(
            X[tr_idx].copy(), y[tr_idx].copy(),
            X[te_idx].copy(),  y[te_idx].copy(),
            standardize_x=standardize_x,
            D=D, K=K, HID=HID, NC=NC,
            LR=LR, EPOCHS=EPOCHS,
            SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
            TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
            seed=seed+itr
        )
        rmses.append(rmse); nlls.append(nll); r2s.append(r2); best_eps.append(best_ep)
        gc.collect();
        if torch.cuda.is_available(): torch.cuda.empty_cache()

    return {
        "rmse": np.asarray(rmses, dtype=np.float64),
        "nll":  np.asarray(nlls,  dtype=np.float64),
        "r2":   np.asarray(r2s,   dtype=np.float64),
        "best_ep": np.asarray(best_eps, dtype=np.int32),
    }

## Housing

In [19]:
# === Run on Housing (UCI Boston) — MoE w/o Anchor (leak-free) ===
set_seed(1)
X, y = load_dataset("housing", verbose=True)

SEED = 1
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# 20 次 90/10 外层划分（与 NGBoost 协议一致）
n = X.shape[0]
splits = []
for i in range(20):
    perm = np.random.choice(np.arange(n), n, replace=False)
    splits.append((perm[: round(n * 0.9)], perm[round(n * 0.9):]))

# 与原脚本保持相同的超参（除了移除 anchor 相关的）
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS    = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS = 2, 0.05

rmses, nlls = [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    # 训练并评估（无 anchor、严格无泄漏）
    rmse, nll, best_ep, _r2 = train_one_split_no_anchor(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}")
    rmses.append(rmse); nlls.append(nll)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

rmses = np.array(rmses, dtype=np.float64)
nlls  = np.array(nlls,  dtype=np.float64)
se = lambda a: a.std(ddof=1) / np.sqrt(len(a))
print("\n== MoE (No-Anchor, no-leak) on Housing ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")

[housing] X.shape=(506, 13) y.shape=(506,) | y∈[5.000,50.000]
[01/20] MoE best_ep= 21  TestRMSE(orig)=2.7858  TestNLL(z)=0.2485
[02/20] MoE best_ep= 25  TestRMSE(orig)=2.7184  TestNLL(z)=0.4500
[03/20] MoE best_ep= 40  TestRMSE(orig)=5.8159  TestNLL(z)=5.3371
[04/20] MoE best_ep= 21  TestRMSE(orig)=3.9054  TestNLL(z)=0.6567
[05/20] MoE best_ep= 31  TestRMSE(orig)=5.5873  TestNLL(z)=0.8501
[06/20] MoE best_ep= 20  TestRMSE(orig)=3.1320  TestNLL(z)=0.3391
[07/20] MoE best_ep= 21  TestRMSE(orig)=2.3982  TestNLL(z)=0.2805
[08/20] MoE best_ep= 16  TestRMSE(orig)=4.6720  TestNLL(z)=0.3427
[09/20] MoE best_ep= 23  TestRMSE(orig)=3.9425  TestNLL(z)=1.2395
[10/20] MoE best_ep= 19  TestRMSE(orig)=5.7289  TestNLL(z)=0.6667
[11/20] MoE best_ep= 29  TestRMSE(orig)=5.9790  TestNLL(z)=0.9890
[12/20] MoE best_ep= 17  TestRMSE(orig)=5.2189  TestNLL(z)=0.5063
[13/20] MoE best_ep= 19  TestRMSE(orig)=3.0202  TestNLL(z)=0.3520
[14/20] MoE best_ep= 17  TestRMSE(orig)=4.4091  TestNLL(z)=0.9967
[15/20] MoE be

## Concrete

In [22]:
# === Run on Housing (UCI Concrete) — MoE w/o Anchor (leak-free) ===
set_seed(1)
X, y = load_dataset("concrete", verbose=True)

SEED = 1
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

n = X.shape[0]
splits = []
for i in range(20):
    perm = np.random.choice(np.arange(n), n, replace=False)
    splits.append((perm[: round(n * 0.9)], perm[round(n * 0.9):]))

STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS    = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS = 2, 0.05

rmses, nlls = [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):

    rmse, nll, best_ep, _r2 = train_one_split_no_anchor(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}")
    rmses.append(rmse); nlls.append(nll)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

rmses = np.array(rmses, dtype=np.float64)
nlls  = np.array(nlls,  dtype=np.float64)
se = lambda a: a.std(ddof=1) / np.sqrt(len(a))
print("\n== MoE (No-Anchor, no-leak) on Concrete ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")

[concrete] X.shape=(1030, 8) y.shape=(1030,) | y∈[2.332,82.599]
[01/20] MoE best_ep= 31  TestRMSE(orig)=8.6908  TestNLL(z)=0.8044
[02/20] MoE best_ep= 29  TestRMSE(orig)=9.0519  TestNLL(z)=0.8425
[03/20] MoE best_ep= 34  TestRMSE(orig)=6.4428  TestNLL(z)=0.5905
[04/20] MoE best_ep= 34  TestRMSE(orig)=7.3591  TestNLL(z)=0.7132
[05/20] MoE best_ep= 51  TestRMSE(orig)=7.4073  TestNLL(z)=1.1140
[06/20] MoE best_ep= 26  TestRMSE(orig)=8.3383  TestNLL(z)=0.6317
[07/20] MoE best_ep= 27  TestRMSE(orig)=8.6215  TestNLL(z)=0.7152
[08/20] MoE best_ep= 41  TestRMSE(orig)=7.5074  TestNLL(z)=0.8342
[09/20] MoE best_ep= 48  TestRMSE(orig)=7.9033  TestNLL(z)=0.4478
[10/20] MoE best_ep= 42  TestRMSE(orig)=7.2035  TestNLL(z)=0.6737
[11/20] MoE best_ep= 28  TestRMSE(orig)=7.0081  TestNLL(z)=0.6334
[12/20] MoE best_ep= 28  TestRMSE(orig)=7.7609  TestNLL(z)=0.6468
[13/20] MoE best_ep= 57  TestRMSE(orig)=7.3684  TestNLL(z)=0.6108
[14/20] MoE best_ep= 70  TestRMSE(orig)=6.6325  TestNLL(z)=0.9026
[15/20] MoE 

## Energy

In [26]:
# === Run on Housing (UCI Concrete) — MoE w/o Anchor (leak-free) ===
set_seed(1)
X, y = load_dataset("energy", verbose=True)

SEED = 1
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

n = X.shape[0]
splits = []
for i in range(20):
    perm = np.random.choice(np.arange(n), n, replace=False)
    splits.append((perm[: round(n * 0.9)], perm[round(n * 0.9):]))

STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS    = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS = 2, 0.05

rmses, nlls = [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):

    rmse, nll, best_ep, _r2 = train_one_split_no_anchor(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}")
    rmses.append(rmse); nlls.append(nll)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

rmses = np.array(rmses, dtype=np.float64)
nlls  = np.array(nlls,  dtype=np.float64)
se = lambda a: a.std(ddof=1) / np.sqrt(len(a))
print("\n== MoE (No-Anchor, no-leak) on Concrete ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")

[energy] X.shape=(768, 8) y.shape=(768,) | y∈[6.010,43.100]
[01/20] MoE best_ep=352  TestRMSE(orig)=0.6891  TestNLL(z)=-1.6435
[02/20] MoE best_ep=399  TestRMSE(orig)=1.7665  TestNLL(z)=-1.1952
[03/20] MoE best_ep=400  TestRMSE(orig)=1.0104  TestNLL(z)=-1.5112
[04/20] MoE best_ep=291  TestRMSE(orig)=2.0335  TestNLL(z)=-1.2995
[05/20] MoE best_ep=359  TestRMSE(orig)=1.3648  TestNLL(z)=-1.0606
[06/20] MoE best_ep=399  TestRMSE(orig)=0.7653  TestNLL(z)=-1.4509
[07/20] MoE best_ep=397  TestRMSE(orig)=1.4622  TestNLL(z)=-1.2443
[08/20] MoE best_ep=342  TestRMSE(orig)=2.2867  TestNLL(z)=-1.0840
[09/20] MoE best_ep=396  TestRMSE(orig)=0.8420  TestNLL(z)=-1.4574
[10/20] MoE best_ep=355  TestRMSE(orig)=1.0255  TestNLL(z)=-0.7187
[11/20] MoE best_ep=194  TestRMSE(orig)=2.1353  TestNLL(z)=-1.2581
[12/20] MoE best_ep=344  TestRMSE(orig)=2.1433  TestNLL(z)=-1.4744
[13/20] MoE best_ep=396  TestRMSE(orig)=2.2099  TestNLL(z)=-1.2638
[14/20] MoE best_ep=317  TestRMSE(orig)=1.2415  TestNLL(z)=-1.2514
[1

## Kin8nm


In [27]:
# === Run on Housing (UCI Concrete) — MoE w/o Anchor (leak-free) ===
set_seed(1)
X, y = load_dataset("kin8nm", verbose=True)

SEED = 1
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

n = X.shape[0]
splits = []
for i in range(20):
    perm = np.random.choice(np.arange(n), n, replace=False)
    splits.append((perm[: round(n * 0.9)], perm[round(n * 0.9):]))

STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS    = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS = 2, 0.05

rmses, nlls = [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):

    rmse, nll, best_ep, _r2 = train_one_split_no_anchor(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}")
    rmses.append(rmse); nlls.append(nll)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

rmses = np.array(rmses, dtype=np.float64)
nlls  = np.array(nlls,  dtype=np.float64)
se = lambda a: a.std(ddof=1) / np.sqrt(len(a))
print("\n== MoE (No-Anchor, no-leak) on Concrete ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")


[kin8nm] X.shape=(8785, 8) y.shape=(8785,) | y∈[0.040,1.459]
[01/20] MoE best_ep= 49  TestRMSE(orig)=0.1083  TestNLL(z)=0.7036
[02/20] MoE best_ep= 44  TestRMSE(orig)=0.1113  TestNLL(z)=0.6597
[03/20] MoE best_ep= 62  TestRMSE(orig)=0.0971  TestNLL(z)=0.6340
[04/20] MoE best_ep= 55  TestRMSE(orig)=0.1097  TestNLL(z)=0.7729
[05/20] MoE best_ep= 58  TestRMSE(orig)=0.0982  TestNLL(z)=0.6317
[06/20] MoE best_ep= 49  TestRMSE(orig)=0.1095  TestNLL(z)=0.6828
[07/20] MoE best_ep= 41  TestRMSE(orig)=0.1095  TestNLL(z)=0.4907
[08/20] MoE best_ep= 57  TestRMSE(orig)=0.1102  TestNLL(z)=0.7145
[09/20] MoE best_ep= 60  TestRMSE(orig)=0.1059  TestNLL(z)=0.8208
[10/20] MoE best_ep= 53  TestRMSE(orig)=0.1089  TestNLL(z)=0.6962
[11/20] MoE best_ep= 55  TestRMSE(orig)=0.1049  TestNLL(z)=0.7532
[12/20] MoE best_ep= 56  TestRMSE(orig)=0.0999  TestNLL(z)=0.6729
[13/20] MoE best_ep= 39  TestRMSE(orig)=0.1224  TestNLL(z)=0.6240
[14/20] MoE best_ep= 55  TestRMSE(orig)=0.1023  TestNLL(z)=0.7362
[15/20] MoE bes

## Naval

In [28]:
# === Run on Housing (UCI Concrete) — MoE w/o Anchor (leak-free) ===
set_seed(1)
X, y = load_dataset("naval", verbose=True)

SEED = 1
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

n = X.shape[0]
splits = []
for i in range(20):
    perm = np.random.choice(np.arange(n), n, replace=False)
    splits.append((perm[: round(n * 0.9)], perm[round(n * 0.9):]))

STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS    = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS = 2, 0.05

rmses, nlls = [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):

    rmse, nll, best_ep, _r2 = train_one_split_no_anchor(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}")
    rmses.append(rmse); nlls.append(nll)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

rmses = np.array(rmses, dtype=np.float64)
nlls  = np.array(nlls,  dtype=np.float64)
se = lambda a: a.std(ddof=1) / np.sqrt(len(a))
print("\n== MoE (No-Anchor, no-leak) on Concrete ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")

[naval] X.shape=(11934, 16) y.shape=(11934,) | y∈[0.950,1.000]
[01/20] MoE best_ep=398  TestRMSE(orig)=0.0028  TestNLL(z)=-0.6314
[02/20] MoE best_ep=398  TestRMSE(orig)=0.0020  TestNLL(z)=-0.8896
[03/20] MoE best_ep=400  TestRMSE(orig)=0.0019  TestNLL(z)=-0.5113
[04/20] MoE best_ep=399  TestRMSE(orig)=0.0023  TestNLL(z)=-0.8911
[05/20] MoE best_ep=400  TestRMSE(orig)=0.0022  TestNLL(z)=-0.9101
[06/20] MoE best_ep=400  TestRMSE(orig)=0.0026  TestNLL(z)=-0.6032
[07/20] MoE best_ep=389  TestRMSE(orig)=0.0027  TestNLL(z)=-0.5455
[08/20] MoE best_ep=398  TestRMSE(orig)=0.0021  TestNLL(z)=-0.6815
[09/20] MoE best_ep=398  TestRMSE(orig)=0.0038  TestNLL(z)=-0.2339
[10/20] MoE best_ep=399  TestRMSE(orig)=0.0018  TestNLL(z)=-0.8764
[11/20] MoE best_ep=394  TestRMSE(orig)=0.0022  TestNLL(z)=-0.5710
[12/20] MoE best_ep=396  TestRMSE(orig)=0.0023  TestNLL(z)=-0.6328
[13/20] MoE best_ep=356  TestRMSE(orig)=0.0025  TestNLL(z)=-0.3788
[14/20] MoE best_ep=400  TestRMSE(orig)=0.0020  TestNLL(z)=-0.8213

## Power

In [30]:
# === Run on Housing (UCI Concrete) — MoE w/o Anchor (leak-free) ===
set_seed(1)
X, y = load_dataset("power", verbose=True)

SEED = 1
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

n = X.shape[0]
splits = []
for i in range(20):
    perm = np.random.choice(np.arange(n), n, replace=False)
    splits.append((perm[: round(n * 0.9)], perm[round(n * 0.9):]))

STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS    = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS = 2, 0.05

rmses, nlls = [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):

    rmse, nll, best_ep, _r2 = train_one_split_no_anchor(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}")
    rmses.append(rmse); nlls.append(nll)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

rmses = np.array(rmses, dtype=np.float64)
nlls  = np.array(nlls,  dtype=np.float64)
se = lambda a: a.std(ddof=1) / np.sqrt(len(a))
print("\n== MoE (No-Anchor, no-leak) on Concrete ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")


[power] X.shape=(9568, 4) y.shape=(9568,) | y∈[420.260,495.760]
[01/20] MoE best_ep=160  TestRMSE(orig)=4.0409  TestNLL(z)=-0.0505
[02/20] MoE best_ep=182  TestRMSE(orig)=4.0588  TestNLL(z)=-0.0333
[03/20] MoE best_ep=105  TestRMSE(orig)=4.0045  TestNLL(z)=-0.0843
[04/20] MoE best_ep= 94  TestRMSE(orig)=3.9047  TestNLL(z)=-0.1493
[05/20] MoE best_ep= 97  TestRMSE(orig)=4.3740  TestNLL(z)=-0.0607
[06/20] MoE best_ep=268  TestRMSE(orig)=4.2723  TestNLL(z)=0.3420
[07/20] MoE best_ep= 72  TestRMSE(orig)=4.0344  TestNLL(z)=-0.0915
[08/20] MoE best_ep=113  TestRMSE(orig)=3.9922  TestNLL(z)=-0.0535
[09/20] MoE best_ep=151  TestRMSE(orig)=4.1195  TestNLL(z)=0.0094
[10/20] MoE best_ep=187  TestRMSE(orig)=4.0239  TestNLL(z)=-0.1253
[11/20] MoE best_ep=138  TestRMSE(orig)=3.8732  TestNLL(z)=-0.0933
[12/20] MoE best_ep=148  TestRMSE(orig)=4.0663  TestNLL(z)=-0.1012
[13/20] MoE best_ep=119  TestRMSE(orig)=3.9346  TestNLL(z)=-0.0924
[14/20] MoE best_ep=202  TestRMSE(orig)=3.6629  TestNLL(z)=-0.1210


## Protein

In [31]:
# === Run on Protein (CASP) — MoE w/o Anchor (leak-free, downsample to 10k) ===
import gc, numpy as np, torch

set_seed(1)
# 若你之前的 load_dataset 支持 verbose 参数，就保留；否则去掉 verbose=True
X, y = load_dataset("protein", verbose=True)

SEED = 1
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# ---- 下采样到 10k（仅使用随机索引，不看 y，避免任何泄漏）----
MAX_N = 10_000
orig_n = X.shape[0]
if orig_n > MAX_N:
    rng_ds = np.random.RandomState(SEED)
    keep = rng_ds.choice(orig_n, size=MAX_N, replace=False)
    X = X[keep].copy()
    y = y[keep].copy()
    print(f"[Protein] Downsampled from {orig_n} to {X.shape[0]} samples.")
else:
    print(f"[Protein] Size {orig_n} <= {MAX_N}, no downsampling.")

# ---- 20 次 90/10 外层划分（与既有协议一致）----
n = X.shape[0]
splits = []
rng = np.random.RandomState(SEED)
for i in range(20):
    perm = rng.permutation(n)
    splits.append((perm[: round(n * 0.9)], perm[round(n * 0.9):]))

# ---- 超参（与前面无 Anchor 版本一致）----
STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS    = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS = 2, 0.05

rmses, nlls = [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):
    rmse, nll, best_ep, _ = train_one_split_no_anchor(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}")
    rmses.append(rmse); nlls.append(nll)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

rmses = np.array(rmses, dtype=np.float64)
nlls  = np.array(nlls,  dtype=np.float64)
se = lambda a: a.std(ddof=1) / np.sqrt(len(a))
print("\n== MoE (No-Anchor, no-leak) on Protein (10k downsample) ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")

[protein] X.shape=(45730, 9) y.shape=(45730,) | y∈[0.000,20.999]
[Protein] Downsampled from 45730 to 10000 samples.
[01/20] MoE best_ep=199  TestRMSE(orig)=4.5878  TestNLL(z)=0.6299
[02/20] MoE best_ep=166  TestRMSE(orig)=4.8964  TestNLL(z)=0.7341
[03/20] MoE best_ep=163  TestRMSE(orig)=4.7943  TestNLL(z)=0.6521
[04/20] MoE best_ep=136  TestRMSE(orig)=4.7698  TestNLL(z)=0.5796
[05/20] MoE best_ep=156  TestRMSE(orig)=4.6493  TestNLL(z)=0.5348
[06/20] MoE best_ep=154  TestRMSE(orig)=4.6814  TestNLL(z)=0.7044
[07/20] MoE best_ep=121  TestRMSE(orig)=4.8829  TestNLL(z)=0.7170
[08/20] MoE best_ep=153  TestRMSE(orig)=4.7358  TestNLL(z)=0.6628
[09/20] MoE best_ep=127  TestRMSE(orig)=4.5731  TestNLL(z)=0.7679
[10/20] MoE best_ep=265  TestRMSE(orig)=4.4756  TestNLL(z)=0.6018
[11/20] MoE best_ep=215  TestRMSE(orig)=4.4832  TestNLL(z)=0.5566
[12/20] MoE best_ep=157  TestRMSE(orig)=4.7060  TestNLL(z)=0.6141
[13/20] MoE best_ep=193  TestRMSE(orig)=4.6114  TestNLL(z)=0.6180
[14/20] MoE best_ep=229  T


## Wine


In [33]:
# === Run on Housing (UCI Concrete) — MoE w/o Anchor (leak-free) ===
set_seed(1)
X, y = load_dataset("wine", verbose=True)

SEED = 1
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

n = X.shape[0]
splits = []
for i in range(20):
    perm = np.random.choice(np.arange(n), n, replace=False)
    splits.append((perm[: round(n * 0.9)], perm[round(n * 0.9):]))

STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS    = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS = 2, 0.05

rmses, nlls = [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):

    rmse, nll, best_ep, _r2 = train_one_split_no_anchor(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}")
    rmses.append(rmse); nlls.append(nll)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

rmses = np.array(rmses, dtype=np.float64)
nlls  = np.array(nlls,  dtype=np.float64)
se = lambda a: a.std(ddof=1) / np.sqrt(len(a))
print("\n== MoE (No-Anchor, no-leak) on Concrete ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")


[wine] X.shape=(1599, 11) y.shape=(1599,) | y∈[3.000,8.000]
[01/20] MoE best_ep= 18  TestRMSE(orig)=0.6403  TestNLL(z)=0.9994
[02/20] MoE best_ep= 18  TestRMSE(orig)=0.6666  TestNLL(z)=1.1749
[03/20] MoE best_ep=101  TestRMSE(orig)=0.7408  TestNLL(z)=0.6099
[04/20] MoE best_ep= 72  TestRMSE(orig)=0.6307  TestNLL(z)=0.4046
[05/20] MoE best_ep= 44  TestRMSE(orig)=0.5915  TestNLL(z)=0.4972
[06/20] MoE best_ep= 17  TestRMSE(orig)=0.6495  TestNLL(z)=1.0928
[07/20] MoE best_ep= 30  TestRMSE(orig)=0.6773  TestNLL(z)=1.0013
[08/20] MoE best_ep=201  TestRMSE(orig)=0.7201  TestNLL(z)=6.6193
[09/20] MoE best_ep= 84  TestRMSE(orig)=0.6552  TestNLL(z)=0.1123
[10/20] MoE best_ep= 41  TestRMSE(orig)=0.6587  TestNLL(z)=1.1153
[11/20] MoE best_ep=110  TestRMSE(orig)=0.6476  TestNLL(z)=6.8915
[12/20] MoE best_ep= 29  TestRMSE(orig)=0.6484  TestNLL(z)=0.7153
[13/20] MoE best_ep= 20  TestRMSE(orig)=0.6568  TestNLL(z)=1.0497
[14/20] MoE best_ep=205  TestRMSE(orig)=0.6829  TestNLL(z)=3.5953
[15/20] MoE best

## Yacht

In [35]:
# === Run on Housing (UCI Concrete) — MoE w/o Anchor (leak-free) ===
set_seed(1)
X, y = load_dataset("yacht", verbose=True)

SEED = 1
np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

n = X.shape[0]
splits = []
for i in range(20):
    perm = np.random.choice(np.arange(n), n, replace=False)
    splits.append((perm[: round(n * 0.9)], perm[round(n * 0.9):]))

STANDARDIZE_X = True
D, K, HID, NC = 2, 8, 128, 3
LR, EPOCHS    = 1e-3, 400
SIGMA_MIN, SIGMA_MAX = 5e-2, 1.0
TOPK, SMOOTH_EPS = 2, 0.05

rmses, nlls = [], []
for itr, (tr_idx, te_idx) in enumerate(splits, 1):

    rmse, nll, best_ep, _r2 = train_one_split_no_anchor(
        X[tr_idx].copy(), y[tr_idx].copy(),
        X[te_idx].copy(),  y[te_idx].copy(),
        standardize_x=STANDARDIZE_X,
        D=D, K=K, HID=HID, NC=NC,
        LR=LR, EPOCHS=EPOCHS,
        SIGMA_MIN=SIGMA_MIN, SIGMA_MAX=SIGMA_MAX,
        TOPK=TOPK, SMOOTH_EPS=SMOOTH_EPS,
        seed=SEED + itr
    )
    print(f"[{itr:02d}/20] MoE best_ep={best_ep:3d}  "
          f"TestRMSE(orig)={rmse:.4f}  TestNLL(z)={nll:.4f}")
    rmses.append(rmse); nlls.append(nll)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

rmses = np.array(rmses, dtype=np.float64)
nlls  = np.array(nlls,  dtype=np.float64)
se = lambda a: a.std(ddof=1) / np.sqrt(len(a))
print("\n== MoE (No-Anchor, no-leak) ==")
print(f"RMSE (orig) = {rmses.mean():.4f} ± {se(rmses):.4f}")
print(f"NLL  (z)    = {nlls.mean():.4f} ± {se(nlls):.4f}")


[yacht] X.shape=(308, 6) y.shape=(308,) | y∈[0.010,62.420]
[01/20] MoE best_ep= 49  TestRMSE(orig)=2.8491  TestNLL(z)=-0.6580
[02/20] MoE best_ep= 64  TestRMSE(orig)=6.3435  TestNLL(z)=-0.8389
[03/20] MoE best_ep= 61  TestRMSE(orig)=3.0756  TestNLL(z)=-0.3659
[04/20] MoE best_ep=380  TestRMSE(orig)=3.8330  TestNLL(z)=-0.4752
[05/20] MoE best_ep=183  TestRMSE(orig)=4.8626  TestNLL(z)=-0.7710
[06/20] MoE best_ep= 55  TestRMSE(orig)=4.0677  TestNLL(z)=-0.4541
[07/20] MoE best_ep=371  TestRMSE(orig)=1.2637  TestNLL(z)=-1.2975
[08/20] MoE best_ep=326  TestRMSE(orig)=3.0515  TestNLL(z)=-0.9351
[09/20] MoE best_ep= 49  TestRMSE(orig)=4.8756  TestNLL(z)=-0.4014
[10/20] MoE best_ep= 48  TestRMSE(orig)=2.7649  TestNLL(z)=-0.5411
[11/20] MoE best_ep=138  TestRMSE(orig)=4.0778  TestNLL(z)=0.2368
[12/20] MoE best_ep=381  TestRMSE(orig)=3.0999  TestNLL(z)=0.2827
[13/20] MoE best_ep=180  TestRMSE(orig)=5.2409  TestNLL(z)=2.3303
[14/20] MoE best_ep=156  TestRMSE(orig)=4.7693  TestNLL(z)=-0.8719
[15/20