#A) Imports & Config

In [12]:
# ==========================================
# A) IMPORTS & SAFE CONFIG
# ==========================================
import os, math, json, random, warnings
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score, f1_score,
    average_precision_score
)

warnings.filterwarnings("ignore")

# --- device & seeding (robust to stale CUDA states) ---
SEED = 42
random.seed(SEED); np.random.seed(SEED)
torch.manual_seed(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if DEVICE.type == "cuda":
    try:
        torch.cuda.manual_seed_all(SEED)
    except RuntimeError as e:
        print("CUDA seed failed; using CPU. Error:", repr(e))
        DEVICE = torch.device("cpu")
print("Device:", DEVICE)

# --- paths ---
HUMAN_PATH = "data/train/train_human.npy"
AI_PATH    = "data/train/train_ai.npy"
ARTIFACTS  = Path("artifacts"); ARTIFACTS.mkdir(parents=True, exist_ok=True)

# --- preprocessing ---
MASK_TOPK_VAR_DIMS = 64      # zero-out top-K high-variance channels
SEGMENTS           = 3       # early/mid/late splits

# --- model capacity (UltraHybrid-Balanced++) ---
D_IN     = 768
D_MODEL  = 224
N_HEADS  = 4
N_LAYERS = 2
D_GRU    = 160                 # per-direction
CNN_KS   = (3, 5, 7)
CNN_OUT  = 96                  # per-kernel

# --- regularization ---
DROPOUT      = 0.35
GAUSS_NOISE  = 0.05            # std for Gaussian noise on tokens
CHANNEL_DROP = 0.10            # SpatialDropout1D-style channel dropout

# --- training ---
BATCH_SIZE   = 128
EPOCHS       = 30
LR           = 2e-4
WEIGHT_DECAY = 1e-4
WARMUP_FRAC  = 0.10
ES_PATIENCE  = 6

# --- mixup (both tokens & features) ---
USE_MIXUP    = True
MIXUP_P      = 0.35
MIXUP_ALPHA  = 0.4             # Beta(alpha, alpha)


Device: cuda


In [13]:
# ==========================================
# B) DATA UTILS & ENGINEERED FEATURES
# ==========================================
from sklearn.metrics.pairwise import cosine_similarity

def variance_mask(X: np.ndarray, topk: int) -> np.ndarray:
    """Zero-out top-K highest-variance channels across tokens+samples."""
    if topk is None or topk <= 0:
        return X
    flat = X.reshape(-1, X.shape[-1])            # (N*T, D)
    var  = flat.var(axis=0)                      # (D,)
    idx  = np.argsort(var)[::-1][:topk]
    Xc   = X.copy()
    Xc[..., idx] = 0.0
    return Xc

def segment_norms(X: np.ndarray, segments: int = 3) -> np.ndarray:
    """Mean L2 norm per segment (early/mid/late) — (N, segments)."""
    N, T, D = X.shape
    split = T // segments
    chunks = []
    for s in range(segments):
        a = s*split
        b = (s+1)*split if s < segments-1 else T
        seg = X[:, a:b, :]
        chunks.append(np.linalg.norm(seg, axis=2).mean(axis=1))
    return np.stack(chunks, axis=1).astype(np.float32)

def token_variance(X: np.ndarray) -> np.ndarray:
    """Average variance across tokens — (N,1)."""
    return X.var(axis=1).mean(axis=1, keepdims=True).astype(np.float32)

def sharpness_cosine(X: np.ndarray) -> np.ndarray:
    """Std of cosine distance between each token and the sample mean — (N,1)."""
    from scipy.spatial.distance import cosine as cosdist
    vals = []
    for sample in X:
        mu = sample.mean(axis=0)
        if np.linalg.norm(mu) < 1e-8:
            vals.append(0.0); continue
        d = [cosdist(tok, mu) for tok in sample]
        vals.appe


In [14]:
# ==========================================
# C) SPLIT, CENTROIDS & SCALING (TRAIN-ONLY FIT)
# ==========================================
# helper: class centroids from mean pooled tokens
def compute_centroids(X: np.ndarray, y: np.ndarray) -> dict:
    """Compute per-class centroids over mean pooled token embeddings."""
    Xm = X.mean(axis=1)  # mean over 100 tokens
    return {
        "human": Xm[y == 0].mean(axis=0),
        "ai": Xm[y == 1].mean(axis=0)
    }

# helper: engineered features
def build_features(X: np.ndarray, cents: dict, segments: int = 4) -> np.ndarray:
    """
    Build engineered features from token sequences.
    - segment means
    - cosine sim to centroids
    - variance and norms
    """
    N, T, D = X.shape
    feats = []

    # mean over all tokens
    Xm = X.mean(axis=1)
    feats.append(Xm)

    # cosine sim to class centroids
    from sklearn.metrics.pairwise import cosine_similarity
    for c in ["human", "ai"]:
        sim = cosine_similarity(Xm, cents[c][None, :])[:, 0]
        feats.append(sim[:, None])

    # split sequence into segments
    seg_size = T // segments
    for s in range(segments):
        seg = X[:, s * seg_size:(s + 1) * seg_size, :].mean(axis=1)
        feats.append(seg)

    # variance + L2 norms
    feats.append(X.var(axis=(1, 2))[:, None])
    feats.append(np.linalg.norm(Xm, axis=1)[:, None])

    return np.concatenate(feats, axis=1).astype(np.float32)


# load arrays + labels
X_h = np.load(HUMAN_PATH).astype(np.float32)    # (Nh,100,768)
X_a = np.load(AI_PATH).astype(np.float32)       # (Na,100,768)
y_h = np.zeros(len(X_h), dtype=np.int64)
y_a = np.ones(len(X_a),  dtype=np.int64)

X_all = np.concatenate([X_h, X_a], axis=0)
y_all = np.concatenate([y_h, y_a], axis=0)

# mask once for all
X_all_m = variance_mask(X_all, MASK_TOPK_VAR_DIMS)

# stratified split
Xt_tr, Xt_va, y_tr, y_va = train_test_split(
    X_all_m, y_all, test_size=0.15, random_state=SEED, stratify=y_all
)

# centroids on TRAIN only
cents_tr = compute_centroids(Xt_tr, y_tr)

# engineered features
Xf_tr = build_features(Xt_tr, cents_tr, segments=SEGMENTS)
Xf_va = build_features(Xt_va, cents_tr, segments=SEGMENTS)

# scale features on TRAIN only
scaler = StandardScaler()
Xf_tr_s = scaler.fit_transform(Xf_tr).astype(np.float32)
Xf_va_s = scaler.transform(Xf_va).astype(np.float32)

# label sanity (avoid CUDA asserts)
assert set(np.unique(y_tr)).issubset({0,1})
assert set(np.unique(y_va)).issubset({0,1})

print("Tokens:", Xt_tr.shape, Xt_va.shape, "| Feats:", Xf_tr_s.shape, Xf_va_s.shape)


Tokens: (13873, 100, 768) (2449, 100, 768) | Feats: (13873, 3076) (2449, 3076)


#D) UltraHybrid-Lite++ model (Transformer + CNN + BiGRU), with robust regularization

In [15]:
# ==========================================
# D) MODEL: UltraHybrid-Balanced++ (v2)
# ==========================================
# Drop-in replacement for the class you posted. Keeps interface:
#   model = UltraHybridBalancedPPv2(feat_dim)
#   logits = model(xtokens, xfeats)  # (B,)
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---------- small building blocks ----------
class GaussianNoise(nn.Module):
    def __init__(self, std=0.0):
        super().__init__(); self.std = float(std)
    def forward(self, x):
        if not self.training or self.std <= 0: return x
        return x + torch.randn_like(x) * self.std

class ChannelDropout1D(nn.Module):
    """Drop entire embedding channels (SpatialDropout1D analogue)."""
    def __init__(self, p: float):
        super().__init__(); self.p = float(p)
    def forward(self, x):  # x: (B,T,D)
        if not self.training or self.p <= 0: return x
        B,T,D = x.shape
        mask = (torch.rand(B,1,D, device=x.device) > self.p).float()
        return x * mask

class DropPath(nn.Module):
    """Stochastic depth; drops residual branch per-sample."""
    def __init__(self, p: float = 0.0):
        super().__init__(); self.p = float(p)
    def forward(self, x):
        if not self.training or self.p <= 0: return x
        keep = 1 - self.p
        shape = (x.size(0),) + (1,) * (x.ndim - 1)
        mask = x.new_empty(shape).bernoulli_(keep).div(keep)
        return x * mask

class AttentionPooling(nn.Module):
    """
    Single learnable query attends over tokens, returns pooled vector.
    Much more robust than mean/max pooling under distribution shift.
    """
    def __init__(self, d_model: int, n_heads: int = 4, dropout: float = 0.0):
        super().__init__()
        self.q = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)
        self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.ln = nn.LayerNorm(d_model)
    def forward(self, x):          # x: (B,T,D)
        B = x.size(0)
        q = self.q.expand(B, -1, -1)         # (B,1,D)
        out, _ = self.attn(q, x, x, need_weights=False)  # (B,1,D)
        return self.ln(out.squeeze(1))       # (B,D)

class SEBlock(nn.Module):
    """Squeeze-Excitation for channel reweighting."""
    def __init__(self, c: int, r: int = 8):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(c, max(1, c//r)), nn.SiLU(),
            nn.Linear(max(1, c//r), c), nn.Sigmoid()
        )
    def forward(self, x):           # x: (B,C)
        return x * self.fc(x)

class DepthwiseSeparableConv1d(nn.Module):
    """
    Depthwise + Pointwise conv with GLU gating.
    In: (B, D_model, T)  Out: (B, C_out)
    """
    def __init__(self, d_in: int, c_out: int, k: int, dropout: float):
        super().__init__()
        padding = k // 2
        self.dw = nn.Conv1d(d_in, d_in, kernel_size=k, padding=padding, groups=d_in, bias=False)
        self.pw = nn.Conv1d(d_in, 2*c_out, kernel_size=1, bias=True)   # 2*c_out for GLU
        self.bn = nn.BatchNorm1d(2*c_out)
        self.drop = nn.Dropout(dropout)
        self.se = SEBlock(c_out)
    def forward(self, x):           # x: (B,D,T)
        h = self.dw(x)
        h = self.pw(h)
        h = self.bn(h)
        a, b = torch.chunk(h, 2, dim=1)      # GLU
        h = a * torch.sigmoid(b)             # (B,C_out,T)
        h = F.adaptive_max_pool1d(h, 1).squeeze(-1)  # (B,C_out)
        h = self.drop(h)
        return self.se(h)

class FeatureGate(nn.Module):
    """
    Learns how much to trust engineered features.
    Applies LN -> small MLP -> sigmoid gate, then scales features.
    """
    def __init__(self, fdim: int):
        super().__init__()
        self.ln = nn.LayerNorm(fdim)
        self.gate = nn.Sequential(
            nn.Linear(fdim, max(8, fdim//2)), nn.GELU(),
            nn.Linear(max(8, fdim//2), fdim), nn.Sigmoid()
        )
    def forward(self, f):           # (B,F)
        g = self.gate(self.ln(f))
        return f * g

# ---------- main model ----------
class UltraHybridBalancedPPv2(nn.Module):
    """
    Upgrades over your UltraHybridBalancedPP:
      • Transformer branch w/ AttentionPooling (learned query)
      • CNN branch: Depthwise-Separable + GLU + SE (per kernel)
      • BiGRU branch w/ AttentionPooling
      • FeatureGate on engineered features
      • DropPath on residual-like fusions
      • Multi-sample dropout in the head for smoother logits
    Returns logits (use BCEWithLogitsLoss).
    """
    def __init__(self, feat_dim: int,
                 d_in: int = 768, d_model: int = 224,
                 n_heads: int = 4, n_layers: int = 2,
                 cnn_kernels=(3,5,7), cnn_out: int = 96,
                 gru_h: int = 160, dropout: float = 0.35,
                 channel_drop: float = 0.10, gauss_noise: float = 0.05,
                 drop_path: float = 0.05, ms_dropout_samples: int = 4):
        super().__init__()
        self.ms_dropout_samples = ms_dropout_samples

        # input regularizers
        self.noise = GaussianNoise(gauss_noise)
        self.cdrop = ChannelDropout1D(channel_drop)

        # shared projection
        self.proj  = nn.Linear(d_in, d_model)

        # ---- Transformer branch ----
        enc = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads, dropout=dropout,
            activation="gelu", batch_first=True, norm_first=True
        )
        self.tr_encoder = nn.TransformerEncoder(enc, num_layers=n_layers)
        self.tr_pool    = AttentionPooling(d_model, n_heads=n_heads, dropout=dropout)

        # ---- CNN branch (multi-kernel DS-Conv + GLU + SE) ----
        self.cnn_blocks = nn.ModuleList([
            DepthwiseSeparableConv1d(d_model, cnn_out, k, dropout) for k in cnn_kernels
        ])
        self.cnn_ln  = nn.LayerNorm(len(cnn_kernels)*cnn_out)

        # ---- BiGRU branch ----
        self.gru  = nn.GRU(d_model, gru_h, num_layers=1, batch_first=True, bidirectional=True)
        self.gru_pool = AttentionPooling(2*gru_h, n_heads=min(4, n_heads), dropout=dropout)

        # ---- Engineered features gate ----
        self.fgate = FeatureGate(feat_dim)

        # ---- Fusion & head ----
        fusion_dim = d_model + (len(cnn_kernels)*cnn_out) + (2*gru_h) + feat_dim
        self.drop_path = DropPath(drop_path)
        self.pre_head  = nn.Sequential(
            nn.LayerNorm(fusion_dim),
            nn.Linear(fusion_dim, 256), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(256, 64), nn.GELU()
        )
        # multi-sample dropout heads averaged at forward
        self.head_dropout = nn.Dropout(dropout)
        self.head_last    = nn.Linear(64, 1)  # logits

    def forward(self, xtokens: torch.Tensor, xfeats: torch.Tensor):
        # xtokens: (B,T,768), xfeats: (B,F)
        # input reg
        x = self.noise(xtokens)
        x = self.cdrop(x)
        x = self.proj(x)                               # (B,T,D)

        # transformer branch
        t = self.tr_encoder(x)                         # (B,T,D)
        t = self.tr_pool(t)                            # (B,D)

        # cnn branch
        xc = x.transpose(1,2)                          # (B,D,T)
        c_parts = [blk(xc) for blk in self.cnn_blocks] # list of (B,C_out)
        c = torch.cat(c_parts, dim=1)                  # (B,sumC)
        c = self.cnn_ln(c)

        # bigru branch
        g, _ = self.gru(x)                             # (B,T,2*H)
        g = self.gru_pool(g)                           # (B,2*H)

        # gated engineered features
        f = self.fgate(xfeats)                         # (B,F)

        # fuse + light residual via DropPath
        z = torch.cat([t, c, g, f], dim=1)
        z = z + self.drop_path(z)                      # stochastic depth-like perturbation
        z = self.pre_head(z)                           # (B,64)

        # multi-sample dropout (averaged logits for smoother training)
        if self.training and self.ms_dropout_samples > 1:
            logits = 0.0
            for _ in range(self.ms_dropout_samples):
                logits = logits + self.head_last(self.head_dropout(z))
            logits = logits / float(self.ms_dropout_samples)
            return logits.squeeze(1)
        else:
            return self.head_last(self.head_dropout(z)).squeeze(1)


#E) training utilities & loop (with BCEWithLogitsLoss)

In [16]:
# ==========================================
# E) TRAINING UTILITIES & LOOP
# ==========================================
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score
)

# ---- dataset ----
class LangDataset(Dataset):
    def __init__(self, Xt, Xf, y):
        self.Xt = torch.from_numpy(Xt).float()
        self.Xf = torch.from_numpy(Xf).float()
        self.y  = torch.from_numpy(y.astype(np.float32))
    def __len__(self): return len(self.y)
    def __getitem__(self, i): return self.Xt[i], self.Xf[i], self.y[i]

# ---- mixup over tokens+features+labels ----
def mixup_batch(xt, xf, y, alpha: float):
    if alpha <= 0:
        return xt, xf, None
    lam = np.random.beta(alpha, alpha)
    idx = torch.randperm(xt.size(0), device=xt.device)
    xtm = lam*xt + (1-lam)*xt[idx]
    xfm = lam*xf + (1-lam)*xf[idx]
    return xtm, xfm, (y, y[idx], lam)

# ---- metrics from logits ----
def metrics_from_logits(y_true_np, logits_np, thr=0.5):
    p = 1.0 / (1.0 + np.exp(-logits_np))
    yhat = (p >= thr).astype(int)
    return dict(
        auc = float(roc_auc_score(y_true_np, p)),
        ap  = float(average_precision_score(y_true_np, p)),
        acc = float(accuracy_score(y_true_np, yhat)),
        prec= float(precision_score(y_true_np, yhat, zero_division=0)),
        rec = float(recall_score(y_true_np, yhat)),
        f1  = float(f1_score(y_true_np, yhat)),
    )

@torch.no_grad()
def evaluate(model, loader, device):
    model.eval()
    logits_all, y_all = [], []
    for xt, xf, yy in loader:
        xt, xf = xt.to(device), xf.to(device)
        logits = model(xt, xf).cpu().numpy()
        logits_all.append(logits); y_all.append(yy.numpy())
    logits_all = np.concatenate(logits_all)
    y_all      = np.concatenate(y_all)
    return metrics_from_logits(y_all, logits_all, thr=0.5), logits_all, y_all

def train_model(Xt_tr, Xf_tr, y_tr, Xt_va, Xf_va, y_va,
                batch_size=128, epochs=30, lr=2e-4,
                weight_decay=1e-4, warmup_frac=0.10,
                use_mixup=True, mixup_p=0.35, mixup_alpha=0.4,
                es_patience=6, device=None):
    """
    Train UltraHybridBalancedPPv2 with BCEWithLogitsLoss,
    OneCycleLR warmup-like schedule, early stopping on val AUC,
    and (optionally) mixup.
    """
    assert device is not None, "Pass DEVICE"
    tr_ds = LangDataset(Xt_tr, Xf_tr, y_tr)
    va_ds = LangDataset(Xt_va, Xf_va, y_va)
    tr_ld = DataLoader(tr_ds, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=0)
    va_ld = DataLoader(va_ds, batch_size=batch_size, shuffle=False, num_workers=0)

    feat_dim = Xf_tr.shape[1]
    model = UltraHybridBalancedPPv2(
        feat_dim=feat_dim,
        d_in=768, d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS,
        cnn_kernels=CNN_KS, cnn_out=CNN_OUT, gru_h=D_GRU,
        dropout=DROPOUT, channel_drop=CHANNEL_DROP,
        gauss_noise=GAUSS_NOISE, drop_path=0.05, ms_dropout_samples=4
    ).to(device)

    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.OneCycleLR(
        opt, max_lr=lr, steps_per_epoch=max(1,len(tr_ld)), epochs=epochs, pct_start=warmup_frac
    )
    criterion = nn.BCEWithLogitsLoss()

    best_auc, best_state, patience = -1.0, None, es_patience

    for ep in range(1, epochs+1):
        model.train()
        for xt, xf, yy in tr_ld:
            xt, xf, yy = xt.to(device), xf.to(device), yy.to(device)

            if use_mixup and np.random.rand() < mixup_p:
                xtm, xfm, mix = mixup_batch(xt, xf, yy, alpha=mixup_alpha)
                if mix is None:
                    logits = model(xt, xf); loss = criterion(logits, yy)
                else:
                    y_a, y_b, lam = mix
                    logits = model(xtm, xfm)
                    loss = lam*criterion(logits, y_a) + (1-lam)*criterion(logits, y_b)
            else:
                logits = model(xt, xf)
                loss = criterion(logits, yy)

            opt.zero_grad(set_to_none=True)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
            opt.step(); sched.step()

        # ---- end epoch: evaluate ----
        tr_metrics, _, _ = evaluate(model, tr_ld, device)
        va_metrics, _, _ = evaluate(model, va_ld, device)
        print(f"Epoch {ep:02d} | "
              f"TR AUC {tr_metrics['auc']:.4f} | TR ACC {tr_metrics['acc']:.4f} | TR F1 {tr_metrics['f1']:.4f} || "
              f"VA AUC {va_metrics['auc']:.4f} | VA ACC {va_metrics['acc']:.4f} | VA F1 {va_metrics['f1']:.4f}")

        # ---- early stopping on val AUC ----
        if va_metrics["auc"] > best_auc:
            best_auc = va_metrics["auc"]
            best_state = {k: v.detach().cpu() for k, v in model.state_dict().items()}
            patience = es_patience
        else:
            patience -= 1
            if patience <= 0:
                print("Early stopping.")
                break

    if best_state is not None:
        model.load_state_dict(best_state, strict=True)

    # final metrics
    tr_metrics, _, _ = evaluate(model, tr_ld, device)
    va_metrics, _, _ = evaluate(model, va_ld, device)

    # save artifacts if the globals exist
    try:
        torch.save(model.state_dict(), ARTIFACTS/"ultrahybrid_balanced_pp_v2.pt")
    except Exception as _:
        pass

    return model, tr_metrics, va_metrics


#F) Run training and print full metrics (train & validation)

In [17]:
# ==========================================
# F) RUN TRAINING & REPORT METRICS
# ==========================================
model, tr_m, va_m = train_model(
    Xt_tr, Xf_tr_s, y_tr,
    Xt_va, Xf_va_s, y_va,
    batch_size=BATCH_SIZE, epochs=EPOCHS, lr=LR,
    weight_decay=WEIGHT_DECAY, warmup_frac=WARMUP_FRAC,
    use_mixup=USE_MIXUP, mixup_p=MIXUP_P, mixup_alpha=MIXUP_ALPHA,
    es_patience=ES_PATIENCE, device=DEVICE
)

def pretty(m, name):
    print(f"\n[{name}]")
    print(f"ROC-AUC: {m['auc']:.5f} | AP: {m['ap']:.5f}")
    print(f"ACC: {m['acc']:.5f} | PREC: {m['prec']:.5f} | REC: {m['rec']:.5f} | F1: {m['f1']:.5f}")

pretty(tr_m, "TRAIN")
pretty(va_m, "VALIDATION")

# (optional) persist scaler & centroids for later inference
try:
    import joblib
    joblib.dump(scaler, ARTIFACTS/"scaler.pkl")
    np.save(ARTIFACTS/"centroid_h.npy", cents_tr["human"])
    np.save(ARTIFACTS/"centroid_a.npy", cents_tr["ai"])
    print("\nArtifacts saved to:", ARTIFACTS.resolve())
except Exception as _:
    pass


Epoch 01 | TR AUC 0.8987 | TR ACC 0.8152 | TR F1 0.8133 || VA AUC 0.8920 | VA ACC 0.8073 | VA F1 0.8051
Epoch 02 | TR AUC 0.9554 | TR ACC 0.8830 | TR F1 0.8808 || VA AUC 0.9466 | VA ACC 0.8763 | VA F1 0.8738
Epoch 03 | TR AUC 0.9651 | TR ACC 0.9008 | TR F1 0.9015 || VA AUC 0.9542 | VA ACC 0.8877 | VA F1 0.8892
Epoch 04 | TR AUC 0.9706 | TR ACC 0.9080 | TR F1 0.9105 || VA AUC 0.9565 | VA ACC 0.8930 | VA F1 0.8969
Epoch 05 | TR AUC 0.9726 | TR ACC 0.9141 | TR F1 0.9131 || VA AUC 0.9530 | VA ACC 0.8795 | VA F1 0.8795
Epoch 06 | TR AUC 0.9801 | TR ACC 0.9294 | TR F1 0.9300 || VA AUC 0.9591 | VA ACC 0.8934 | VA F1 0.8954
Epoch 07 | TR AUC 0.9850 | TR ACC 0.9402 | TR F1 0.9401 || VA AUC 0.9607 | VA ACC 0.8918 | VA F1 0.8921
Epoch 08 | TR AUC 0.9875 | TR ACC 0.9407 | TR F1 0.9422 || VA AUC 0.9604 | VA ACC 0.8906 | VA F1 0.8955
Epoch 09 | TR AUC 0.9908 | TR ACC 0.9501 | TR F1 0.9511 || VA AUC 0.9608 | VA ACC 0.8926 | VA F1 0.8968
Epoch 10 | TR AUC 0.9931 | TR ACC 0.9577 | TR F1 0.9586 || VA AU

In [18]:
# ==========================================
# G) OPTIONAL — THRESHOLD TUNING ON VALIDATION
# ==========================================
import numpy as np

@torch.no_grad()
def val_logits(model):
    model.eval()
    dl = DataLoader(LangDataset(Xt_va, Xf_va_s, y_va), batch_size=256, shuffle=False)
    logits, ys = [], []
    for xt, xf, yy in dl:
        xt, xf = xt.to(DEVICE), xf.to(DEVICE)
        logits.append(model(xt, xf).cpu().numpy())
        ys.append(yy.numpy())
    return np.concatenate(logits), np.concatenate(ys)

def tune_threshold(logits, y_true, grid=np.linspace(0.1, 0.9, 33)):
    p = 1/(1+np.exp(-logits))
    best_thr, best_f1 = 0.5, -1
    for thr in grid:
        yhat = (p >= thr).astype(int)
        f1 = f1_score(y_true, yhat)
        if f1 > best_f1:
            best_f1, best_thr = f1, thr
    return best_thr, best_f1

logits_va, y_va_true = val_logits(model)
best_thr, best_f1 = tune_threshold(logits_va, y_va_true)
print(f"\nBest validation threshold (by F1): {best_thr:.3f} | F1={best_f1:.4f}")



Best validation threshold (by F1): 0.600 | F1=0.9005


# ==========================================
# H) TEST → SUBMISSION (paragraph-level, 1 row per id)

In [19]:
# ==========================================
# H) TEST → SUBMISSION (PARAGRAPH-LEVEL, SINGLE CSV WITH y_prob)
# ==========================================
import os, glob, json
import numpy as np
import pandas as pd
from pathlib import Path
import torch
from torch.utils.data import DataLoader, Dataset

# -----------------------------
# 0) Required runtime objects
# -----------------------------
assert 'model' in globals(), "Model not found. Run training first so `model` is defined."

# Device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(DEVICE)

# Output dir
if 'ARTIFACTS' not in globals():
    ARTIFACTS = Path("./outputs")
ARTIFACTS.mkdir(parents=True, exist_ok=True)

# ------------------------------------
# 1) Try to reuse your preprocessing
#    (variance_mask, build_features,
#     scaler, cents_tr, SEGMENTS, etc.)
#    If missing, use safe fallbacks.
# ------------------------------------
USE_FALLBACKS = False
if not all(k in globals() for k in ['build_features', 'scaler']):
    USE_FALLBACKS = True

if 'SEGMENTS' not in globals():
    SEGMENTS = (33, 33, 34)   # default split used by many setups

if 'MASK_TOPK_VAR_DIMS' not in globals():
    MASK_TOPK_VAR_DIMS = None

def _variance_mask_fallback(x, topk=None):
    # No-op: just return input unchanged
    return x

variance_mask_fn = globals().get('variance_mask', _variance_mask_fallback)

def _build_features_fallback(Xt_tokens, cents=None, segments=SEGMENTS):
    """
    Minimal, robust features if your custom `build_features` is not available.
    - Mean-pool 100x768 -> 768
    - Early/Mid/Late segment L2-norm means -> 3
    Output: (N, 771)
    """
    # mean pool
    pooled = Xt_tokens.mean(axis=1)  # (N, 768)

    # segment L2 stats
    e = Xt_tokens[:, :segments[0], :]
    m = Xt_tokens[:, segments[0]:segments[0]+segments[1], :]
    l = Xt_tokens[:, -segments[2]:, :]
    def mean_l2(a): return np.linalg.norm(a, axis=2).mean(axis=1, keepdims=True)
    seg = np.hstack([mean_l2(e), mean_l2(m), mean_l2(l)])  # (N, 3)

    return np.hstack([pooled, seg]).astype(np.float32)

build_features_fn = globals().get('build_features', _build_features_fallback)

# Standardize features if a scaler is provided; else pass-through
def _maybe_scale(Xf):
    if 'scaler' in globals():
        return globals()['scaler'].transform(Xf).astype(np.float32)
    return Xf.astype(np.float32)

# ------------------------------------
# 2) Locate test JSONL
# ------------------------------------
def find_test_jsonl():
    for p in [
        "../data/test/test_features.jsonl",
        "../data/test/test.jsonl",
        "../data/test_features.jsonl",
        "data/test/test_features.jsonl",
        "data/test/test.jsonl",
    ]:
        if os.path.exists(p):
            return p
    files = glob.glob("**/*test*features*.jsonl", recursive=True) or glob.glob("**/*.jsonl", recursive=True)
    if not files:
        raise FileNotFoundError("Could not find any test .jsonl file")
    return files[0]

TEST_JSONL = find_test_jsonl()
print("Test file:", TEST_JSONL)

# ------------------------------------
# 3) Parse test JSONL grouped by paragraph id
#    Returns list of (id_str, np.array (M,100,768))
# ------------------------------------
def parse_jsonl_grouped(path):
    grouped = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            obj = json.loads(line)
            base_id = obj.get("id", obj.get("guid", obj.get("index", i)))
            feats = obj.get("features")
            if feats is None:
                continue

            arrs = []
            if isinstance(feats, list):
                for a in feats:
                    a = np.asarray(a).squeeze()
                    if a.shape == (100, 768):
                        arrs.append(a.astype(np.float32))
            else:
                a = np.asarray(feats).squeeze()
                if a.shape == (100, 768):
                    arrs.append(a.astype(np.float32))

            if len(arrs) == 0:
                # Nothing usable on this line; skip with a light warn
                # print(f"[warn] skipping line {i} (no (100,768) chunks)")
                continue

            X = np.stack(arrs, axis=0)  # (M,100,768)
            grouped.append((str(base_id), X))
    if not grouped:
        raise ValueError("Parsed zero paragraphs from test JSONL.")
    return grouped

grouped = parse_jsonl_grouped(TEST_JSONL)
print(f"Paragraphs parsed: {len(grouped)}")

# ------------------------------------
# 4) Inference helpers
# ------------------------------------
class _SmallSet(Dataset):
    def __init__(self, Xt, Xf):
        self.Xt = torch.from_numpy(Xt).float()
        self.Xf = torch.from_numpy(Xf).float()
        self.y  = torch.zeros(len(Xt), dtype=torch.float32)  # dummy
    def __len__(self): return len(self.Xt)
    def __getitem__(self, i): return self.Xt[i], self.Xf[i], self.y[i]

@torch.no_grad()
def predict_probs_for_tokens(Xt_tokens):  # Xt_tokens: (M,100,768)
    # same masking policy
    Xt_m = variance_mask_fn(Xt_tokens, MASK_TOPK_VAR_DIMS)

    # tabular features
    Xf   = build_features_fn(Xt_m, globals().get('cents_tr', None), segments=SEGMENTS)
    Xf_s = _maybe_scale(Xf)

    dl = DataLoader(_SmallSet(Xt_m, Xf_s), batch_size=256, shuffle=False)
    model.eval()
    out = []
    for xt, xf, _ in dl:
        xt, xf = xt.to(DEVICE), xf.to(DEVICE)
        logits = model(xt, xf)             # expects logits
        out.append(torch.sigmoid(logits).cpu().numpy())
    return np.concatenate(out, axis=0)  # (M,)

# ------------------------------------
# 5) Paragraph aggregator (LOGIT-MEAN)
#    Slightly sharper than plain mean
# ------------------------------------
def pool_paragraph_logit(probs, eps=1e-6, temp=1.0):
    p = np.clip(probs, eps, 1 - eps)
    logits = np.log(p) - np.log(1 - p)
    m = logits.mean() / max(1e-6, temp)
    return float(1 / (1 + np.exp(-m)))

# If you want the plain average instead, use:
# def pool_paragraph_mean(probs): return float(probs.mean())

# ------------------------------------
# 6) Run inference per paragraph
# ------------------------------------
records = []
for pid, Xtok in grouped:
    # per-sentence probabilities
    p_sent = predict_probs_for_tokens(Xtok)
    # single probability per paragraph
    y_prob = pool_paragraph_logit(p_sent)   # <-- "slightly sharper" aggregator
    records.append((pid, y_prob))

# ------------------------------------
# 7) Build EXACT submission and save ONE file
# ------------------------------------
sub = pd.DataFrame(records, columns=["id", "y_prob"])

# Cast id to int if possible (keeps your sample CSV look)
sub["id"] = pd.to_numeric(sub["id"], errors="ignore")

# Sort for determinism
sub = sub.sort_values("id").reset_index(drop=True)

OUT_PATH = ARTIFACTS / "submission_base_prob.csv"
sub.to_csv(OUT_PATH, index=False)

print(sub.head(10))
print(f"\nSaved: {OUT_PATH.resolve()}")
print("Shape:", sub.shape)
if USE_FALLBACKS:
    print("[INFO] Used fallback feature pipeline (no custom scaler/build_features found).")


Test file: data/test/test_features.jsonl


Paragraphs parsed: 180
   id    y_prob
0  15  0.010840
1  16  0.031168
2  17  0.065514
3  18  0.479125
4  19  0.126789
5  21  0.021771
6  24  0.049797
7  25  0.402383
8  27  0.128001
9  29  0.289021

Saved: D:\last_ai_ml\artifacts\submission_base_prob.csv
Shape: (180, 2)
