In [3]:
# %% [markdown]
# # AI vs Human Text Classifier — Baseline (Part 1)
# - Mean pooling over sentence embeddings (100x768) → paragraph vector (768)
# - StandardScaler + Logistic Regression (Calibrated)
# - 5-fold CV, OOF metrics, threshold tuning
# - Save artefacts, run test inference, write submission.csv

# %%
import os, json, math, gc, random, time
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_recall_fscore_support

import joblib
from tqdm.auto import tqdm
np.set_printoptions(suppress=True)


In [4]:
# %%
# ==== CONFIG ====
DATA_DIR = Path("./data")  # put your files here
TRAIN_AI_PATH = DATA_DIR / "train/train_ai.npy"
TRAIN_HUMAN_PATH = DATA_DIR / "train/train_human.npy"
VAL_JSONL = DATA_DIR / "train/validation.jsonl"       # optional sanity set
TEST_JSONL = DATA_DIR / "test/test_features.jsonl"   # required for submission

MODELS_DIR = Path("./models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

N_FOLDS = 5
SEED = 42
random.seed(SEED); np.random.seed(SEED)

SUBMISSION_PATH = Path("./submission.csv")
ART_SCALER = MODELS_DIR / "scaler.pkl"
ART_MODEL = MODELS_DIR / "logreg_calibrated.pkl"
ART_CONFIG = MODELS_DIR / "config.json"
ART_THRESHOLD = MODELS_DIR / "final_threshold.txt"


In [6]:
# %%
def load_train(train_ai_path, train_human_path):
    X_ai = np.load(train_ai_path)      # (n_ai, 100, 768)
    X_h  = np.load(train_human_path)   # (n_h, 100, 768)
    y = np.array([1]*len(X_ai) + [0]*len(X_h), dtype=np.int64)
    X = np.concatenate([X_ai, X_h], axis=0)     # (n, 100, 768)
    return X, y

def read_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)

def load_jsonl_features(path):
    """Returns: list of dicts with keys: id, features(np.array of shape (<=100,768))"""
    items = []
    for obj in read_jsonl(path):
        # Expect at least {"id": ..., "features": [[...],[...],...]}
        pid = obj.get("id")
        feats = np.asarray(obj.get("features"), dtype=np.float32)
        # Ensure 2D (n_sent, 768)
        if feats.ndim == 1:
            feats = feats.reshape(1, -1)
        items.append({"id": pid, "features": feats})
    return items

def sentence_mask(x, eps=1e-8):
    """
    Works for (..., 768) – returns mask of shape (...,)
    True where the 768-d vector is non-zero.
    """
    x = np.asarray(x, dtype=np.float32)
    norms = np.linalg.norm(x, axis=-1)
    return (norms > eps)

def mean_pool(x):
    """
    Robust mean pooling:
    - Accepts (100,768)
    - Accepts (segments, 100, 768)
    - Flattens all leading dims into a single sequence of 768-d vectors,
      masks near-zero rows, and averages.
    Returns: (768,)
    """
    x = np.asarray(x, dtype=np.float32)
    if x.ndim < 2:
        # Unexpected shape – return zeros
        return np.zeros((768,), dtype=np.float32)

    # Ensure last dim is feature dim
    feat_dim = x.shape[-1]
    if feat_dim != 768:
        raise ValueError(f"Expected last dim = 768, got {feat_dim} with shape {x.shape}")

    # Flatten all but last dim to get (N, 768), where N = product of leading dims
    x_flat = x.reshape(-1, feat_dim)  # (N, 768)

    m = sentence_mask(x_flat).astype(np.float32)  # (N,)
    count = m.sum()
    if count < 1:
        return np.zeros((feat_dim,), dtype=np.float32)

    return (x_flat * m[:, None]).sum(axis=0) / count

def to_paragraph_matrix_from_npy_block(X):
    """X: (n, 100, 768) -> (n, 768) pooled"""
    pooled = np.zeros((X.shape[0], X.shape[2]), dtype=np.float32)
    for i in range(X.shape[0]):
        pooled[i] = mean_pool(X[i])
    return pooled

def to_paragraph_matrix_from_jsonl_items(items):
    """items: list of dict with features (shape (100,768) or (S,100,768)) -> DataFrame(id, vector)"""
    rows = []
    for obj in items:
        pid = obj["id"]
        vec = mean_pool(obj["features"])
        rows.append({"id": pid, "vector": vec})
    return pd.DataFrame(rows)


In [7]:
# %%
X_raw, y = load_train(TRAIN_AI_PATH, TRAIN_HUMAN_PATH)
print("Train raw shape:", X_raw.shape, "Labels:", y.shape, "Pos rate:", y.mean())

X = to_paragraph_matrix_from_npy_block(X_raw)   # (n, 768)
print("Pooled train shape:", X.shape)


Train raw shape: (16322, 100, 768) Labels: (16322,) Pos rate: 0.5
Pooled train shape: (16322, 768)


In [9]:
# %%
from sklearn.base import clone

def fit_oof_cv(X, y, n_folds=5, seed=42):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    oof_pred = np.zeros(len(y), dtype=np.float32)
    oof_thr_list, fold_metrics = [], []
    C_grid = [0.5, 1.0, 2.0]

    for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y), 1):
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y[trn_idx], y[val_idx]

        best_auc, best_model = -1, None
        for C in C_grid:
            base = Pipeline([
                ("scaler", StandardScaler(with_mean=True, with_std=True)),
                ("clf", LogisticRegression(
                    penalty="l2", solver="saga", max_iter=4000, C=C, n_jobs=-1, random_state=seed))
            ])
            cal = CalibratedClassifierCV(estimator=base, method="isotonic", cv=3)
            cal.fit(X_tr, y_tr)
            p_va = cal.predict_proba(X_va)[:, 1]
            auc = roc_auc_score(y_va, p_va)
            if auc > best_auc:
                best_auc, best_model, best_p_va = auc, cal, p_va

        p_va = best_p_va
        oof_pred[val_idx] = p_va

        # F1-optimal threshold
        best_thr, best_f1 = 0.5, -1
        for t in np.linspace(0.01, 0.99, 199):
            f1 = f1_score(y_va, (p_va >= t).astype(int))
            if f1 > best_f1:
                best_f1, best_thr = f1, t

        pred_lbl = (p_va >= best_thr).astype(int)
        auc = roc_auc_score(y_va, p_va)
        acc = accuracy_score(y_va, pred_lbl)
        prec, rec, f1, _ = precision_recall_fscore_support(y_va, pred_lbl, average="binary", zero_division=0)
        fold_metrics.append({"fold": fold, "AUC": auc, "F1": f1, "Acc": acc, "Prec": prec, "Rec": rec, "thr": best_thr})
        oof_thr_list.append(best_thr)
        print(f"[Fold {fold}] AUC={auc:.4f} F1={f1:.4f} Acc={acc:.4f} Prec={prec:.4f} Rec={rec:.4f} Thr={best_thr:.3f}")

    oof_auc = roc_auc_score(y, oof_pred)
    final_thr = float(np.median(oof_thr_list))
    pred_lbl = (oof_pred >= final_thr).astype(int)
    oof_acc = accuracy_score(y, pred_lbl)
    oof_prec, oof_rec, oof_f1, _ = precision_recall_fscore_support(y, pred_lbl, average="binary", zero_division=0)
    summary = {"OOF_AUC": oof_auc, "OOF_F1": oof_f1, "OOF_Acc": oof_acc, "OOF_Prec": oof_prec, "OOF_Rec": oof_rec,
               "final_thr": final_thr, "folds": fold_metrics}
    return oof_pred, summary


oof_pred, summary = fit_oof_cv(X, y, n_folds=N_FOLDS, seed=SEED)
print("\n=== OOF Summary ===")
for k, v in summary.items():
    if k != "folds":
        print(f"{k}: {v}")
pd.DataFrame(summary["folds"])


[Fold 1] AUC=0.9686 F1=0.9085 Acc=0.9066 Prec=0.8906 Rec=0.9271 Thr=0.465
[Fold 2] AUC=0.9658 F1=0.9033 Acc=0.9029 Prec=0.8992 Rec=0.9075 Thr=0.485
[Fold 3] AUC=0.9649 F1=0.9045 Acc=0.9020 Prec=0.8814 Rec=0.9289 Thr=0.475
[Fold 4] AUC=0.9644 F1=0.9033 Acc=0.9017 Prec=0.8881 Rec=0.9191 Thr=0.416
[Fold 5] AUC=0.9679 F1=0.9057 Acc=0.9035 Prec=0.8853 Rec=0.9271 Thr=0.455

=== OOF Summary ===
OOF_AUC: 0.9660641019648668
OOF_F1: 0.9043258223882396
OOF_Acc: 0.9027080014704081
OOF_Prec: 0.8895341946189403
OOF_Rec: 0.9196176939100601
final_thr: 0.46535353535353535


Unnamed: 0,fold,AUC,F1,Acc,Prec,Rec,thr
0,1,0.968649,0.908491,0.906585,0.890588,0.927128,0.465354
1,2,0.965775,0.903324,0.90291,0.899211,0.907475,0.485152
2,3,0.964851,0.904535,0.901961,0.881395,0.928922,0.475253
3,4,0.964383,0.903342,0.901654,0.888099,0.919118,0.415859
4,5,0.967931,0.905717,0.903493,0.885313,0.927083,0.455455


In [8]:
# === Final fit (LogReg + isotonic calibration), threshold selection, and artefact saving ===

# 1) Fit calibrated final model on ALL training data
best_C = 1.0  # keep or pick from your CV sweep
final_base = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(
        penalty="l2", solver="saga", max_iter=4000, C=best_C, n_jobs=-1, random_state=SEED))
])
final_model = CalibratedClassifierCV(estimator=final_base, method="isotonic", cv=5)
final_model.fit(X, y)

# 2) Choose threshold
#    Start from OOF-derived median threshold, then (optionally) nudge down a bit to trade precision for recall.
thr_oof = float(summary["final_thr"])
thr_val = max(0.0, thr_oof - 0.05)  # small nudge; adjust if needed

print("== OOF-based threshold ==", thr_oof)
_ = evaluate_on_validation(VAL_JSONL, final_model, thr_oof)

print("== Val-nudged threshold ==", thr_val)
_ = evaluate_on_validation(VAL_JSONL, final_model, thr_val)

# Pick which threshold you want to use going forward:
chosen_threshold = thr_val   # or use thr_oof if you prefer higher precision

# 3) Save artefacts
joblib.dump(final_model, ART_MODEL)

config = {
    "seed": SEED,
    "n_folds": N_FOLDS,
    "clf": "LogReg(saga)+Calibrated(isotonic)",
    "C": best_C,
    "vector": "mean_pool_768",
    "final_threshold_oof": thr_oof,
    "final_threshold_chosen": chosen_threshold
}
json.dump(config, open(ART_CONFIG, "w"))

with open(ART_THRESHOLD, "w") as f:
    f.write(str(chosen_threshold))

print("Saved:", ART_MODEL, ART_CONFIG, ART_THRESHOLD)


NameError: name 'summary' is not defined

In [27]:
# %%
def evaluate_on_validation(val_jsonl_path, model, threshold):
    if not Path(val_jsonl_path).exists():
        print("No validation file found. Skipping.")
        return None
    items = load_jsonl_features(val_jsonl_path)
    dfv = to_paragraph_matrix_from_jsonl_items(items)
    # Expect labels exist in validation.jsonl; if not, skip
    # Common schema: {"id":..., "features":[...], "label": 0/1}
    labels = []
    for obj in read_jsonl(val_jsonl_path):
        labels.append(int(obj.get("label", 0)))
    yv = np.array(labels, dtype=np.int64)

    Xv = np.stack(dfv["vector"].values)  # (n, 768)
    pv = model.predict_proba(Xv)[:, 1]
    auc = roc_auc_score(yv, pv)
    yhat = (pv >= threshold).astype(int)
    acc = accuracy_score(yv, yhat)
    prec, rec, f1, _ = precision_recall_fscore_support(yv, yhat, average="binary", zero_division=0)
    print(f"[Validation] AUC={auc:.4f} F1={f1:.4f} Acc={acc:.4f} Prec={prec:.4f} Rec={rec:.4f}")
    return {"AUC":auc, "F1":f1, "Acc":acc, "Prec":prec, "Rec":rec}

_ = evaluate_on_validation(VAL_JSONL, final_model, summary["final_thr"])


[Validation] AUC=0.9900 F1=0.6667 Acc=0.7500 Prec=1.0000 Rec=0.5000


In [40]:
# === Kaggle-compliant: id,y_prob (no thresholding) ===
def predict_test_and_write_submission_prob(test_jsonl_path, model, out_csv_path):
    items = load_jsonl_features(test_jsonl_path)
    dft = to_paragraph_matrix_from_jsonl_items(items)
    Xt = np.stack(dft["vector"].values)
    y_prob = model.predict_proba(Xt)[:, 1]          # <-- probability of class 1
    sub = pd.DataFrame({"id": dft["id"], "y_prob": y_prob})
    sub.to_csv(out_csv_path, index=False)
    print("Wrote submission to:", out_csv_path)
    return sub

# write the baseline prob file (good quick uplift)
SUBMISSION_BASE_PROB = Path("./submission_base_prob.csv")
_ = predict_test_and_write_submission_prob(TEST_JSONL, final_model, SUBMISSION_BASE_PROB)


Wrote submission to: submission_base_prob.csv


In [10]:
# %% [markdown]
# # Part 2 — Lightweight Transformer + Attention Pooling (PyTorch)

# %%
import os, json, math, gc, random, time
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

from tqdm.auto import tqdm

torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

In [10]:
# %%
# Reuse DATA_DIR, MODELS_DIR, VAL_JSONL, TEST_JSONL, SEED, etc. from Part 1
# If not defined above, uncomment and set them here:
# DATA_DIR = Path("./data")
# MODELS_DIR = Path("./models")
# VAL_JSONL = DATA_DIR / "validation.jsonl"
# TEST_JSONL = DATA_DIR / "test_features.jsonl"
# SEED = 42
# N_FOLDS = 5

TRANS_DIR = MODELS_DIR / "transformer_v1"
TRANS_DIR.mkdir(parents=True, exist_ok=True)

CFG = {
    "max_len": 100,           # fixed number of sentences per item
    "feat_dim": 768,          # embedding size
    "d_model": 256,           # transformer hidden size
    "n_heads": 4,
    "n_layers": 2,
    "dropout": 0.1,
    "batch_size": 32,
    "lr": 3e-4,
    "weight_decay": 1e-4,
    "epochs": 20,
    "early_stop_patience": 5,
    "amp": True,              # mixed precision if CUDA
    "seed": SEED
}
json.dump(CFG, open(TRANS_DIR / "config.json", "w"))
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)


In [11]:
# %%
def pad_or_truncate(x: np.ndarray, max_len=100):
    """
    x: (n_sent, 768) or (100,768). Returns (max_len, 768) padded with zeros or truncated.
    """
    x = np.asarray(x, dtype=np.float32)
    if x.ndim == 3:
        # If given (segments, 100, 768), flatten to (segments*100, 768) first
        x = x.reshape(-1, x.shape[-1])
    n = x.shape[0]
    if n == max_len:
        return x
    if n > max_len:
        return x[:max_len]
    # pad
    out = np.zeros((max_len, x.shape[1]), dtype=np.float32)
    out[:n] = x
    return out

def make_mask(x: np.ndarray):
    """
    x: (max_len, 768). Mask True where row is non-zero.
    """
    return (np.linalg.norm(x, axis=-1) > 1e-8).astype(np.bool_)

class ParagraphDataset(Dataset):
    def __init__(self, X, y=None, max_len=100):
        """
        X:
          - if training from npy: shape (N, 100, 768)
          - if from jsonl items: list of dicts {id, features}
        y: labels or None
        """
        self.max_len = max_len
        self.ids = None

        if isinstance(X, np.ndarray):
            self.X = X  # (N, 100, 768)
            self.ids = np.arange(len(X))
        elif isinstance(X, list):  # jsonl items
            self.ids = [obj["id"] for obj in X]
            mats, masks = [], []
            for obj in X:
                mat = pad_or_truncate(np.asarray(obj["features"], dtype=np.float32), max_len=self.max_len)
                mats.append(mat)
                masks.append(make_mask(mat))
            self.X = np.stack(mats)   # (N, max_len, 768)
            self._masks = np.stack(masks)  # (N, max_len)
        else:
            raise ValueError("Unsupported X type.")

        self.y = None if y is None else np.asarray(y, dtype=np.int64)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        item = {
            "x": torch.from_numpy(self.X[idx]),                    # (max_len, 768)
            "mask": torch.from_numpy(make_mask(self.X[idx])) if not hasattr(self, "_masks") else torch.from_numpy(self._masks[idx])  # (max_len,)
        }
        if self.y is not None:
            item["y"] = torch.tensor(self.y[idx], dtype=torch.long)
        if self.ids is not None:
            item["id"] = self.ids[idx]
        return item

def make_loader(dataset, batch_size, shuffle=False, num_workers=0, pin_memory=None):
    """
    Use num_workers=0 in notebooks (esp. on macOS) to avoid:
    AttributeError: Can't get attribute 'ParagraphDataset' on <module '__main__'>
    """
    import torch
    if pin_memory is None:
        pin_memory = (torch.cuda.is_available() and torch.device("cuda").type == "cuda")
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      num_workers=0,          # <— force single-process workers
                      pin_memory=pin_memory,
                      drop_last=False)


In [12]:
# %%
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)  # (max_len, d_model)

    def forward(self, x):
        # x: (B, T, d_model)
        T = x.size(1)
        return x + self.pe[:T].unsqueeze(0)

class AttnPool(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.attn = nn.Linear(d_model, 1)

    def forward(self, x, mask):
        # x: (B,T,d), mask: (B,T) [True for valid]
        attn_logits = self.attn(x).squeeze(-1)                  # (B,T)
        attn_logits = attn_logits.masked_fill(~mask, -1e9)
        w = F.softmax(attn_logits, dim=-1).unsqueeze(-1)        # (B,T,1)
        pooled = (x * w).sum(dim=1)                             # (B,d)
        return pooled

class LightTransformer(nn.Module):
    def __init__(self, feat_dim=768, d_model=256, n_heads=4, n_layers=2, dropout=0.1, max_len=100):
        super().__init__()
        self.proj = nn.Linear(feat_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, dim_feedforward=d_model*4,
                                                   dropout=dropout, batch_first=True, activation="gelu")
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.posenc = PositionalEncoding(d_model, max_len=max_len)
        self.pool = AttnPool(d_model)
        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Dropout(dropout),
            nn.Linear(d_model, 128),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(self, x, mask):
        """
        x: (B,T,768), mask: (B,T) bool
        """
        h = self.proj(x)                  # (B,T,d)
        h = self.posenc(h)
        # Transformer expects mask where True=positions to ignore
        src_key_padding_mask = ~mask      # (B,T)
        h = self.encoder(h, src_key_padding_mask=src_key_padding_mask)
        pooled = self.pool(h, mask)       # (B,d)
        logits = self.head(pooled).squeeze(-1)  # (B,)
        return logits


In [13]:
# %%
def train_one_epoch(model, loader, optim, scaler, device):
    model.train()
    total = 0.0
    for batch in loader:
        x = batch["x"].to(device).float()
        mask = batch["mask"].to(device)
        y = batch["y"].to(device).float()

        optim.zero_grad(set_to_none=True)
        if scaler is not None:
            with torch.cuda.amp.autocast():
                logits = model(x, mask)
                loss = F.binary_cross_entropy_with_logits(logits, y)
            scaler.scale(loss).backward()
            scaler.step(optim)
            scaler.update()
        else:
            logits = model(x, mask)
            loss = F.binary_cross_entropy_with_logits(logits, y)
            loss.backward()
            optim.step()
        total += loss.item() * x.size(0)
    return total / len(loader.dataset)

@torch.no_grad()
def predict_proba(model, loader, device):
    model.eval()
    probs = []
    for batch in loader:
        x = batch["x"].to(device).float()
        mask = batch["mask"].to(device)
        logits = model(x, mask)
        p = torch.sigmoid(logits).detach().cpu().numpy()
        probs.append(p)
    return np.concatenate(probs, axis=0)

def fit_cv_transformer(X_raw, y, cfg, n_folds=5, seed=42, out_dir=TRANS_DIR):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=np.float32)
    best_thresholds, fold_paths = [], []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_raw, y), 1):
        print(f"\n=== Fold {fold}/{n_folds} ===")
        tr_ds = ParagraphDataset(X_raw[tr_idx], y=y[tr_idx], max_len=cfg["max_len"])
        va_ds = ParagraphDataset(X_raw[va_idx], y=y[va_idx], max_len=cfg["max_len"])
        tr_loader = make_loader(tr_ds, cfg["batch_size"], shuffle=True)
        va_loader = make_loader(va_ds, cfg["batch_size"], shuffle=False)

        model = LightTransformer(
            feat_dim=cfg["feat_dim"], d_model=cfg["d_model"], n_heads=cfg["n_heads"],
            n_layers=cfg["n_layers"], dropout=cfg["dropout"], max_len=cfg["max_len"]
        ).to(device)

        optim = torch.optim.AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])
        scaler = torch.cuda.amp.GradScaler(enabled=(cfg["amp"] and device.type == "cuda"))

        best_auc, best_state, no_improve = -1.0, None, 0
        for epoch in range(1, cfg["epochs"]+1):
            tr_loss = train_one_epoch(model, tr_loader, optim, scaler, device)
            # validation
            va_probs = predict_proba(model, va_loader, device)
            auc = roc_auc_score(y[va_idx], va_probs)
            print(f"Epoch {epoch:02d} | train_loss={tr_loss:.4f} | val_auc={auc:.5f}")

            if auc > best_auc + 1e-5:
                best_auc = auc
                best_state = {k: v.cpu() for k, v in model.state_dict().items()}
                no_improve = 0
            else:
                no_improve += 1
                if no_improve >= cfg["early_stop_patience"]:
                    print(f"Early stopping at epoch {epoch} (best AUC {best_auc:.5f})")
                    break

        # restore best
        model.load_state_dict(best_state)

        # final fold validation probabilities
        va_probs = predict_proba(model, va_loader, device)
        oof[va_idx] = va_probs

        # choose best threshold on this fold (F1)
        best_thr, best_f1 = 0.5, -1.0
        for t in np.linspace(0.01, 0.99, 199):
            f1 = f1_score(y[va_idx], (va_probs >= t).astype(int))
            if f1 > best_f1:
                best_f1, best_thr = f1, t
        best_thresholds.append(float(best_thr))

        # save fold checkpoint
        fold_path = out_dir / f"fold{fold}.pt"
        torch.save({"model_state": best_state, "cfg": cfg}, fold_path)
        fold_paths.append(str(fold_path))
        del model, optim, scaler, tr_loader, va_loader
        gc.collect(); torch.cuda.empty_cache()

    # OOF summary
    oof_auc = roc_auc_score(y, oof)
    print(f"\nOOF AUC: {oof_auc:.5f}")
    final_thr = float(np.median(best_thresholds))
    return oof, oof_auc, best_thresholds, final_thr, fold_paths

import types
def _patched_fit_cv_transformer(X_raw, y, cfg, n_folds=5, seed=42, out_dir=TRANS_DIR):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=np.float32)
    best_thresholds, fold_paths = [], []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_raw, y), 1):
        print(f"\n=== Fold {fold}/{n_folds} ===")
        tr_ds = ParagraphDataset(X_raw[tr_idx], y=y[tr_idx], max_len=cfg["max_len"])
        va_ds = ParagraphDataset(X_raw[va_idx], y=y[va_idx], max_len=cfg["max_len"])
        tr_loader = make_loader(tr_ds, cfg["batch_size"], shuffle=True, num_workers=0)
        va_loader = make_loader(va_ds, cfg["batch_size"], shuffle=False, num_workers=0)

        model = LightTransformer(
            feat_dim=cfg["feat_dim"], d_model=cfg["d_model"], n_heads=cfg["n_heads"],
            n_layers=cfg["n_layers"], dropout=cfg["dropout"], max_len=cfg["max_len"]
        ).to(device)

        optim = torch.optim.AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])
        # Updated AMP scaler API
        scaler = torch.amp.GradScaler("cuda", enabled=(cfg["amp"] and device.type == "cuda"))

        best_auc, best_state, no_improve = -1.0, None, 0
        for epoch in range(1, cfg["epochs"]+1):
            tr_loss = train_one_epoch(model, tr_loader, optim, scaler, device)
            va_probs = predict_proba(model, va_loader, device)
            auc = roc_auc_score(y[va_idx], va_probs)
            print(f"Epoch {epoch:02d} | train_loss={tr_loss:.4f} | val_auc={auc:.5f}")

            if auc > best_auc + 1e-5:
                best_auc = auc
                best_state = {k: v.cpu() for k, v in model.state_dict().items()}
                no_improve = 0
            else:
                no_improve += 1
                if no_improve >= cfg["early_stop_patience"]:
                    print(f"Early stopping at epoch {epoch} (best AUC {best_auc:.5f})")
                    break

        model.load_state_dict(best_state)

        va_probs = predict_proba(model, va_loader, device)
        oof[va_idx] = va_probs

        # F1-optimal threshold on this fold
        best_thr, best_f1 = 0.5, -1.0
        for t in np.linspace(0.01, 0.99, 199):
            f1 = f1_score(y[va_idx], (va_probs >= t).astype(int))
            if f1 > best_f1:
                best_f1, best_thr = f1, t
        best_thresholds.append(float(best_thr))

        fold_path = out_dir / f"fold{fold}.pt"
        torch.save({"model_state": best_state, "cfg": cfg}, fold_path)
        fold_paths.append(str(fold_path))

        del model, optim, scaler, tr_loader, va_loader
        gc.collect()
        if device.type == "cuda":
            torch.cuda.empty_cache()

    oof_auc = roc_auc_score(y, oof)
    print(f"\nOOF AUC: {oof_auc:.5f}")
    final_thr = float(np.median(best_thresholds))
    return oof, oof_auc, best_thresholds, final_thr, fold_paths

# replace the original function reference so your next cell uses the patched one
fit_cv_transformer = _patched_fit_cv_transformer


In [17]:
# %%
# X_raw, y already loaded in Part 1 (np.load + concat). If not, reload them here.
# X_raw: (N, 100, 768), y: (N,)
oof_probs_nn, oof_auc_nn, fold_thrs_nn, final_thr_nn, fold_paths = fit_cv_transformer(
    X_raw, y, CFG, n_folds=N_FOLDS, seed=SEED, out_dir=TRANS_DIR
)
print("Fold thresholds (F1-optimal):", [round(t, 3) for t in fold_thrs_nn])
print("Final threshold (median):", round(final_thr_nn, 4))



=== Fold 1/5 ===


  with torch.cuda.amp.autocast():
  output = torch._nested_tensor_from_mask(


Epoch 01 | train_loss=0.4150 | val_auc=0.94423


  with torch.cuda.amp.autocast():


Epoch 02 | train_loss=0.3052 | val_auc=0.96196


  with torch.cuda.amp.autocast():


Epoch 03 | train_loss=0.2782 | val_auc=0.95162


  with torch.cuda.amp.autocast():


Epoch 04 | train_loss=0.2600 | val_auc=0.96495


  with torch.cuda.amp.autocast():


Epoch 05 | train_loss=0.2618 | val_auc=0.96229


  with torch.cuda.amp.autocast():


Epoch 06 | train_loss=0.2428 | val_auc=0.96720


  with torch.cuda.amp.autocast():


Epoch 07 | train_loss=0.2334 | val_auc=0.96460


  with torch.cuda.amp.autocast():


Epoch 08 | train_loss=0.2342 | val_auc=0.96114


  with torch.cuda.amp.autocast():


Epoch 09 | train_loss=0.2175 | val_auc=0.96210


  with torch.cuda.amp.autocast():


Epoch 10 | train_loss=0.2155 | val_auc=0.96316


  with torch.cuda.amp.autocast():


Epoch 11 | train_loss=0.2147 | val_auc=0.96797


  with torch.cuda.amp.autocast():


Epoch 12 | train_loss=0.2009 | val_auc=0.96664


  with torch.cuda.amp.autocast():


Epoch 13 | train_loss=0.1927 | val_auc=0.96684


  with torch.cuda.amp.autocast():


Epoch 14 | train_loss=0.1959 | val_auc=0.96321


  with torch.cuda.amp.autocast():


Epoch 15 | train_loss=0.1872 | val_auc=0.96265


  with torch.cuda.amp.autocast():


Epoch 16 | train_loss=0.1841 | val_auc=0.96504
Early stopping at epoch 16 (best AUC 0.96797)

=== Fold 2/5 ===


  with torch.cuda.amp.autocast():


Epoch 01 | train_loss=0.4201 | val_auc=0.94703


  with torch.cuda.amp.autocast():


Epoch 02 | train_loss=0.3068 | val_auc=0.95439


  with torch.cuda.amp.autocast():


Epoch 03 | train_loss=0.2911 | val_auc=0.95886


  with torch.cuda.amp.autocast():


Epoch 04 | train_loss=0.2734 | val_auc=0.96153


  with torch.cuda.amp.autocast():


Epoch 05 | train_loss=0.2517 | val_auc=0.95317


  with torch.cuda.amp.autocast():


Epoch 06 | train_loss=0.2450 | val_auc=0.96277


  with torch.cuda.amp.autocast():


Epoch 07 | train_loss=0.2313 | val_auc=0.96469


  with torch.cuda.amp.autocast():


Epoch 08 | train_loss=0.2265 | val_auc=0.96538


  with torch.cuda.amp.autocast():


Epoch 09 | train_loss=0.2255 | val_auc=0.96449


  with torch.cuda.amp.autocast():


Epoch 10 | train_loss=0.2107 | val_auc=0.96472


  with torch.cuda.amp.autocast():


Epoch 11 | train_loss=0.2022 | val_auc=0.96481


  with torch.cuda.amp.autocast():


Epoch 12 | train_loss=0.2004 | val_auc=0.96086


  with torch.cuda.amp.autocast():


Epoch 13 | train_loss=0.1906 | val_auc=0.96495
Early stopping at epoch 13 (best AUC 0.96538)

=== Fold 3/5 ===


  with torch.cuda.amp.autocast():


Epoch 01 | train_loss=0.4020 | val_auc=0.94827


  with torch.cuda.amp.autocast():


Epoch 02 | train_loss=0.3091 | val_auc=0.95493


  with torch.cuda.amp.autocast():


Epoch 03 | train_loss=0.2830 | val_auc=0.95808


  with torch.cuda.amp.autocast():


Epoch 04 | train_loss=0.2599 | val_auc=0.96130


  with torch.cuda.amp.autocast():


Epoch 05 | train_loss=0.2490 | val_auc=0.94852


  with torch.cuda.amp.autocast():


Epoch 06 | train_loss=0.2351 | val_auc=0.96368


  with torch.cuda.amp.autocast():


Epoch 07 | train_loss=0.2224 | val_auc=0.96334


  with torch.cuda.amp.autocast():


Epoch 08 | train_loss=0.2213 | val_auc=0.96400


  with torch.cuda.amp.autocast():


Epoch 09 | train_loss=0.2084 | val_auc=0.96214


  with torch.cuda.amp.autocast():


Epoch 10 | train_loss=0.2015 | val_auc=0.96291


  with torch.cuda.amp.autocast():


Epoch 11 | train_loss=0.1929 | val_auc=0.96350


  with torch.cuda.amp.autocast():


Epoch 12 | train_loss=0.1887 | val_auc=0.96215


  with torch.cuda.amp.autocast():


Epoch 13 | train_loss=0.1867 | val_auc=0.96104
Early stopping at epoch 13 (best AUC 0.96400)

=== Fold 4/5 ===


  with torch.cuda.amp.autocast():


Epoch 01 | train_loss=0.4232 | val_auc=0.93440


  with torch.cuda.amp.autocast():


Epoch 02 | train_loss=0.3094 | val_auc=0.95429


  with torch.cuda.amp.autocast():


Epoch 03 | train_loss=0.3113 | val_auc=0.95783


  with torch.cuda.amp.autocast():


Epoch 04 | train_loss=0.2584 | val_auc=0.95740


  with torch.cuda.amp.autocast():


Epoch 05 | train_loss=0.2756 | val_auc=0.95836


  with torch.cuda.amp.autocast():


Epoch 06 | train_loss=0.2447 | val_auc=0.96253


  with torch.cuda.amp.autocast():


Epoch 07 | train_loss=0.2340 | val_auc=0.95897


  with torch.cuda.amp.autocast():


Epoch 08 | train_loss=0.2265 | val_auc=0.96236


  with torch.cuda.amp.autocast():


Epoch 09 | train_loss=0.2193 | val_auc=0.96406


  with torch.cuda.amp.autocast():


Epoch 10 | train_loss=0.2143 | val_auc=0.96122


  with torch.cuda.amp.autocast():


Epoch 11 | train_loss=0.2075 | val_auc=0.96264


  with torch.cuda.amp.autocast():


Epoch 12 | train_loss=0.1986 | val_auc=0.95914


  with torch.cuda.amp.autocast():


Epoch 13 | train_loss=0.1878 | val_auc=0.96002


  with torch.cuda.amp.autocast():


Epoch 14 | train_loss=0.1783 | val_auc=0.96344
Early stopping at epoch 14 (best AUC 0.96406)

=== Fold 5/5 ===


  with torch.cuda.amp.autocast():


Epoch 01 | train_loss=0.4174 | val_auc=0.94363


  with torch.cuda.amp.autocast():


Epoch 02 | train_loss=0.3107 | val_auc=0.95593


  with torch.cuda.amp.autocast():


Epoch 03 | train_loss=0.2832 | val_auc=0.96335


  with torch.cuda.amp.autocast():


Epoch 04 | train_loss=0.2701 | val_auc=0.95562


  with torch.cuda.amp.autocast():


Epoch 05 | train_loss=0.2446 | val_auc=0.96526


  with torch.cuda.amp.autocast():


Epoch 06 | train_loss=0.2472 | val_auc=0.96526


  with torch.cuda.amp.autocast():


Epoch 07 | train_loss=0.2319 | val_auc=0.96551


  with torch.cuda.amp.autocast():


Epoch 08 | train_loss=0.2302 | val_auc=0.96379


  with torch.cuda.amp.autocast():


Epoch 09 | train_loss=0.2237 | val_auc=0.96591


  with torch.cuda.amp.autocast():


Epoch 10 | train_loss=0.2139 | val_auc=0.96372


  with torch.cuda.amp.autocast():


Epoch 11 | train_loss=0.2111 | val_auc=0.95510


  with torch.cuda.amp.autocast():


Epoch 12 | train_loss=0.2053 | val_auc=0.96388


  with torch.cuda.amp.autocast():


Epoch 13 | train_loss=0.2113 | val_auc=0.96615


  with torch.cuda.amp.autocast():


Epoch 14 | train_loss=0.1975 | val_auc=0.96393


  with torch.cuda.amp.autocast():


Epoch 15 | train_loss=0.1903 | val_auc=0.96456


  with torch.cuda.amp.autocast():


Epoch 16 | train_loss=0.1879 | val_auc=0.96357


  with torch.cuda.amp.autocast():


Epoch 17 | train_loss=0.1784 | val_auc=0.95766


  with torch.cuda.amp.autocast():


Epoch 18 | train_loss=0.1739 | val_auc=0.95736
Early stopping at epoch 18 (best AUC 0.96615)

OOF AUC: 0.95645
Fold thresholds (F1-optimal): [0.495, 0.094, 0.525, 0.356, 0.495]
Final threshold (median): 0.4951


In [18]:
# %%
class FoldEnsembleAdapter:
    """
    Provides predict_proba(X) like a scikit model.
    X must be (N, 768) **if using mean-pooling**; BUT we want (N, 100, 768).
    So for this adapter, we'll expect raw matrices and do the padding here.
    To integrate with your existing evaluate_on_validation/predict_test...,
    we add small wrappers below.
    """
    def __init__(self, fold_paths, cfg, device):
        self.fold_paths = fold_paths
        self.cfg = cfg
        self.device = device
        self.models = []
        self._load_models()

    def _load_models(self):
        self.models = []
        for p in self.fold_paths:
            ckpt = torch.load(p, map_location="cpu")
            m = LightTransformer(
                feat_dim=self.cfg["feat_dim"], d_model=self.cfg["d_model"],
                n_heads=self.cfg["n_heads"], n_layers=self.cfg["n_layers"],
                dropout=self.cfg["dropout"], max_len=self.cfg["max_len"]
            ).to(self.device)
            m.load_state_dict(ckpt["model_state"])
            m.eval()
            self.models.append(m)

    @torch.no_grad()
    def _predict_loader(self, loader):
        probs_fold = []
        for m in self.models:
            p = predict_proba(m, loader, self.device)
            probs_fold.append(p)
        probs = np.mean(np.stack(probs_fold, axis=0), axis=0)
        return probs

    # Convenience to mimic scikit's predict_proba for (N,768) inputs
    # We'll override with wrappers below that build proper datasets.


In [19]:
# %%
def evaluate_validation_with_nn(val_jsonl_path, adapter, threshold):
    if not Path(val_jsonl_path).exists():
        print("No validation file found. Skipping.")
        return None

    # Load JSONL items (reuse your function from Part 1)
    items = load_jsonl_features(val_jsonl_path)

    # Build dataset/loader for the NN (pads to max_len)
    val_ds = ParagraphDataset(items, y=None, max_len=CFG["max_len"])
    val_loader = make_loader(val_ds, CFG["batch_size"], shuffle=False)

    # Collect labels from jsonl
    labels = [int(obj.get("label", 0)) for obj in read_jsonl(val_jsonl_path)]
    yv = np.array(labels, dtype=np.int64)

    # Predict (averaging folds)
    pv = adapter._predict_loader(val_loader)
    auc = roc_auc_score(yv, pv)
    yhat = (pv >= threshold).astype(int)

    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    acc = accuracy_score(yv, yhat)
    prec, rec, f1, _ = precision_recall_fscore_support(yv, yhat, average="binary", zero_division=0)
    print(f"[NN Validation] AUC={auc:.4f} F1={f1:.4f} Acc={acc:.4f} Prec={prec:.4f} Rec={rec:.4f}")
    return {"AUC":auc, "F1":f1, "Acc":acc, "Prec":prec, "Rec":rec}


In [23]:
# %%
nn_adapter = FoldEnsembleAdapter(fold_paths, CFG, device)

print("== Using NN median threshold ==")
_ = evaluate_validation_with_nn(VAL_JSONL, nn_adapter, final_thr_nn)

print("== Using LR threshold (from Part 1) ==")
_ = evaluate_validation_with_nn(VAL_JSONL, nn_adapter, summary["final_thr"])  # from Part 1 baseline


== Using NN median threshold ==
[NN Validation] AUC=0.7700 F1=0.6000 Acc=0.6000 Prec=0.6000 Rec=0.6000
== Using LR threshold (from Part 1) ==
[NN Validation] AUC=0.7700 F1=0.6000 Acc=0.6000 Prec=0.6000 Rec=0.6000


In [24]:
# %%
def nn_test_inference_and_submit(test_jsonl_path, adapter, threshold, out_csv_path):
    items = load_jsonl_features(test_jsonl_path)
    test_ds = ParagraphDataset(items, y=None, max_len=CFG["max_len"])
    test_loader = make_loader(test_ds, CFG["batch_size"], shuffle=False)

    pt = adapter._predict_loader(test_loader)
    yhat = (pt >= threshold).astype(int)

    df = pd.DataFrame({"id": [obj["id"] for obj in items], "label": yhat})
    df.to_csv(out_csv_path, index=False)
    print("Wrote submission:", out_csv_path)
    return df

# Choose a threshold for NN (median of fold bests is a good start)
SUBMISSION_PATH_NN = Path("./submission_nn.csv")
_ = nn_test_inference_and_submit(TEST_JSONL, nn_adapter, final_thr_nn, SUBMISSION_PATH_NN)


Wrote submission: submission_nn.csv


In [41]:
def nn_prob_submission(test_jsonl_path, nn_adapter, out_csv_path):
    items = load_jsonl_features(test_jsonl_path)
    test_ds = ParagraphDataset(items, y=None, max_len=CFG["max_len"])
    test_loader = make_loader(test_ds, CFG["batch_size"], shuffle=False)
    y_prob = nn_adapter._predict_loader(test_loader)
    pd.DataFrame({"id": [o["id"] for o in items], "y_prob": y_prob}).to_csv(out_csv_path, index=False)
    print("Wrote:", out_csv_path)

SUBMISSION_NN_PROB = Path("./submission_nn_prob.csv")
nn_prob_submission(TEST_JSONL, nn_adapter, SUBMISSION_NN_PROB)


Wrote: submission_nn_prob.csv


In [42]:
def meta_prob_submission(test_jsonl_path, base_model, meta_model, out_csv_path):
    items = load_jsonl_features(test_jsonl_path)
    dft = to_paragraph_matrix_from_jsonl_items(items)
    Xt = np.stack(dft["vector"].values)
    p_base = base_model.predict_proba(Xt)[:, 1]
    feat = mmd_features_batch(Xt, BANK_AI, BANK_HU, gamma, mAA, mHH)
    feat["p_base"] = p_base
    Xf = feat[["p_base","k_xA","k_xH","diff_xA_xH","glob_mAA_mHH"]].values
    y_prob = meta_full.predict_proba(Xf)[:, 1]
    pd.DataFrame({"id": dft["id"], "y_prob": y_prob}).to_csv(out_csv_path, index=False)
    print("Wrote:", out_csv_path)

SUBMISSION_MMD_PROB = Path("./submission_mmd_prob.csv")
meta_prob_submission(TEST_JSONL, final_model, meta_full, SUBMISSION_MMD_PROB)


Wrote: submission_mmd_prob.csv


In [29]:
# %%
def blended_validation(val_jsonl_path, scikit_model, scikit_thr, nn_adapter, nn_thr, alpha=0.5):
    if not Path(val_jsonl_path).exists():
        print("No validation file found. Skipping.")
        return None
    items = load_jsonl_features(val_jsonl_path)
    val_ds = ParagraphDataset(items, y=None, max_len=CFG["max_len"])
    val_loader = make_loader(val_ds, CFG["batch_size"], shuffle=False)

    # labels
    labels = [int(obj.get("label", 0)) for obj in read_jsonl(val_jsonl_path)]
    yv = np.array(labels, dtype=np.int64)

    # probs from baseline (mean-pooled)
    dfv = to_paragraph_matrix_from_jsonl_items(items)
    Xv = np.stack(dfv["vector"].values)
    pb = scikit_model.predict_proba(Xv)[:, 1]

    # probs from NN
    pn = nn_adapter._predict_loader(val_loader)

    p = alpha*pn + (1-alpha)*pb

    # pick threshold by simple scan (or blend of thresholds)
    best_t, best_f1 = 0.5, -1
    for t in np.linspace(0.01, 0.99, 199):
        f1 = f1_score(yv, (p >= t).astype(int))
        if f1 > best_f1:
            best_f1, best_t = f1, t

    from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support
    auc = roc_auc_score(yv, p)
    yhat = (p >= best_t).astype(int)
    acc = accuracy_score(yv, yhat)
    prec, rec, f1, _ = precision_recall_fscore_support(yv, yhat, average="binary", zero_division=0)
    print(f"[Blend @alpha={alpha}] AUC={auc:.4f} F1={f1:.4f} Acc={acc:.4f} Prec={prec:.4f} Rec={rec:.4f} Thr={best_t:.3f}")
    return dict(AUC=auc, F1=f1, Acc=acc, Prec=prec, Rec=rec, Thr=best_t)

print("== Blend sanity check (alpha=0.5) ==")
_ = blended_validation(VAL_JSONL, final_model, summary["final_thr"], nn_adapter, final_thr_nn, alpha=0.5)


== Blend sanity check (alpha=0.5) ==
[Blend @alpha=0.5] AUC=0.8200 F1=0.7826 Acc=0.7500 Prec=0.6923 Rec=0.9000 Thr=0.094


In [30]:
# %% [markdown]
# ## Part 3 — Relative-test (MMD) features + Meta-classifier

# %%
import numpy as np, pandas as pd, json, joblib, gc
from pathlib import Path
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

MMD_DIR = MODELS_DIR / "mmd_v1"
MMD_DIR.mkdir(parents=True, exist_ok=True)

MMD_CFG = {
    "bank_size": 512,     # per class; reduce to 256 if RAM is tight
    "subset_for_sigma": 4000,  # how many points to use for median heuristic
    "seed": SEED
}
json.dump(MMD_CFG, open(MMD_DIR / "config.json", "w"))
rng = np.random.default_rng(MMD_CFG["seed"])


In [31]:
# %%
def rbf_kernel_dot(a, b, gamma):
    """
    Fast RBF via ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a b^T
    a: (n, d), b: (m, d) -> K: (n, m)
    """
    a2 = (a*a).sum(axis=1, keepdims=True)   # (n,1)
    b2 = (b*b).sum(axis=1, keepdims=True).T # (1,m)
    d2 = a2 + b2 - 2.0 * (a @ b.T)
    return np.exp(-gamma * np.clip(d2, 0.0, None))

def median_heuristic_sigma(X, n_subset=4000):
    n = len(X)
    idx = rng.choice(n, size=min(n_subset, n), replace=False)
    S = X[idx]
    # sample pairs for robustness
    jdx = rng.choice(len(S), size=min(2000, len(S)), replace=False)
    P = S[jdx]
    Q = S[rng.choice(len(S), size=len(jdx), replace=False)]
    d2 = ((P-Q)**2).sum(axis=1)
    sigma = np.sqrt(np.median(d2) / 2.0) + 1e-8
    return float(sigma)

# Build pooled train if not available
# (You already have X: (n, 768) from Part 1)
assert X.shape[1] == 768 and len(X) == len(y)

# Stratified sample banks
ai_idx = np.where(y==1)[0]
hu_idx = np.where(y==0)[0]
rng.shuffle(ai_idx); rng.shuffle(hu_idx)

K = min(MMD_CFG["bank_size"], len(ai_idx), len(hu_idx))
BANK_AI = X[ai_idx[:K]]
BANK_HU = X[hu_idx[:K]]

# Kernel bandwidth (median heuristic on full train or subset)
sigma = median_heuristic_sigma(X, n_subset=MMD_CFG["subset_for_sigma"])
gamma = 1.0 / (2.0 * sigma * sigma)

# Precompute class self-similarity means (excluding diagonal)
def offdiag_mean_selfsim(B, gamma):
    Kmat = rbf_kernel_dot(B, B, gamma)
    n = Kmat.shape[0]
    s = (Kmat.sum() - np.trace(Kmat)) / (n*(n-1))
    return float(s)

mAA = offdiag_mean_selfsim(BANK_AI, gamma)
mHH = offdiag_mean_selfsim(BANK_HU, gamma)

# Persist banks & params
np.save(MMD_DIR / "bank_ai.npy", BANK_AI)
np.save(MMD_DIR / "bank_hu.npy", BANK_HU)
json.dump({"sigma": sigma, "gamma": gamma, "mAA": mAA, "mHH": mHH, "K": K},
          open(MMD_DIR / "kernel.json", "w"))
print(f"Bank size per class: {K} | sigma={sigma:.6f} | gamma={gamma:.6f} | mAA={mAA:.6f} | mHH={mHH:.6f}")


Bank size per class: 512 | sigma=2.290693 | gamma=0.095288 | mAA=0.443174 | mHH=0.457128


In [32]:
# %%
def mmd_features_batch(Z, bank_ai, bank_hu, gamma, mAA, mHH):
    """
    Z: (n, 768) pooled vectors
    Returns DataFrame with:
      k_xA = mean RBF(x, A)
      k_xH = mean RBF(x, H)
      diff = k_xA - k_xH   (positive => closer to AI)
      glob = mAA - mHH     (same for all x; keeps the "relative test" flavor)
    """
    KA = rbf_kernel_dot(Z, bank_ai, gamma)   # (n, K)
    KH = rbf_kernel_dot(Z, bank_hu, gamma)   # (n, K)
    k_xA = KA.mean(axis=1)
    k_xH = KH.mean(axis=1)
    diff = k_xA - k_xH
    glob = np.full_like(diff, fill_value=(mAA - mHH))
    return pd.DataFrame({
        "k_xA": k_xA, "k_xH": k_xH, "diff_xA_xH": diff, "glob_mAA_mHH": glob
    })


In [33]:
# %%
# You already have: oof_pred (baseline OOF probabilities), X (pooled train), y
feat_train = mmd_features_batch(X, BANK_AI, BANK_HU, gamma, mAA, mHH)
feat_train["p_base"] = oof_pred  # from Part 1 CV

cols = ["p_base", "k_xA", "k_xH", "diff_xA_xH", "glob_mAA_mHH"]
feat_train_arr = feat_train[cols].values

# Train meta model with honest OOF
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
oof_meta = np.zeros(len(y), dtype=np.float32)
thr_list, folds_meta = [], []

for fold, (tr, va) in enumerate(skf.split(feat_train_arr, y), 1):
    Xtr, Xva = feat_train_arr[tr], feat_train_arr[va]
    ytr, yva = y[tr], y[va]

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(
            penalty="l2", solver="lbfgs", max_iter=2000, C=1.0, random_state=SEED))
    ])
    meta = CalibratedClassifierCV(estimator=pipe, method="isotonic", cv=3)
    meta.fit(Xtr, ytr)
    pva = meta.predict_proba(Xva)[:,1]
    oof_meta[va] = pva

    # pick F1-optimal threshold on this fold
    best_t, best_f1 = 0.5, -1
    for t in np.linspace(0.01, 0.99, 199):
        f1 = f1_score(yva, (pva>=t).astype(int))
        if f1 > best_f1:
            best_f1, best_t = f1, t
    thr_list.append(float(best_t))
    auc = roc_auc_score(yva, pva)
    print(f"[META Fold {fold}] AUC={auc:.4f} F1@best={best_f1:.4f} thr={best_t:.3f}")

auc_oof = roc_auc_score(y, oof_meta)
t_star = float(np.median(thr_list))
pred_lbl = (oof_meta >= t_star).astype(int)
acc = accuracy_score(y, pred_lbl)
prec, rec, f1, _ = precision_recall_fscore_support(y, pred_lbl, average="binary")
print(f"\n[META OOF] AUC={auc_oof:.5f} F1={f1:.4f} Acc={acc:.4f} Prec={prec:.4f} Rec={rec:.4f} Thr={t_star:.3f}")


[META Fold 1] AUC=0.9680 F1@best=0.9079 thr=0.569
[META Fold 2] AUC=0.9654 F1@best=0.9033 thr=0.421
[META Fold 3] AUC=0.9648 F1@best=0.9058 thr=0.441
[META Fold 4] AUC=0.9637 F1@best=0.9034 thr=0.436
[META Fold 5] AUC=0.9681 F1@best=0.9060 thr=0.460

[META OOF] AUC=0.96526 F1=0.9051 Acc=0.9034 Prec=0.8897 Rec=0.9211 Thr=0.441


In [34]:
# %%
# Fit on ALL data
pipe_full = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l2", solver="lbfgs", max_iter=3000, C=1.0, random_state=SEED))
])
meta_full = CalibratedClassifierCV(estimator=pipe_full, method="isotonic", cv=5)
meta_full.fit(feat_train_arr, y)

# Decide final threshold (use OOF-median t_star; optionally nudge by -0.02 ~ -0.05)
t_chosen = max(0.0, t_star - 0.02)

joblib.dump(meta_full, MMD_DIR / "meta_model.pkl")
feat_train.sample(3, random_state=SEED)[:0]  # no-op to anchor variable in notebook

json.dump({
    "cols": cols, "thr_oof": t_star, "thr_chosen": t_chosen,
    "uses": "p_base + MMD features (k_xA,k_xH,diff,glob)"
}, open(MMD_DIR / "meta_model.json", "w"))

print("Saved:", MMD_DIR / "meta_model.pkl", MMD_DIR / "meta_model.json")


Saved: models/mmd_v1/meta_model.pkl models/mmd_v1/meta_model.json


In [35]:
# %%
def features_from_jsonl_for_meta(val_jsonl_path):
    items = load_jsonl_features(val_jsonl_path)
    # baseline prob on pooled features
    dfv = to_paragraph_matrix_from_jsonl_items(items)
    Xv = np.stack(dfv["vector"].values)

    # baseline calibrated prob (from your Part 1 final_model)
    p_base = final_model.predict_proba(Xv)[:,1]

    # MMD features
    feat = mmd_features_batch(Xv, BANK_AI, BANK_HU, gamma, mAA, mHH)
    feat["p_base"] = p_base
    return np.array([int(o.get("label",0)) for o in read_jsonl(val_jsonl_path)]), feat, [obj["id"] for obj in items]

def evaluate_meta_on_validation(val_jsonl_path, meta_model, thr):
    if not Path(val_jsonl_path).exists():
        print("No validation file.")
        return None
    yv, feat_df, _ = features_from_jsonl_for_meta(val_jsonl_path)
    Xf = feat_df[cols].values
    pv = meta_model.predict_proba(Xf)[:,1]
    auc = roc_auc_score(yv, pv)
    yhat = (pv >= thr).astype(int)
    acc = accuracy_score(yv, yhat)
    prec, rec, f1, _ = precision_recall_fscore_support(yv, yhat, average="binary", zero_division=0)
    print(f"[META Validation] AUC={auc:.4f} F1={f1:.4f} Acc={acc:.4f} Prec={prec:.4f} Rec={rec:.4f} Thr={thr:.3f}")
    return {"AUC":auc, "F1":f1, "Acc":acc, "Prec":prec, "Rec":rec}

_ = evaluate_meta_on_validation(VAL_JSONL, meta_full, t_chosen)


[META Validation] AUC=0.9600 F1=0.6667 Acc=0.7500 Prec=1.0000 Rec=0.5000 Thr=0.421


In [36]:
# %%
def meta_test_and_submit(test_jsonl_path, meta_model, thr, out_csv):
    items = load_jsonl_features(test_jsonl_path)
    # pooled vectors
    dft = to_paragraph_matrix_from_jsonl_items(items)
    Xt = np.stack(dft["vector"].values)

    # baseline probs
    p_base = final_model.predict_proba(Xt)[:,1]

    # MMD features
    feat = mmd_features_batch(Xt, BANK_AI, BANK_HU, gamma, mAA, mHH)
    feat["p_base"] = p_base
    Xf = feat[cols].values

    pt = meta_model.predict_proba(Xf)[:,1]
    yhat = (pt >= thr).astype(int)

    sub = pd.DataFrame({"id": dft["id"], "label": yhat})
    sub.to_csv(out_csv, index=False)
    print("Wrote submission:", out_csv)
    return sub

SUBMISSION_PATH_MMD = Path("./submission_mmd.csv")
_ = meta_test_and_submit(TEST_JSONL, meta_full, t_chosen, SUBMISSION_PATH_MMD)


Wrote submission: submission_mmd.csv


In [37]:
# === 3-way blend probability submission: id,y_prob ===
def blend3_prob_submission(test_jsonl_path, base_model, nn_adapter, meta_model, alpha, beta, out_csv_path):
    # alpha = weight for NN, beta = weight for MMD meta, (1-alpha-beta) for baseline
    items = load_jsonl_features(test_jsonl_path)

    # baseline probs
    dft = to_paragraph_matrix_from_jsonl_items(items)
    Xt = np.stack(dft["vector"].values)
    pb = base_model.predict_proba(Xt)[:, 1]

    # NN probs
    test_ds = ParagraphDataset(items, y=None, max_len=CFG["max_len"])
    test_loader = make_loader(test_ds, CFG["batch_size"], shuffle=False)
    pn = nn_adapter._predict_loader(test_loader)

    # meta probs
    feat = mmd_features_batch(Xt, BANK_AI, BANK_HU, gamma, mAA, mHH)
    feat["p_base"] = pb
    Xf = feat[["p_base","k_xA","k_xH","diff_xA_xH","glob_mAA_mHH"]].values
    pm = meta_model.predict_proba(Xf)[:, 1]

    y_prob = (1 - alpha - beta) * pb + alpha * pn + beta * pm
    sub = pd.DataFrame({"id": dft["id"], "y_prob": y_prob})
    sub.to_csv(out_csv_path, index=False)
    print(f"Wrote 3-way blend (alpha={alpha}, beta={beta}) prob submission:", out_csv_path)
    return sub

SUBMISSION_BLEND3_PROB = Path("./submission_blend3_prob.csv")
_ = blend3_prob_submission(TEST_JSONL, final_model, nn_adapter, meta_full,
                           alpha=0.35, beta=0.35, out_csv_path=SUBMISSION_BLEND3_PROB)


Wrote 3-way blend (alpha=0.35, beta=0.35) prob submission: submission_blend3_prob.csv


In [43]:
from itertools import product
from sklearn.metrics import roc_auc_score

# get validation probs from each model
items_val = load_jsonl_features(VAL_JSONL)
dfv_val = to_paragraph_matrix_from_jsonl_items(items_val)
Xv = np.stack(dfv_val["vector"].values)
yv = np.array([int(o.get("label", 0)) for o in read_jsonl(VAL_JSONL)], dtype=np.int64)

# baseline probs
pb = final_model.predict_proba(Xv)[:, 1]

# NN probs
val_ds = ParagraphDataset(items_val, y=None, max_len=CFG["max_len"])
val_loader = make_loader(val_ds, CFG["batch_size"], shuffle=False)
pn = nn_adapter._predict_loader(val_loader)

# MMD probs
featv = mmd_features_batch(Xv, BANK_AI, BANK_HU, gamma, mAA, mHH)
featv["p_base"] = pb
pm = meta_full.predict_proba(featv[["p_base","k_xA","k_xH","diff_xA_xH","glob_mAA_mHH"]].values)[:,1]

# grid search weights (alpha for NN, beta for MMD, 1-alpha-beta for baseline)
best = (-1, None)
for a, b in product(np.linspace(0,1,21), repeat=2):
    if a + b > 1: continue
    p = (1 - a - b)*pb + a*pn + b*pm
    auc = roc_auc_score(yv, p)
    if auc > best[0]:
        best = (auc, (a, b, 1 - a - b))
best_auc, (alpha, beta, gamma_w) = best
print("Best val AUC:", round(best_auc, 5), "| weights NN/ MMD/ Base =", alpha, beta, gamma_w)


Best val AUC: 0.99 | weights NN/ MMD/ Base = 0.0 0.0 1.0


In [44]:
def blend3_prob_submission(test_jsonl_path, base_model, nn_adapter, meta_model, alpha, beta, out_csv_path):
    items = load_jsonl_features(test_jsonl_path)
    dft = to_paragraph_matrix_from_jsonl_items(items)
    Xt = np.stack(dft["vector"].values)

    pb = base_model.predict_proba(Xt)[:, 1]
    test_ds = ParagraphDataset(items, y=None, max_len=CFG["max_len"])
    test_loader = make_loader(test_ds, CFG["batch_size"], shuffle=False)
    pn = nn_adapter._predict_loader(test_loader)

    feat = mmd_features_batch(Xt, BANK_AI, BANK_HU, gamma, mAA, mHH)
    feat["p_base"] = pb
    pm = meta_model.predict_proba(feat[["p_base","k_xA","k_xH","diff_xA_xH","glob_mAA_mHH"]].values)[:,1]

    y_prob = (1 - alpha - beta)*pb + alpha*pn + beta*pm
    pd.DataFrame({"id": dft["id"], "y_prob": y_prob}).to_csv(out_csv_path, index=False)
    print("Wrote blend:", out_csv_path)

SUBMISSION_BLEND3_PROB = Path("./submission_blend3_prob.csv")
blend3_prob_submission(TEST_JSONL, final_model, nn_adapter, meta_full, alpha=alpha, beta=beta, out_csv_path=SUBMISSION_BLEND3_PROB)


Wrote blend: submission_blend3_prob.csv
