In [1]:
!nvidia-smi -L || echo "No GPU detected"
!python --version

import os, random, math, json, gc
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed);
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
seed_everything(42)


GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-13b75609-9ebf-dbff-2bb3-aee89348f97b)
Python 3.12.11
Device: cuda


In [9]:
from google.colab import drive
drive.mount('/content/drive')

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# EDIT THESE to where you placed your files on Drive:
DRIVE_BASE = Path("/content/drive/MyDrive")  # <--- change if needed

TRAIN_AI = DRIVE_BASE / "train_ai.npy"
TRAIN_HU = DRIVE_BASE / "train_human.npy"
VAL_JSONL = DRIVE_BASE / "validation.jsonl"
TEST_JSONL = DRIVE_BASE / "test_features.jsonl"

OUT_DIR = DRIVE_BASE / "colab_outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

for p in [TRAIN_AI, TRAIN_HU, VAL_JSONL, TEST_JSONL]:
    assert p.exists(), f"Missing: {p}"
print("All input files found")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All input files found


In [3]:
!pip install ujson



In [4]:
import ujson

# A token row is padding if all dims are 0
def compute_token_mask(x_2d: np.ndarray) -> np.ndarray:
    # x_2d: (T=100, D=768)
    return (np.abs(x_2d).sum(axis=1) > 0).astype(np.float32)

def load_train_arrays(ai_path, hu_path):
    ai = np.load(ai_path, allow_pickle=True).astype(np.float32)   # (Na,100,768)
    hu = np.load(hu_path, allow_pickle=True).astype(np.float32)   # (Nh,100,768)
    X = np.concatenate([ai, hu], axis=0)
    y = np.concatenate([np.ones(len(ai), dtype=np.int64),
                        np.zeros(len(hu), dtype=np.int64)], axis=0)
    return X, y

def read_grouped_jsonl(path, expect_label=False):
    """
    Returns:
      ids: list of paragraph ids (length = num paragraphs)
      mats: list of np.array of shape (S,100,768) per paragraph (S sentences)
      labels: np.array [num paragraphs] if expect_label
    """
    ids, mats, labels = [], [], []
    with open(path, "r") as f:
        for line in f:
            obj = ujson.loads(line)
            ids.append(obj["id"])
            feats = np.array(obj["features"], dtype=np.float32)  # (S,100,768)
            mats.append(feats)
            if expect_label: labels.append(int(obj["label"]))
    if expect_label:
        return np.array(ids), mats, np.array(labels)
    return np.array(ids), mats


In [5]:
class SentenceDataset(Dataset):
    """
    Training dataset: (N,100,768) sentences with labels.
    Optional simple augmentations: token dropout + light Gaussian noise.
    """
    def __init__(self, X, y, aug=True, token_drop_p=0.1, noise_std=0.01):
        self.X = X
        self.y = y
        self.aug = aug
        self.token_drop_p = token_drop_p
        self.noise_std = noise_std

    def __len__(self): return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]  # (100,768)
        y = self.y[idx]
        m = compute_token_mask(x)  # (100,)

        if self.aug:
            # Randomly drop some valid tokens (set to 0)
            if self.token_drop_p > 0:
                valid_idx = np.where(m > 0.5)[0]
                if len(valid_idx) > 0:
                    k = max(1, int(len(valid_idx) * self.token_drop_p))
                    drop = np.random.choice(valid_idx, size=k, replace=False)
                    x = x.copy()
                    x[drop] = 0.0
                    m = compute_token_mask(x)

            # Add tiny Gaussian noise to valid tokens
            if self.noise_std > 0:
                noise = np.random.normal(0, self.noise_std, size=x.shape).astype(np.float32)
                x = x + noise * m[:,None]

        return torch.from_numpy(x), torch.tensor(y, dtype=torch.long), torch.from_numpy(m)

class SentenceEvalDataset(Dataset):
    """
    For validation/test: flattened sentences with paragraph id mapping.
    """
    def __init__(self, mats, para_ids):
        # flatten sentences, keep mapping to paragraph id
        self.samples = []
        for pid, para in zip(para_ids, mats):
            for s in range(len(para)):
                self.samples.append((para[s], pid))
        self.n = len(self.samples)

    def __len__(self): return self.n

    def __getitem__(self, idx):
        x, pid = self.samples[idx]  # (100,768)
        m = compute_token_mask(x)
        return torch.from_numpy(x), torch.from_numpy(m), pid


In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_len: int = 100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        # x: (B, T, C)
        T = x.size(1)
        return x + self.pe[:, :T, :]

class TokenTransformer(nn.Module):
    def __init__(self, in_dim=768, model_dim=256, n_heads=8, ff_dim=512, n_layers=2, dropout=0.1):
        super().__init__()
        self.proj = nn.Linear(in_dim, model_dim)
        self.posenc = PositionalEncoding(model_dim, max_len=100)
        encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=n_heads,
                                                   dim_feedforward=ff_dim, dropout=dropout,
                                                   batch_first=True, activation="gelu", norm_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.pool_attn = nn.Sequential(
            nn.Linear(model_dim, model_dim),
            nn.Tanh(),
            nn.Linear(model_dim, 1)
        )
        self.head = nn.Sequential(
            nn.LayerNorm(model_dim*2),
            nn.Dropout(dropout),
            nn.Linear(model_dim*2, 1)
        )

    def forward(self, x, mask):
      # x: (B,T,768), mask: (B,T) where 1 = valid token
      x = self.proj(x)                          # (B,T,C)
      x = self.posenc(x)

      pad_mask = ~mask.bool()                   # True where padding
      h = self.encoder(x, src_key_padding_mask=pad_mask)  # (B,T,C)

      # masked mean/max pooling
      m = mask.unsqueeze(-1)                    # (B,T,1)
      h_masked = h * m
      denom = m.sum(dim=1).clamp(min=1.0)
      mean_pool = h_masked.sum(dim=1) / denom

      # use a fp16-safe negative value (fp16 min ≈ -65504)
      NEG_LARGE = -1e4
      max_pool = h.masked_fill(pad_mask.unsqueeze(-1), NEG_LARGE).max(dim=1).values

      # attention pooling (learned)
      attn_logits = self.pool_attn(h)           # (B,T,1)
      attn_logits = attn_logits.masked_fill(pad_mask.unsqueeze(-1), NEG_LARGE)
      attn = torch.softmax(attn_logits, dim=1)
      attn_pool = (attn * h).sum(dim=1)

      fused = torch.cat([0.5*(mean_pool+max_pool), attn_pool], dim=-1)  # (B,2C)
      logit = self.head(fused).squeeze(-1)         # (B,)
      return logit



In [7]:
def train_one_epoch(model, loader, optimizer, scaler, scheduler=None):
    model.train()
    total_loss = 0.0
    criterion = nn.BCEWithLogitsLoss()
    for xb, yb, mb in loader:
        xb = xb.to(device, non_blocking=True)
        yb = yb.float().to(device, non_blocking=True)
        mb = mb.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast('cuda', enabled=(device=="cuda")):
            logits = model(xb, mb)
            loss = criterion(logits, yb)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        if scheduler is not None:
            scheduler.step()
        total_loss += loss.item() * xb.size(0)
    return total_loss / len(loader.dataset)

@torch.no_grad()
def predict_logits(model, loader):
    model.eval()
    all_logits = []
    for batch in loader:
        # batch is either:
        #  - TRAIN/EVAL on fold: (x, y, mask)
        #  - PARAGRAPH eval loaders: (x, mask, pid)
        if len(batch) == 3:
            a, b, c = batch
            # mask is the one with shape (B, T)
            if hasattr(b, "ndim") and b.ndim == 2:
                xb, mb = a, b           # (x, mask, pid) case
            elif hasattr(c, "ndim") and c.ndim == 2:
                xb, mb = a, c           # (x, y, mask) case
            else:
                raise ValueError("Could not locate mask tensor in batch of length 3.")
        else:
            # fallback: (x, mask)
            xb, mb = batch

        xb = xb.to(device, non_blocking=True)
        mb = mb.to(device, non_blocking=True)

        with torch.amp.autocast('cuda', enabled=(device == "cuda")):
            logits = model(xb, mb)
        all_logits.append(logits.detach().float().cpu())
    return torch.cat(all_logits, dim=0).numpy()


def probs_from_logits(logits):
    return 1 / (1 + np.exp(-logits))


In [8]:
# Hyperparams (tweak as needed)
EPOCHS = 8            # 10–12 if you have time
BATCH_TRAIN = 64      # 64–96 on T4 is OK
BATCH_EVAL  = 128
LR = 2e-4
WD = 1e-4
MODEL_DIM = 256
LAYERS = 2
HEADS = 8
FF = 512
DROPOUT = 0.15

# Load data
X_raw, y = load_train_arrays(TRAIN_AI, TRAIN_HU)
print("Train tensors:", X_raw.shape, "Pos:", y.sum(), "Neg:", (y==0).sum())
val_ids, val_mats, val_labels = read_grouped_jsonl(VAL_JSONL, expect_label=True)
test_ids, test_mats = read_grouped_jsonl(TEST_JSONL, expect_label=False)
print("Val paragraphs:", len(val_ids), "Test paragraphs:", len(test_ids))

# Build flattened datasets for validation/test sentence-level inference
val_sent_ds  = SentenceEvalDataset(val_mats, val_ids)   # sentences with id mapping
test_sent_ds = SentenceEvalDataset(test_mats, test_ids)

val_loader_sent  = DataLoader(val_sent_ds,  batch_size=BATCH_EVAL, shuffle=False,
                              num_workers=2, pin_memory=True, collate_fn=None)
test_loader_sent = DataLoader(test_sent_ds, batch_size=BATCH_EVAL, shuffle=False,
                              num_workers=2, pin_memory=True, collate_fn=None)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_logits = np.zeros(len(X_raw), dtype=np.float32)
val_prob_blend = np.zeros(len(val_sent_ds), dtype=np.float32)
test_prob_blend = np.zeros(len(test_sent_ds), dtype=np.float32)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_raw, y), 1):
    print(f"\n===== Fold {fold} / 5 =====")
    train_ds = SentenceDataset(X_raw[tr_idx], y[tr_idx], aug=True, token_drop_p=0.1, noise_std=0.01)
    valid_ds = SentenceDataset(X_raw[va_idx], y[va_idx], aug=False)

    train_loader = DataLoader(train_ds, batch_size=BATCH_TRAIN, shuffle=True,
                              num_workers=2, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_ds, batch_size=BATCH_EVAL, shuffle=False,
                              num_workers=2, pin_memory=True)

    model = TokenTransformer(in_dim=768, model_dim=MODEL_DIM, n_heads=HEADS,
                             ff_dim=FF, n_layers=LAYERS, dropout=DROPOUT).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
    # OneCycle or cosine works; OneCycle is simple here:
    total_steps = EPOCHS * len(train_loader)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=LR, total_steps=total_steps, pct_start=0.1, anneal_strategy="cos"
    )
    scaler = torch.amp.GradScaler('cuda', enabled=(device=="cuda"))
    best_auc, best_state = -1, None
    patience, patience_ctr = 3, 0

    for epoch in range(1, EPOCHS+1):
        train_loss = train_one_epoch(model, train_loader, optimizer, scaler, scheduler)
        # eval on fold's validation split
        v_logits = predict_logits(model, valid_loader)
        v_probs  = probs_from_logits(v_logits)
        v_auc    = roc_auc_score(y[va_idx], v_probs)
        v_acc    = accuracy_score(y[va_idx], (v_probs>=0.5).astype(int))
        print(f"Epoch {epoch:02d} | loss {train_loss:.4f} | val AUC {v_auc:.4f} | ACC {v_acc:.4f}")

        if v_auc > best_auc + 1e-4:
            best_auc = v_auc
            best_state = model.state_dict()
            patience_ctr = 0
        else:
            patience_ctr += 1
            if patience_ctr >= patience:
                print("Early stopping.")
                break

    # load best fold weights
    if best_state is not None:
        model.load_state_dict(best_state)

    # save fold model
    fold_path = OUT_DIR / f"transformer_fold{fold}.pt"
    torch.save(model.state_dict(), fold_path)
    print("Saved:", fold_path)

    # OOF logits for train split
    v_logits = predict_logits(model, valid_loader)
    oof_logits[va_idx] = v_logits

    # Predict validation/test sentences for paragraph aggregation
    v_logits_sent = predict_logits(model, val_loader_sent)
    t_logits_sent = predict_logits(model, test_loader_sent)

    v_probs_sent = probs_from_logits(v_logits_sent)
    t_probs_sent = probs_from_logits(t_logits_sent)

    # Blend across folds by averaging probs
    val_prob_blend  += v_probs_sent / skf.n_splits
    test_prob_blend += t_probs_sent / skf.n_splits

# OOF metrics
oof_probs = probs_from_logits(oof_logits)
oof_auc = roc_auc_score(y, oof_probs)
oof_acc = accuracy_score(y, (oof_probs>=0.5).astype(int))
print(f"\nOOF AUC: {oof_auc:.4f} | OOF ACC@0.5: {oof_acc:.4f}")

# Aggregate sentence probs to paragraph probs (mean; you can try median/max too)
val_sent_df = pd.DataFrame({"id":[pid for _,_,pid in val_loader_sent.dataset], "prob":val_prob_blend})
test_sent_df= pd.DataFrame({"id":[pid for _,_,pid in test_loader_sent.dataset], "prob":test_prob_blend})

val_para = val_sent_df.groupby("id")["prob"].mean().reset_index()
test_para= test_sent_df.groupby("id")["prob"].mean().reset_index()

# Validation metrics at paragraph level
val_df = pd.DataFrame({"id": val_ids, "label": val_labels}).groupby("id")["label"].mean().round().astype(int).reset_index()
vv = val_para.merge(val_df, on="id", how="left")
val_auc = roc_auc_score(vv["label"].values, vv["prob"].values)
val_acc = accuracy_score(vv["label"].values, (vv["prob"].values>=0.5).astype(int))
print(f"Validation paragraph-level AUC: {val_auc:.4f} | ACC@0.5: {val_acc:.4f}")

# Kaggle submission (probabilities required)
sub = test_para.rename(columns={"prob":"y_prob"})
sub.to_csv(OUT_DIR / "submission.csv", index=False)
print("Saved submission ->", OUT_DIR / "submission.csv")
sub.head()


Train tensors: (16322, 100, 768) Pos: 8161 Neg: 8161
Val paragraphs: 20 Test paragraphs: 180

===== Fold 1 / 5 =====




Epoch 01 | loss 0.4956 | val AUC 0.9333 | ACC 0.8511
Epoch 02 | loss 0.3403 | val AUC 0.9537 | ACC 0.8646
Epoch 03 | loss 0.2954 | val AUC 0.9555 | ACC 0.8830
Epoch 04 | loss 0.2752 | val AUC 0.9643 | ACC 0.8900
Epoch 05 | loss 0.2633 | val AUC 0.9620 | ACC 0.8891
Epoch 06 | loss 0.2498 | val AUC 0.9661 | ACC 0.8974
Epoch 07 | loss 0.2418 | val AUC 0.9669 | ACC 0.9008
Epoch 08 | loss 0.2348 | val AUC 0.9667 | ACC 0.8998
Saved: /content/drive/MyDrive/colab_outputs/transformer_fold1.pt

===== Fold 2 / 5 =====




Epoch 01 | loss 0.5269 | val AUC 0.9246 | ACC 0.8371
Epoch 02 | loss 0.3386 | val AUC 0.9502 | ACC 0.8711
Epoch 03 | loss 0.2897 | val AUC 0.9546 | ACC 0.8784
Epoch 04 | loss 0.2667 | val AUC 0.9569 | ACC 0.8882
Epoch 05 | loss 0.2621 | val AUC 0.9611 | ACC 0.8897
Epoch 06 | loss 0.2448 | val AUC 0.9625 | ACC 0.8922
Epoch 07 | loss 0.2367 | val AUC 0.9628 | ACC 0.8946
Epoch 08 | loss 0.2323 | val AUC 0.9627 | ACC 0.8943
Saved: /content/drive/MyDrive/colab_outputs/transformer_fold2.pt

===== Fold 3 / 5 =====




Epoch 01 | loss 0.5078 | val AUC 0.9197 | ACC 0.8404
Epoch 02 | loss 0.3311 | val AUC 0.9532 | ACC 0.8778
Epoch 03 | loss 0.2844 | val AUC 0.9570 | ACC 0.8768
Epoch 04 | loss 0.2683 | val AUC 0.9592 | ACC 0.8928
Epoch 05 | loss 0.2572 | val AUC 0.9612 | ACC 0.8934
Epoch 06 | loss 0.2444 | val AUC 0.9618 | ACC 0.8937
Epoch 07 | loss 0.2337 | val AUC 0.9620 | ACC 0.8934
Epoch 08 | loss 0.2290 | val AUC 0.9625 | ACC 0.8931
Saved: /content/drive/MyDrive/colab_outputs/transformer_fold3.pt

===== Fold 4 / 5 =====




Epoch 01 | loss 0.5075 | val AUC 0.9316 | ACC 0.8116
Epoch 02 | loss 0.3323 | val AUC 0.9491 | ACC 0.8750
Epoch 03 | loss 0.2839 | val AUC 0.9582 | ACC 0.8903
Epoch 04 | loss 0.2756 | val AUC 0.9587 | ACC 0.8857
Epoch 05 | loss 0.2539 | val AUC 0.9599 | ACC 0.8885
Epoch 06 | loss 0.2458 | val AUC 0.9615 | ACC 0.8955
Epoch 07 | loss 0.2352 | val AUC 0.9615 | ACC 0.8925
Epoch 08 | loss 0.2312 | val AUC 0.9618 | ACC 0.8937
Saved: /content/drive/MyDrive/colab_outputs/transformer_fold4.pt

===== Fold 5 / 5 =====




Epoch 01 | loss 0.5189 | val AUC 0.9244 | ACC 0.8002
Epoch 02 | loss 0.3346 | val AUC 0.9569 | ACC 0.8719
Epoch 03 | loss 0.2922 | val AUC 0.9604 | ACC 0.8961
Epoch 04 | loss 0.2775 | val AUC 0.9576 | ACC 0.8869
Epoch 05 | loss 0.2602 | val AUC 0.9641 | ACC 0.8989
Epoch 06 | loss 0.2442 | val AUC 0.9637 | ACC 0.8971
Epoch 07 | loss 0.2381 | val AUC 0.9651 | ACC 0.9007
Epoch 08 | loss 0.2332 | val AUC 0.9654 | ACC 0.9026
Saved: /content/drive/MyDrive/colab_outputs/transformer_fold5.pt

OOF AUC: 0.9634 | OOF ACC@0.5: 0.8967
Validation paragraph-level AUC: 0.9200 | ACC@0.5: 0.8000
Saved submission -> /content/drive/MyDrive/colab_outputs/submission.csv


Unnamed: 0,id,y_prob
0,15,0.071721
1,16,0.162859
2,17,0.056796
3,18,0.439647
4,19,0.189955
