In [1]:
# One-line installs (quiet)
!pip install -q transformers datasets scikit-learn accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m118.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os, math, json, random, glob
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Any, Optional

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)

In [3]:
# Config (tune safely)
# ===========================
SEED              = 42
MODEL_NAME        = "distilbert-base-uncased"   # small & strong baseline
LABELS            = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
MAX_LENGTH        = 128                         # 128 or 256; 128 is lighter
TRAIN_BATCH_SIZE  = 16                          # per device batch
VAL_BATCH_SIZE    = 32
GRAD_ACC_STEPS    = 1                           # increase if you lower batch size
EPOCHS            = 5
LR                = 2e-5
WARMUP_RATIO      = 0.05
WEIGHT_DECAY      = 0.01
SAMPLE_TRAIN      = 30000                       # take subset for speed; set None for full
SAMPLE_VAL        = 3000
CHECKPOINT_DIR    = "/content/checkpoints_distilbert"
BEST_DIR          = "/content/best_model"
RESUME_TRAINING   = True                        # set True to auto-resume if checkpoint exists
DATA_PATH         = "/content/train.csv"                 # upload train.csv from Kaggle Jigsaw

In [4]:
# Reproducibility
# ===========================
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(BEST_DIR, exist_ok=True)

Device: cuda


In [5]:
# Load Data (upload train.csv first)
# ===========================
assert os.path.exists(DATA_PATH), "train.csv not found. Upload it or adjust DATA_PATH."
df = pd.read_csv(DATA_PATH)

# Ensure columns exist
missing = [c for c in ["comment_text"] + LABELS if c not in df.columns]
assert not missing, f"Missing columns in CSV: {missing}"

# Simple subsampling for speed
def stratified_sample(frame, n_total, seed=SEED):
    if n_total is None or n_total >= len(frame):
        return frame
    tmp = frame.copy()
    tmp["_any"] = (tmp[LABELS].sum(axis=1) > 0).astype(int)
    n_pos = min(n_total // 2, tmp["_any"].sum())
    n_neg = n_total - n_pos
    pos = tmp[tmp["_any"]==1].sample(n_pos, random_state=seed)
    neg = tmp[tmp["_any"]==0].sample(n_neg, random_state=seed)
    out = pd.concat([pos, neg]).sample(frac=1, random_state=seed).drop(columns=["_any"])
    return out

df = df.dropna(subset=["comment_text"])
full_train, full_val = train_test_split(
    df, test_size=0.1, random_state=SEED,
    stratify=(df[LABELS].sum(axis=1) > 0).astype(int)
)

train_df = stratified_sample(full_train, SAMPLE_TRAIN, seed=SEED)
val_df   = stratified_sample(full_val,   SAMPLE_VAL,   seed=SEED)

print(f"Train size: {len(train_df):,} | Val size: {len(val_df):,}")


Train size: 30,000 | Val size: 3,000


In [6]:
# Hugging Face Datasets
# ===========================
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df,   preserve_index=False)
raw_ds   = DatasetDict({"train": train_ds, "validation": val_ds})

In [7]:
# Tokenizer
# ===========================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

def tokenize_batch(batch):
    toks = tokenizer(
        batch["comment_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    labels = np.stack([batch[l] for l in LABELS], axis=1).astype(np.float32)
    toks["labels"] = labels
    return toks

tokenized_ds = raw_ds.map(
    tokenize_batch, batched=True, batch_size=1000,
    remove_columns=[c for c in train_df.columns if c not in LABELS + ["comment_text"]]
)
tokenized_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

train_loader = DataLoader(tokenized_ds["train"], batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(tokenized_ds["validation"], batch_size=VAL_BATCH_SIZE, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [8]:
# Model
# ===========================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    problem_type="multi_label_classification",
)
# For DistilBERT, this is enough; it will output logits of shape [B,6]
model.to(DEVICE)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [9]:
# Class Weights (pos_weight) for BCEWithLogits
#   pos_weight_j = (N - P_j) / P_j  (boost rare classes)
# ===========================
y = np.vstack([train_df[l].values for l in LABELS]).T  # shape [N,6]
P = y.sum(axis=0)                                      # positives per class
N = len(train_df)
pos_weight = torch.tensor((N - P) / np.clip(P, 1, None), dtype=torch.float32).to(DEVICE)
print("pos_weight:", pos_weight.cpu().numpy())

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

pos_weight: [ 1.1788075 19.661158   2.9354584 67.80734    3.2259474 22.56638  ]


In [10]:
# Optimizer & Scheduler
# ===========================
trainable = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable, lr=LR, weight_decay=WEIGHT_DECAY)

num_update_steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACC_STEPS)
num_training_steps = EPOCHS * num_update_steps_per_epoch
num_warmup_steps = int(WARMUP_RATIO * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))


  scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))


In [11]:
# Checkpoint Helpers
# ===========================
def save_checkpoint(step:int, epoch:int, best_macro:float, path:str):
    os.makedirs(path, exist_ok=True)
    torch.save({
        "step": step,
        "epoch": epoch,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "scheduler_state": scheduler.state_dict(),
        "scaler_state": scaler.state_dict(),
        "best_macro": best_macro,
        "rng_state": {
            "python": random.getstate(),
            "numpy": np.random.get_state(),
            "torch": torch.get_rng_state().cpu().numpy().tolist(),
        },
    }, os.path.join(path, "checkpoint.pt"))
    # Save tokenizer/config for convenience
    tokenizer.save_pretrained(path)
    print(f"✅ Saved checkpoint @ {path}")

def load_latest_checkpoint():
    ckpts = sorted(glob.glob(os.path.join(CHECKPOINT_DIR, "step-*")))
    if not ckpts:
        return None
    latest = ckpts[-1]
    bundle = torch.load(os.path.join(latest, "checkpoint.pt"), map_location="cpu")
    model.load_state_dict(bundle["model_state"])
    optimizer.load_state_dict(bundle["optimizer_state"])
    scheduler.load_state_dict(bundle["scheduler_state"])
    scaler.load_state_dict(bundle["scaler_state"])
    best_macro = bundle.get("best_macro", 0.0)
    step = bundle.get("step", 0)
    epoch = bundle.get("epoch", 0)
    print(f"🔁 Resumed from {latest} (epoch={epoch}, step={step}, best_macro={best_macro:.4f})")
    return {"dir": latest, "step": step, "epoch": epoch, "best_macro": best_macro}

def save_best_model(best_macro: float):
    os.makedirs(BEST_DIR, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(BEST_DIR, "pytorch_model.bin"))
    model.config.save_pretrained(BEST_DIR)
    tokenizer.save_pretrained(BEST_DIR)
    with open(os.path.join(BEST_DIR, "metrics.json"), "w") as f:
        json.dump({"best_macro_f1": best_macro}, f)
    print(f"🏆 Saved BEST model @ {BEST_DIR} (macro F1={best_macro:.4f})")


In [12]:
# Evaluation
# ===========================
@torch.no_grad()
def evaluate(loader, threshold=0.5):
    model.eval()
    all_true, all_prob = [], []
    for batch in loader:
        input_ids = batch["input_ids"].to(DEVICE, non_blocking=True)
        attention_mask = batch["attention_mask"].to(DEVICE, non_blocking=True)
        labels = batch["labels"].cpu().numpy()
        with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probs = torch.sigmoid(logits).cpu().numpy()
        all_true.append(labels)
        all_prob.append(probs)
    y_true = np.vstack(all_true)
    y_prob = np.vstack(all_prob)
    y_pred = (y_prob >= threshold).astype(int)
    macro = f1_score(y_true, y_pred, average="macro")
    micro = f1_score(y_true, y_pred, average="micro")
    return macro, micro, y_true, y_pred


In [13]:
# Training Loop (with resume + checkpoints)
# ===========================
start_epoch, global_step, best_macro = 0, 0, 0.0
if RESUME_TRAINING:
    state = load_latest_checkpoint()
    if state:
        # continue after the step/epoch recorded
        start_epoch = state["epoch"]
        global_step = state["step"]
        best_macro = state["best_macro"]

SAVE_EVERY_STEPS = 500   # checkpoint frequency

for epoch in range(start_epoch, EPOCHS):
    model.train()
    running = 0.0
    optimizer.zero_grad(set_to_none=True)

    for step, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(DEVICE, non_blocking=True)
        attention_mask = batch["attention_mask"].to(DEVICE, non_blocking=True)
        labels = batch["labels"].to(DEVICE, non_blocking=True)

        with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels) / GRAD_ACC_STEPS

        scaler.scale(loss).backward()
        if (step + 1) % GRAD_ACC_STEPS == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()
            global_step += 1

        running += loss.item() * GRAD_ACC_STEPS

        if global_step % 50 == 0:
            print(f"Epoch {epoch+1}/{EPOCHS} | Step {global_step} | Loss {running/50:.4f}")
            running = 0.0

        if global_step % SAVE_EVERY_STEPS == 0:
            ckpt_dir = os.path.join(CHECKPOINT_DIR, f"step-{global_step}")
            save_checkpoint(global_step, epoch, best_macro, ckpt_dir)

    # ---- Validation each epoch
    macro, micro, y_true, y_pred = evaluate(val_loader)
    print(f"Epoch {epoch+1} DONE | Val Macro-F1: {macro:.4f} | Val Micro-F1: {micro:.4f}")

    # Save best
    if macro > best_macro:
        best_macro = macro
        save_best_model(best_macro)

    # Epoch-level checkpoint
    ckpt_dir = os.path.join(CHECKPOINT_DIR, f"epoch-{epoch+1}-step-{global_step}")
    save_checkpoint(global_step, epoch+1, best_macro, ckpt_dir)

print("Training complete.")
print("Best model stored at:", BEST_DIR)

  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 1/5 | Step 50 | Loss 1.1414
Epoch 1/5 | Step 100 | Loss 1.0783
Epoch 1/5 | Step 150 | Loss 1.0809
Epoch 1/5 | Step 200 | Loss 0.9789
Epoch 1/5 | Step 250 | Loss 0.8952
Epoch 1/5 | Step 300 | Loss 0.7769
Epoch 1/5 | Step 350 | Loss 0.6761
Epoch 1/5 | Step 400 | Loss 0.6700
Epoch 1/5 | Step 450 | Loss 0.5674
Epoch 1/5 | Step 500 | Loss 0.5895
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-500
Epoch 1/5 | Step 550 | Loss 0.5441
Epoch 1/5 | Step 600 | Loss 0.4886
Epoch 1/5 | Step 650 | Loss 0.5494
Epoch 1/5 | Step 700 | Loss 0.5070
Epoch 1/5 | Step 750 | Loss 0.5182
Epoch 1/5 | Step 800 | Loss 0.5028
Epoch 1/5 | Step 850 | Loss 0.4467
Epoch 1/5 | Step 900 | Loss 0.5413
Epoch 1/5 | Step 950 | Loss 0.4650
Epoch 1/5 | Step 1000 | Loss 0.4734
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-1000
Epoch 1/5 | Step 1050 | Loss 0.4536
Epoch 1/5 | Step 1100 | Loss 0.4091
Epoch 1/5 | Step 1150 | Loss 0.3819
Epoch 1/5 | Step 1200 | Loss 0.4214
Epoch 1/5 | Step 1250 | Loss 0.

  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 1 DONE | Val Macro-F1: 0.5880 | Val Micro-F1: 0.7436
🏆 Saved BEST model @ /content/best_model (macro F1=0.5880)
✅ Saved checkpoint @ /content/checkpoints_distilbert/epoch-1-step-1875


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 2/5 | Step 1900 | Loss 0.1785
Epoch 2/5 | Step 1950 | Loss 0.3514
Epoch 2/5 | Step 2000 | Loss 0.3727
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-2000
Epoch 2/5 | Step 2050 | Loss 0.3070
Epoch 2/5 | Step 2100 | Loss 0.3329
Epoch 2/5 | Step 2150 | Loss 0.2950
Epoch 2/5 | Step 2200 | Loss 0.3233
Epoch 2/5 | Step 2250 | Loss 0.3211
Epoch 2/5 | Step 2300 | Loss 0.3215
Epoch 2/5 | Step 2350 | Loss 0.3260
Epoch 2/5 | Step 2400 | Loss 0.3225
Epoch 2/5 | Step 2450 | Loss 0.3075
Epoch 2/5 | Step 2500 | Loss 0.3569
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-2500
Epoch 2/5 | Step 2550 | Loss 0.3143
Epoch 2/5 | Step 2600 | Loss 0.3019
Epoch 2/5 | Step 2650 | Loss 0.3126
Epoch 2/5 | Step 2700 | Loss 0.3724
Epoch 2/5 | Step 2750 | Loss 0.3247
Epoch 2/5 | Step 2800 | Loss 0.3283
Epoch 2/5 | Step 2850 | Loss 0.3887
Epoch 2/5 | Step 2900 | Loss 0.3590
Epoch 2/5 | Step 2950 | Loss 0.3291
Epoch 2/5 | Step 3000 | Loss 0.3684
✅ Saved checkpoint @ /content/checkpoints_dist

  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 2 DONE | Val Macro-F1: 0.6003 | Val Micro-F1: 0.7596
🏆 Saved BEST model @ /content/best_model (macro F1=0.6003)
✅ Saved checkpoint @ /content/checkpoints_distilbert/epoch-2-step-3750


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 3/5 | Step 3800 | Loss 0.3207
Epoch 3/5 | Step 3850 | Loss 0.2934
Epoch 3/5 | Step 3900 | Loss 0.2847
Epoch 3/5 | Step 3950 | Loss 0.2711
Epoch 3/5 | Step 4000 | Loss 0.2956
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-4000
Epoch 3/5 | Step 4050 | Loss 0.2852
Epoch 3/5 | Step 4100 | Loss 0.3229
Epoch 3/5 | Step 4150 | Loss 0.2812
Epoch 3/5 | Step 4200 | Loss 0.2500
Epoch 3/5 | Step 4250 | Loss 0.2913
Epoch 3/5 | Step 4300 | Loss 0.2840
Epoch 3/5 | Step 4350 | Loss 0.2511
Epoch 3/5 | Step 4400 | Loss 0.3220
Epoch 3/5 | Step 4450 | Loss 0.2974
Epoch 3/5 | Step 4500 | Loss 0.2743
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-4500
Epoch 3/5 | Step 4550 | Loss 0.2485
Epoch 3/5 | Step 4600 | Loss 0.2630
Epoch 3/5 | Step 4650 | Loss 0.2824
Epoch 3/5 | Step 4700 | Loss 0.2491
Epoch 3/5 | Step 4750 | Loss 0.2508
Epoch 3/5 | Step 4800 | Loss 0.2412
Epoch 3/5 | Step 4850 | Loss 0.2849
Epoch 3/5 | Step 4900 | Loss 0.2591
Epoch 3/5 | Step 4950 | Loss 0.2717
Epoch 3/5 

  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 3 DONE | Val Macro-F1: 0.6306 | Val Micro-F1: 0.7789
🏆 Saved BEST model @ /content/best_model (macro F1=0.6306)
✅ Saved checkpoint @ /content/checkpoints_distilbert/epoch-3-step-5625


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 4/5 | Step 5650 | Loss 0.1228
Epoch 4/5 | Step 5700 | Loss 0.2458
Epoch 4/5 | Step 5750 | Loss 0.2461
Epoch 4/5 | Step 5800 | Loss 0.2145
Epoch 4/5 | Step 5850 | Loss 0.2608
Epoch 4/5 | Step 5900 | Loss 0.2147
Epoch 4/5 | Step 5950 | Loss 0.2729
Epoch 4/5 | Step 6000 | Loss 0.2333
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-6000
Epoch 4/5 | Step 6050 | Loss 0.2354
Epoch 4/5 | Step 6100 | Loss 0.2220
Epoch 4/5 | Step 6150 | Loss 0.2179
Epoch 4/5 | Step 6200 | Loss 0.2317
Epoch 4/5 | Step 6250 | Loss 0.2213
Epoch 4/5 | Step 6300 | Loss 0.2130
Epoch 4/5 | Step 6350 | Loss 0.2365
Epoch 4/5 | Step 6400 | Loss 0.2258
Epoch 4/5 | Step 6450 | Loss 0.2436
Epoch 4/5 | Step 6500 | Loss 0.2383
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-6500
Epoch 4/5 | Step 6550 | Loss 0.2654
Epoch 4/5 | Step 6600 | Loss 0.2356
Epoch 4/5 | Step 6650 | Loss 0.2374
Epoch 4/5 | Step 6700 | Loss 0.2525
Epoch 4/5 | Step 6750 | Loss 0.2302
Epoch 4/5 | Step 6800 | Loss 0.2349
Epoch 4/5 

  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 4 DONE | Val Macro-F1: 0.6404 | Val Micro-F1: 0.7860
🏆 Saved BEST model @ /content/best_model (macro F1=0.6404)
✅ Saved checkpoint @ /content/checkpoints_distilbert/epoch-4-step-7500


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 5/5 | Step 7550 | Loss 0.2033
Epoch 5/5 | Step 7600 | Loss 0.2028
Epoch 5/5 | Step 7650 | Loss 0.2044
Epoch 5/5 | Step 7700 | Loss 0.2130
Epoch 5/5 | Step 7750 | Loss 0.1831
Epoch 5/5 | Step 7800 | Loss 0.1845
Epoch 5/5 | Step 7850 | Loss 0.2037
Epoch 5/5 | Step 7900 | Loss 0.2262
Epoch 5/5 | Step 7950 | Loss 0.2125
Epoch 5/5 | Step 8000 | Loss 0.1974
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-8000
Epoch 5/5 | Step 8050 | Loss 0.1986
Epoch 5/5 | Step 8100 | Loss 0.2276
Epoch 5/5 | Step 8150 | Loss 0.1960
Epoch 5/5 | Step 8200 | Loss 0.2079
Epoch 5/5 | Step 8250 | Loss 0.2115
Epoch 5/5 | Step 8300 | Loss 0.2066
Epoch 5/5 | Step 8350 | Loss 0.2045
Epoch 5/5 | Step 8400 | Loss 0.1957
Epoch 5/5 | Step 8450 | Loss 0.2182
Epoch 5/5 | Step 8500 | Loss 0.2236
✅ Saved checkpoint @ /content/checkpoints_distilbert/step-8500
Epoch 5/5 | Step 8550 | Loss 0.1866
Epoch 5/5 | Step 8600 | Loss 0.2234
Epoch 5/5 | Step 8650 | Loss 0.1883
Epoch 5/5 | Step 8700 | Loss 0.1991
Epoch 5/5 

  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):


Epoch 5 DONE | Val Macro-F1: 0.6527 | Val Micro-F1: 0.7959
🏆 Saved BEST model @ /content/best_model (macro F1=0.6527)
✅ Saved checkpoint @ /content/checkpoints_distilbert/epoch-5-step-9375
Training complete.
Best model stored at: /content/best_model


Inference

In [14]:
# Inference helper
id2label = {i:l for i,l in enumerate(LABELS)}
def predict(texts: List[str], threshold: float = 0.5):
    model.eval()
    enc = tokenizer(
        texts, truncation=True, padding="max_length", max_length=MAX_LENGTH, return_tensors="pt"
    ).to(DEVICE)
    with torch.no_grad():
        logits = model(**enc).logits
        probs = torch.sigmoid(logits).cpu().numpy()
    outputs = []
    for p in probs:
        outputs.append({id2label[i]: float(p[i]) for i in range(len(LABELS))})
    return outputs

samples = [
    "I hate you. You're disgusting.",
    "Have a great day, thank you!",
    "Shut up or I will hurt you."
]
predict(samples)


[{'toxic': 0.9808000326156616,
  'severe_toxic': 0.005995008163154125,
  'obscene': 0.25278240442276,
  'threat': 0.01800243742763996,
  'insult': 0.8993880748748779,
  'identity_hate': 0.009773819707334042},
 {'toxic': 0.0032725019846111536,
  'severe_toxic': 0.0012575405417010188,
  'obscene': 0.0010751370573416352,
  'threat': 0.0049848114140331745,
  'insult': 0.0011288868263363838,
  'identity_hate': 0.0017610270297154784},
 {'toxic': 0.964800238609314,
  'severe_toxic': 0.1961364895105362,
  'obscene': 0.3233829140663147,
  'threat': 0.9933465719223022,
  'insult': 0.2797642648220062,
  'identity_hate': 0.05119376257061958}]

In [15]:
from transformers import AutoTokenizer

# Save final model
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.txt',
 './final_model/added_tokens.json',
 './final_model/tokenizer.json')