In [1]:
!pip install transformers datasets wandb --quiet

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from tqdm.auto import tqdm

import wandb


In [2]:
# Authenticate this session with Weights & Biases; when no key is cached the
# call prompts for an API key interactively (see the captured output below).
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m22f3000982[0m ([33m22f3000982-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Location of the training CSV on the notebook filesystem.
train_path = "/content/train (3).csv"

# Load the raw training table.
train_df = pd.read_csv(train_path)
# Names of the per-emotion 0/1 columns that serve as multi-label targets.
labels = ["anger", "fear", "joy", "sadness", "surprise"]

# Quick sanity check: preview the first rows and report the table shape.
print(train_df.head())
print("Train shape:", train_df.shape)


   id                                               text  anger  fear  joy  \
0   0  the dentist that did the work apparently did a...      1     0    0   
1   1  i'm gonna absolutely ~~suck~~ be terrible duri...      0     1    0   
2   2  bridge: so leave me drowning calling houston, ...      0     1    0   
3   3  after that mess i went to see my now ex-girlfr...      1     1    0   
4   4  as he stumbled i ran off, afraid it might some...      0     1    0   

   sadness  surprise                    emotions  
0        1         0         ['anger' 'sadness']  
1        1         0          ['fear' 'sadness']  
2        1         0          ['fear' 'sadness']  
3        1         0  ['anger' 'fear' 'sadness']  
4        0         0                    ['fear']  
Train shape: (6827, 8)


In [5]:
# Discard rows that have no text, then renumber the index from zero.
train_df = train_df.dropna(subset=["text"])
train_df = train_df.reset_index(drop=True)

# Shuffled 90/10 hold-out split with a fixed seed.
split_options = {"test_size": 0.1, "random_state": 42, "shuffle": True}
train_data, val_data = train_test_split(train_df, **split_options)

print("Train rows:", len(train_data))
print("Val rows:", len(val_data))

# Cast every emotion column to int in both splits.
for split_frame in (train_data, val_data):
    for label_name in labels:
        split_frame[label_name] = split_frame[label_name].astype(int)


Train rows: 6144
Val rows: 683


In [6]:
from datasets import Dataset
from transformers import AutoTokenizer

# Hugging Face checkpoint name; used here for the tokenizer and later for the model backbone.
MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Convert each split to a `datasets.Dataset`. The shuffled index left over from
# the split is replaced by a fresh 0..n-1 range before conversion.
train_dataset = Dataset.from_pandas(train_data.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_data.reset_index(drop=True))

def tokenize_function(example):
    """Tokenize the ``text`` field of an example (or batch of examples).

    The module-level ``tokenizer`` is called with truncation enabled and
    ``padding="max_length"`` at a maximum length of 256.

    Args:
        example: Mapping with a ``"text"`` entry, as passed by ``Dataset.map``.

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    texts = example["text"]
    return tokenizer(texts, **encode_options)

# Tokenize both splits in batched mode.
train_tokenized = train_dataset.map(tokenize_function, batched=True)
val_tokenized = val_dataset.map(tokenize_function, batched=True)

# Give every emotion column a "labels_" prefix in both splits.
prefixed_label_columns = [f"labels_{label_name}" for label_name in labels]
for label_name, prefixed_name in zip(labels, prefixed_label_columns):
    train_tokenized = train_tokenized.rename_column(label_name, prefixed_name)
for label_name, prefixed_name in zip(labels, prefixed_label_columns):
    val_tokenized = val_tokenized.rename_column(label_name, prefixed_name)

# Return only the model inputs and the label columns, as torch tensors.
torch_columns = ["input_ids", "attention_mask", *prefixed_label_columns]
train_tokenized.set_format("torch", columns=torch_columns)
val_tokenized.set_format("torch", columns=torch_columns)

print(train_tokenized[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/6144 [00:00<?, ? examples/s]

Map:   0%|          | 0/683 [00:00<?, ? examples/s]

{'labels_anger': tensor(0), 'labels_fear': tensor(0), 'labels_joy': tensor(1), 'labels_sadness': tensor(0), 'labels_surprise': tensor(0), 'input_ids': tensor([ 101, 2009, 3727, 2119, 2026, 2398, 2489, 2000, 2022, 2583, 2000, 2828,
        1012,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,

In [7]:
def collate_fn(batch):
    """Assemble a list of tokenized samples into one batch dictionary.

    Args:
        batch: Sequence of samples, each providing ``input_ids``,
            ``attention_mask`` and the five ``labels_<emotion>`` entries.

    Returns:
        Dict with stacked ``input_ids`` and ``attention_mask`` plus a float
        ``labels`` tensor with one row per sample, columns ordered
        anger, fear, joy, sadness, surprise.
    """
    label_keys = (
        "labels_anger",
        "labels_fear",
        "labels_joy",
        "labels_sadness",
        "labels_surprise",
    )
    return {
        "input_ids": torch.stack([sample["input_ids"] for sample in batch]),
        "attention_mask": torch.stack([sample["attention_mask"] for sample in batch]),
        "labels": torch.stack([
            torch.tensor([sample[key] for key in label_keys], dtype=torch.float)
            for sample in batch
        ]),
    }

from transformers import AutoModel

class DistilBertMultiLabel(nn.Module):
    """Pretrained transformer backbone with a dropout + linear multi-label head.

    Args:
        model_name: Checkpoint name or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output logits (one per label).
    """

    def __init__(self, model_name, num_labels=5):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.backbone.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``{"logits": ...}`` for a batch; no activation is applied.

        ``labels`` is accepted but not used by this method.
        """
        encoder_out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # The first position of the last hidden state is the [CLS] token.
        cls_embedding = encoder_out.last_hidden_state[:, 0]
        return {"logits": self.classifier(self.dropout(cls_embedding))}


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Seed the NumPy and PyTorch RNGs before anything stochastic is created or run
# (batch shuffling, dropout, classifier-head initialisation) so that a
# Restart & Run All gives comparable results. GPU kernels may still introduce
# small run-to-run differences.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Training hyperparameters.
batch_size = 16
num_epochs = 4
learning_rate = 2e-5
weight_decay = 0.01
warmup_ratio = 0.1          # fraction of total optimizer steps used for LR warm-up
grad_accum_steps = 1        # micro-batches accumulated per optimizer step
max_grad_norm = 1.0         # gradient-norm clipping threshold
early_stop_patience = 2     # epochs without macro-F1 improvement before stopping

# Only the training loader is shuffled; validation order is kept fixed.
train_loader = DataLoader(
    train_tokenized,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
val_loader = DataLoader(
    val_tokenized,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

from transformers import get_cosine_schedule_with_warmup
from torch.optim import AdamW

model = DistilBertMultiLabel(MODEL_NAME, num_labels=5).to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Cosine decay with linear warm-up, measured in optimizer steps.
num_training_steps = len(train_loader) * num_epochs // grad_accum_steps
num_warmup_steps = int(num_training_steps * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

# Per-label positive-class weights for the BCE loss, in the label order that
# collate_fn emits: anger, fear, joy, sadness, surprise.
pos_weight_values = [1.2, 0.9, 1.1, 1.0, 1.3]
pos_weights = torch.tensor(pos_weight_values, device=device)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

# Log every hyperparameter that influences training, so that a run can be
# reproduced from its recorded config alone.
run = wandb.init(
    project="2025-sep-dl-genai-project",
    name="model3_distilbert_multilabel",
    config={
        "model": MODEL_NAME,
        "epochs": num_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "warmup_ratio": warmup_ratio,
        "grad_accum_steps": grad_accum_steps,
        "max_grad_norm": max_grad_norm,
        "early_stop_patience": early_stop_patience,
        "pos_weight": pos_weight_values,
        "num_training_steps": num_training_steps,
        "num_warmup_steps": num_warmup_steps,
    }
)


Device: cuda


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [9]:
def evaluate(model, dataloader, threshold=0.5):
    """Run the model over ``dataloader`` and compute loss and multi-label metrics.

    The model is switched to eval mode and gradients are disabled. Logits are
    passed through a sigmoid and thresholded to obtain binary predictions.
    The module-level ``loss_fn`` and ``device`` are used.

    Args:
        model: Callable returning a dict with a ``"logits"`` entry.
        dataloader: Sized iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        threshold: Probability at or above which a label is predicted positive.

    Returns:
        Dict with ``loss`` (mean over batches), ``f1_macro``, ``f1_micro``,
        ``accuracy`` and ``roc_auc``. ``roc_auc`` falls back to 0.0 when the
        score is undefined for the given labels.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    prob_batches = []
    label_batches = []
    total_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs["logits"]
            total_loss += loss_fn(logits, labels).item()

            prob_batches.append(torch.sigmoid(logits).cpu().numpy())
            label_batches.append(labels.cpu().numpy())

    if not prob_batches:
        raise ValueError("evaluate() received an empty dataloader")

    all_probs = np.vstack(prob_batches)
    all_labels = np.vstack(label_batches)
    preds = (all_probs >= threshold).astype(int)

    # zero_division=0 yields the same value as the default but without a
    # warning when a label has no positive predictions or targets.
    f1_macro = f1_score(all_labels, preds, average="macro", zero_division=0)
    f1_micro = f1_score(all_labels, preds, average="micro", zero_division=0)
    acc = accuracy_score(all_labels, preds)
    try:
        roc = roc_auc_score(all_labels, all_probs, average="macro")
    except ValueError:
        # Undefined score (e.g. a label column with a single class); any other
        # error is a real bug and is allowed to propagate.
        roc = 0.0

    return {
        "loss": total_loss / len(dataloader),
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "accuracy": acc,
        "roc_auc": roc
    }


In [10]:
# Early-stopping state: best validation macro-F1 so far and the number of
# consecutive epochs that failed to beat it.
best_f1 = 0.0
epochs_no_improve = 0
global_step = 0  # optimizer updates performed across all epochs

for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0.0
    num_batches = len(train_loader)
    pbar = tqdm(enumerate(train_loader), total=num_batches, desc=f"Epoch {epoch}/{num_epochs}")

    optimizer.zero_grad()
    for step, batch in pbar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named `batch_labels` so the module-level `labels` list of emotion
        # names is not overwritten by a tensor.
        batch_labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        # Scale so that accumulated gradients average over the micro-batches.
        loss = loss_fn(logits, batch_labels) / grad_accum_steps
        loss.backward()
        running_loss += loss.item() * grad_accum_steps

        # Step on every accumulation boundary, and also on the last batch so
        # gradients from a trailing partial group are applied instead of being
        # wiped by the next epoch's zero_grad().
        is_accum_boundary = (step + 1) % grad_accum_steps == 0
        is_last_batch = (step + 1) == num_batches
        if is_accum_boundary or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

        pbar.set_postfix({"loss": f"{running_loss / (step + 1):.4f}",
                          "lr": scheduler.get_last_lr()[0]})

    train_loss = running_loss / num_batches
    val_metrics = evaluate(model, val_loader, threshold=0.5)

    print(f"\nEpoch {epoch}/{num_epochs}")
    print(f"Train loss: {train_loss:.4f}")
    print(f"Val loss: {val_metrics['loss']:.4f}")
    print(f"Val F1-macro: {val_metrics['f1_macro']:.4f} | F1-micro: {val_metrics['f1_micro']:.4f} | Acc: {val_metrics['accuracy']:.4f}")

    wandb.log({
        "epoch": epoch,
        "train_loss": train_loss,
        "val_loss": val_metrics["loss"],
        "val_f1_macro": val_metrics["f1_macro"],
        "val_f1_micro": val_metrics["f1_micro"],
        "val_accuracy": val_metrics["accuracy"],
        "val_roc_auc": val_metrics["roc_auc"],
        "learning_rate": scheduler.get_last_lr()[0]
    })

    # Checkpoint whenever validation macro-F1 improves; otherwise count
    # towards early stopping.
    if val_metrics["f1_macro"] > best_f1:
        best_f1 = val_metrics["f1_macro"]
        epochs_no_improve = 0
        torch.save(model.state_dict(), "distilbert_best_model.bin")
        print(f"✅ New best model saved! F1-macro={best_f1:.4f}")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} epoch(s).")

    if epochs_no_improve >= early_stop_patience:
        print("🛑 Early stopping.")
        break

print("Training complete. Best F1-macro:", best_f1)
wandb.finish()


Epoch 1/4:   0%|          | 0/384 [00:00<?, ?it/s]


Epoch 1/4
Train loss: 0.4930
Val loss: 0.3638
Val F1-macro: 0.6757 | F1-micro: 0.7413 | Acc: 0.4627
✅ New best model saved! F1-macro=0.6757


Epoch 2/4:   0%|          | 0/384 [00:00<?, ?it/s]


Epoch 2/4
Train loss: 0.3118
Val loss: 0.2970
Val F1-macro: 0.7755 | F1-micro: 0.8073 | Acc: 0.5564
✅ New best model saved! F1-macro=0.7755


Epoch 3/4:   0%|          | 0/384 [00:00<?, ?it/s]


Epoch 3/4
Train loss: 0.2162
Val loss: 0.2787
Val F1-macro: 0.8000 | F1-micro: 0.8211 | Acc: 0.5886
✅ New best model saved! F1-macro=0.8000


Epoch 4/4:   0%|          | 0/384 [00:00<?, ?it/s]


Epoch 4/4
Train loss: 0.1704
Val loss: 0.2754
Val F1-macro: 0.8027 | F1-micro: 0.8268 | Acc: 0.6032
✅ New best model saved! F1-macro=0.8027
Training complete. Best F1-macro: 0.8026845761810005


0,1
epoch,▁▃▆█
learning_rate,█▅▂▁
train_loss,█▄▂▁
val_accuracy,▁▆▇█
val_f1_macro,▁▇██
val_f1_micro,▁▆██
val_loss,█▃▁▁
val_roc_auc,▁▆██

0,1
epoch,4.0
learning_rate,0.0
train_loss,0.17038
val_accuracy,0.60322
val_f1_macro,0.80268
val_f1_micro,0.82678
val_loss,0.27535
val_roc_auc,0.93805
