In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.metrics import f1_score
import numpy as np

In [5]:
import pandas as pd
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# Load the CSV file
def load_data(file_path):
    df = pd.read_csv(file_path)
    texts = df["text"].tolist()
    labels = df[["anger", "fear", "joy", "sadness", "surprise"]].values
    return texts, labels

# Load train and test data
train_file = "Emotion_data/public_data_test/track_a/train/eng.csv"
test_file = "Emotion_data/public_data_test/track_a/dev/eng.csv"

train_texts, train_labels = load_data(train_file)
test_texts, test_labels = load_data(test_file)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the text data
def tokenize_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

train_encodings = tokenize_texts(train_texts, tokenizer)
test_encodings = tokenize_texts(test_texts, tokenizer)


In [2]:
# ** Dataset Class **
class SemEvalDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float),  # Use float for BCEWithLogitsLoss
        }


In [3]:
# ** Model Class **
class BertForMultiLabelClassification(torch.nn.Module):
    def __init__(self, num_labels=5):
        super(BertForMultiLabelClassification, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Extract [CLS] token embedding (first token of the sequence)
        cls_output = output.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)  # Shape: (batch_size, num_labels)
        return logits


In [6]:
# Combine tokenized inputs and labels into datasets
train_dataset = SemEvalDataset(
    train_encodings["input_ids"],
    train_encodings["attention_mask"],
    torch.tensor(train_labels, dtype=torch.float)
)

test_dataset = SemEvalDataset(
    test_encodings["input_ids"],
    test_encodings["attention_mask"],
    torch.tensor(test_labels, dtype=torch.float)
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [7]:
# ** Initialize Model, Optimizer, and Loss **
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForMultiLabelClassification(num_labels=5)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# ** Training Loop **
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
  "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
  "labels": torch.tensor(self.labels[idx], dtype=torch.float),  # Use float for BCEWithLogitsLoss


Epoch 1/5, Loss: 0.46758256292756584
Epoch 2/5, Loss: 0.308569424586489
Epoch 3/5, Loss: 0.21039052492793583
Epoch 4/5, Loss: 0.13859177591828253
Epoch 5/5, Loss: 0.09122690666100883


In [8]:
# ** Evaluation **
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)
        preds = torch.sigmoid(outputs) > 0.5  # Threshold for multi-label
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Calculate metrics
all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)

accuracy = (all_preds == all_labels).mean()
f1 = f1_score(all_labels, all_preds, average="micro")  # Use "macro" if needed

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

  "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
  "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
  "labels": torch.tensor(self.labels[idx], dtype=torch.float),  # Use float for BCEWithLogitsLoss


Accuracy: 0.8310
F1 Score: 0.7030


In [11]:
# ** Explainability using SHAP (Optional) **
import shap

# Create SHAP explainer for BERT model
explainer = shap.Explainer(model, tokenizer)
shap_values = explainer(test_texts[:10])  # Visualize SHAP for first 10 samples

shap.summary_plot(shap_values)

ImportError: Numba needs NumPy 2.1 or less. Got NumPy 2.2.