In [1]:
# Install required packages
# NOTE(review): versions are unpinned, so a fresh install may resolve to
# different releases than the ones this notebook was last run with; consider
# pinning them. torch, pandas and tqdm are imported further down but are not
# installed here, presumably because they already exist in the kernel
# environment; verify.
%pip install transformers datasets accelerate scikit-learn

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.3-cp313-cp313-win_amd64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4

## BERT Baseline for Ekman Emotion Classification
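This notebook fine-tunes a baseline `bert-base-uncased` model for multi-label emotion classification on the GoEmotions dataset, with labels mapped to the six Ekman emotions (anger, disgust, fear, joy, sadness, surprise) plus neutral.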

In [1]:
# Check whether CUDA is available and pick the compute device accordingly
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    # Fall back to CPU so `device` is always defined after this cell runs
    device = torch.device("cpu")
    print("CUDA not available; using CPU")

Using GPU: NVIDIA GeForce RTX 4080 SUPER


In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tqdm import tqdm

# Seed the global RNGs so the newly initialised classifier head and the
# shuffled training DataLoader are repeatable across Restart & Run All.
# (GPU kernels can still introduce small run-to-run differences.)
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


### Load and Prepare Data

In [3]:
# Load the dataset
# The CSV is expected to provide a 'text' column (read by EmotionDataset) and
# one numeric column per entry in LABELS.
df = pd.read_csv("data/goemotions_ekman.csv")

# Define emotion labels
# The order of this list fixes the order of the model's output units and of
# the per-class rows in the classification report.
LABELS = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
NUM_LABELS = len(LABELS)

# Check data distribution
# In the recorded output the per-label percentages add up to more than 100%,
# so a row can carry more than one label (multi-label data).
print(f"Dataset size: {len(df)}")
print("\nLabel distribution:")
for label in LABELS:
    print(f"  {label}: {df[label].sum()} ({df[label].mean()*100:.1f}%)")

# Split data
# 80/10/10: hold out 20% of the rows first, then halve that hold-out into
# validation and test. Both calls use a fixed random_state for repeatability.
# NOTE(review): rows are split independently at random; if the CSV contains
# several rows for the same text (e.g. one per annotator), identical texts can
# land in both train and test. Confirm, and consider a grouped split.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"\nTrain size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

Dataset size: 211225

Label distribution:
  anger: 30473 (14.4%)
  disgust: 5301 (2.5%)
  fear: 4515 (2.1%)
  joy: 82938 (39.3%)
  sadness: 19101 (9.0%)
  surprise: 29282 (13.9%)
  neutral: 55298 (26.2%)

Train size: 168980
Validation size: 21122
Test size: 21123


### Create Dataset Class and DataLoaders

In [4]:
class EmotionDataset(Dataset):
    """Map-style dataset yielding tokenized text with multi-hot emotion labels.

    Texts and label vectors are extracted from the DataFrame once, at
    construction time, so ``__getitem__`` does not pay for a pandas row lookup
    and dtype conversion on every sample. Changes made to ``self.data`` after
    construction are therefore not reflected in the returned samples.

    Args:
        dataframe: DataFrame with a ``'text'`` column and one numeric column
            per entry in ``labels``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.
        labels: Names of the label columns. Defaults to the module-level
            ``LABELS`` list when omitted.
    """

    def __init__(self, dataframe, tokenizer, max_length=128, labels=None):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = list(LABELS if labels is None else labels)

        # Hoisted out of __getitem__: stringify every text once and build the
        # whole (num_rows, num_labels) float32 label matrix up front.
        self._texts = [str(value) for value in self.data['text']]
        self._targets = torch.tensor(
            self.data[self.labels].to_numpy(dtype=float),
            dtype=torch.float
        )

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs flattened to 1-D) and ``'labels'`` (float32 vector with
            one entry per label column).
        """
        encoding = self.tokenizer(
            self._texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # clone() hands out an independent tensor, as the per-sample
            # torch.tensor(...) construction did before
            'labels': self._targets[idx].clone()
        }

# Tokenizer shared by all three splits (same checkpoint name as the model)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap each split in an EmotionDataset using the default max_length
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(split_df, tokenizer)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep row order
BATCH_SIZE = 16

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=BATCH_SIZE)
    for split_dataset in (val_dataset, test_dataset)
)

# Report how many batches each loader yields per pass
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

Training batches: 10562
Validation batches: 1321
Test batches: 1321


### Initialize Model

In [5]:
# Fine-tuning hyperparameters
EPOCHS = 3
LEARNING_RATE = 2e-5

# Pre-trained BERT encoder with a NUM_LABELS-way head configured for
# multi-label classification, moved onto the selected device
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
).to(device)

# AdamW over all parameters; the learning rate then follows the linear
# schedule (no warmup) across every optimisation step of the run, since the
# scheduler is stepped once per batch in train_epoch
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

print(f"Model loaded on {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on cuda


### Training and Evaluation Functions

In [6]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over ``dataloader``.

    For every batch: clear gradients, run the forward pass with labels so the
    model returns its loss, backpropagate, clip the gradient norm to 1.0, then
    step the optimizer and the scheduler.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    batch_keys = ('input_ids', 'attention_mask', 'labels')

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        # Move just the tensors the model consumes onto the target device
        inputs = {key: batch[key].to(device) for key in batch_keys}
        loss = model(**inputs).loss
        running_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, device, threshold=0.5):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Logits are passed through a sigmoid and a label counts as predicted when
    its probability is strictly greater than ``threshold``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        dataloader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to.
        threshold: Probability cut-off for a positive prediction.

    Returns:
        dict with ``'loss'`` (mean per-batch loss), ``'f1_micro'``,
        ``'f1_macro'``, and the stacked ``'predictions'`` and ``'labels'``
        arrays.
    """
    model.eval()
    pred_rows, label_rows = [], []
    loss_sum = 0
    batch_keys = ('input_ids', 'attention_mask', 'labels')

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            inputs = {key: batch[key].to(device) for key in batch_keys}
            outputs = model(**inputs)

            loss_sum += outputs.loss.item()

            # Independent per-label decision: sigmoid probability vs. cut-off
            binary = (torch.sigmoid(outputs.logits) > threshold).float()

            pred_rows.extend(binary.cpu().numpy())
            label_rows.extend(inputs['labels'].cpu().numpy())

    all_preds = np.array(pred_rows)
    all_labels = np.array(label_rows)

    # Micro pools all label decisions; macro averages the per-label F1 scores
    micro = f1_score(all_labels, all_preds, average='micro')
    macro = f1_score(all_labels, all_preds, average='macro')

    return {
        'loss': loss_sum / len(dataloader),
        'f1_micro': micro,
        'f1_macro': macro,
        'predictions': all_preds,
        'labels': all_labels
    }

### Train the Model

In [7]:
# Fine-tune for EPOCHS passes, checkpointing whenever validation macro-F1 improves
best_f1 = 0

for epoch in range(EPOCHS):
    # Epoch banner
    print(f"\n{'='*50}", f"Epoch {epoch + 1}/{EPOCHS}", f"{'='*50}", sep="\n")

    # One optimisation pass over the training split
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Training Loss: {train_loss:.4f}")

    # Score the current weights on the validation split
    val_results = evaluate(model, val_loader, device)
    print(f"Validation Loss: {val_results['loss']:.4f}")
    print(f"Validation F1 (micro): {val_results['f1_micro']:.4f}")
    print(f"Validation F1 (macro): {val_results['f1_macro']:.4f}")

    # Nothing to checkpoint unless macro-F1 strictly beats the best so far
    if not val_results['f1_macro'] > best_f1:
        continue

    best_f1 = val_results['f1_macro']
    torch.save(model.state_dict(), 'best_bert_model.pt')
    print("Saved best model!")

print(f"\nBest Validation F1 (macro): {best_f1:.4f}")


Epoch 1/3


Training: 100%|██████████| 10562/10562 [12:20<00:00, 14.27it/s]
Training: 100%|██████████| 10562/10562 [12:20<00:00, 14.27it/s]


Training Loss: 0.2797


Evaluating: 100%|██████████| 1321/1321 [00:39<00:00, 33.75it/s]



Validation Loss: 0.2684
Validation F1 (micro): 0.5419
Validation F1 (macro): 0.3977
Saved best model!

Epoch 2/3
Saved best model!

Epoch 2/3


Training: 100%|██████████| 10562/10562 [11:54<00:00, 14.78it/s]
Training: 100%|██████████| 10562/10562 [11:54<00:00, 14.78it/s]


Training Loss: 0.2554


Evaluating: 100%|██████████| 1321/1321 [00:39<00:00, 33.87it/s]



Validation Loss: 0.2718
Validation F1 (micro): 0.5649
Validation F1 (macro): 0.4386
Saved best model!

Epoch 3/3
Saved best model!

Epoch 3/3


Training: 100%|██████████| 10562/10562 [11:58<00:00, 14.70it/s]
Training: 100%|██████████| 10562/10562 [11:58<00:00, 14.70it/s]


Training Loss: 0.2374


Evaluating: 100%|██████████| 1321/1321 [00:37<00:00, 34.80it/s]



Validation Loss: 0.2785
Validation F1 (micro): 0.5682
Validation F1 (macro): 0.4564
Saved best model!

Best Validation F1 (macro): 0.4564
Saved best model!

Best Validation F1 (macro): 0.4564


### Evaluate on Test Set

In [8]:
# Load best model and evaluate on test set
# map_location remaps the saved tensors onto the active `device`, so a
# checkpoint written on a GPU still loads on a CPU-only kernel.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs.
model.load_state_dict(
    torch.load('best_bert_model.pt', map_location=device, weights_only=True)
)

test_results = evaluate(model, test_loader, device)

print("="*50)
print("TEST SET RESULTS")
print("="*50)
print(f"Test Loss: {test_results['loss']:.4f}")
print(f"Test F1 (micro): {test_results['f1_micro']:.4f}")
print(f"Test F1 (macro): {test_results['f1_macro']:.4f}")

# Detailed classification report
# zero_division=0 reports 0 for a class with no predicted or true samples.
print("\nDetailed Classification Report:")
print(classification_report(
    test_results['labels'], 
    test_results['predictions'], 
    target_names=LABELS,
    zero_division=0
))

Evaluating: 100%|██████████| 1321/1321 [00:40<00:00, 32.95it/s]



TEST SET RESULTS
Test Loss: 0.2779
Test F1 (micro): 0.5665
Test F1 (macro): 0.4603

Detailed Classification Report:
              precision    recall  f1-score   support

       anger       0.55      0.36      0.43      3084
     disgust       0.56      0.17      0.26       514
        fear       0.57      0.33      0.41       473
         joy       0.75      0.76      0.75      8232
     sadness       0.58      0.41      0.48      1895
    surprise       0.52      0.41      0.46      2894
     neutral       0.59      0.33      0.42      5610

   micro avg       0.65      0.50      0.57     22702
   macro avg       0.59      0.39      0.46     22702
weighted avg       0.63      0.50      0.55     22702
 samples avg       0.52      0.50      0.51     22702



### Test with Sample Predictions

In [9]:
def predict_emotions(text, model, tokenizer, threshold=0.5, max_length=128, labels=None):
    """Predict emotions for a single text.

    The inputs are moved to whatever device the model's parameters live on, so
    the function does not depend on a notebook-global ``device`` variable.

    Args:
        text: Input string to classify.
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``. It is switched to eval mode.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``.
        threshold: A label is predicted when its sigmoid probability is
            strictly greater than this value.
        max_length: Value forwarded to the tokenizer as ``max_length``.
        labels: Label names, in the order of the model's output units.
            Defaults to the module-level ``LABELS`` list when omitted.

    Returns:
        Tuple ``(results, predicted)``: ``results`` maps each label name to
        its probability (a NumPy scalar) and ``predicted`` lists the labels
        whose probability exceeds ``threshold``. ``predicted`` may be empty.
    """
    label_names = LABELS if labels is None else labels
    model.eval()
    # Follow the model rather than a global so CPU and GPU models both work
    target_device = next(model.parameters()).device

    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'].to(target_device),
            attention_mask=encoding['attention_mask'].to(target_device)
        )
        # Batch of one: keep the single row of per-label probabilities
        probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]

    results = {label: prob for label, prob in zip(label_names, probs)}
    predicted = [label for label, prob in results.items() if prob > threshold]

    return results, predicted

# Hand-written probes, one sentence aimed at each non-neutral emotion
sample_texts = [
    "I'm so happy today! Everything is going great!",
    "This makes me so angry, I can't believe it!",
    "I'm really scared about what might happen.",
    "That's disgusting, I hate it.",
    "I feel so sad and lonely.",
    "Wow, I didn't expect that at all!"
]

print("Sample Predictions:")
print("=" * 60)
for text in sample_texts:
    probs, predicted = predict_emotions(text, model, tokenizer)
    print(f"\nText: {text}")
    # When no label clears the threshold, the printout shows 'neutral' instead
    print(f"Predicted: {predicted or ['neutral']}")
    print(f"Probabilities: {', '.join(f'{k}:{v:.2f}' for k, v in probs.items())}")

Sample Predictions:

Text: I'm so happy today! Everything is going great!
Predicted: ['joy']
Probabilities: anger:0.00, disgust:0.00, fear:0.00, joy:0.99, sadness:0.00, surprise:0.01, neutral:0.01

Text: This makes me so angry, I can't believe it!
Predicted: ['anger']
Probabilities: anger:0.84, disgust:0.03, fear:0.01, joy:0.03, sadness:0.02, surprise:0.25, neutral:0.03

Text: I'm really scared about what might happen.
Predicted: ['fear']
Probabilities: anger:0.02, disgust:0.02, fear:0.92, joy:0.09, sadness:0.04, surprise:0.08, neutral:0.03

Text: That's disgusting, I hate it.
Predicted: ['disgust']
Probabilities: anger:0.35, disgust:0.90, fear:0.04, joy:0.03, sadness:0.10, surprise:0.03, neutral:0.02

Text: I feel so sad and lonely.
Predicted: ['sadness']
Probabilities: anger:0.03, disgust:0.02, fear:0.05, joy:0.05, sadness:0.95, surprise:0.02, neutral:0.03

Text: Wow, I didn't expect that at all!
Predicted: ['surprise']
Probabilities: anger:0.06, disgust:0.00, fear:0.00, joy:0.29, sa