In [1]:
# Install required packages
# NOTE(review): versions are unpinned, so a fresh environment may resolve to
# different releases than the ones that produced the outputs shown below.
# Consider pinning once a known-good combination is identified.
%pip install transformers datasets accelerate scikit-learn

Note: you may need to restart the kernel to use updated packages.


## BERT Baseline for GoEmotions Classification (27 Emotions + Neutral)
This notebook trains a baseline BERT model on the GoEmotions dataset using all 27 original emotion labels plus `neutral`, for 28 output labels in total.

In [2]:
# Check whether CUDA is available and report which device will be used
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    # Fall back to CPU so that `device` is always defined after this cell,
    # instead of silently leaving it unset on machines without a GPU.
    device = torch.device("cpu")
    print("No GPU detected; using CPU")

Using GPU: NVIDIA GeForce RTX 4080 SUPER


In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tqdm import tqdm

# Fix the RNG seeds so that shuffling and the randomly initialised classifier
# head are repeatable across "Restart Kernel -> Run All" executions.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### Load and Prepare Data

In [4]:
from datasets import load_dataset

# Fetch the "simplified" configuration of GoEmotions from the Hugging Face Hub
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")

# The 27 emotion names followed by 'neutral' (28 entries in total).
# The position of a name in this list is the integer id that convert_to_df
# matches against each example's `labels` field.
LABELS = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]
NUM_LABELS = len(LABELS)

# Convert dataset to DataFrames
def convert_to_df(split):
    """Build a multi-hot encoded DataFrame for one split of `dataset`.

    Args:
        split: Key of the split inside the module-level `dataset`
            (e.g. 'train', 'validation', 'test').

    Returns:
        A DataFrame with a 'text' column followed by one 0/1 column per
        entry of `LABELS`; column k is 1 when k appears in the example's
        'labels' list.
    """
    rows = []
    # Iterate the split once instead of indexing `data[i]` twice per example:
    # each positional lookup materialises the whole row again.
    for example in dataset[split]:
        # Set membership keeps the multi-hot construction linear in NUM_LABELS.
        active_ids = set(example['labels'])
        label_vec = [1 if j in active_ids else 0 for j in range(NUM_LABELS)]
        rows.append([example['text']] + label_vec)
    return pd.DataFrame(rows, columns=['text'] + LABELS)

# Materialise the three splits as multi-hot DataFrames
train_df, val_df, test_df = (
    convert_to_df(split_name) for split_name in ('train', 'validation', 'test')
)

print(f"Dataset sizes - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
print(f"\nNumber of labels: {NUM_LABELS}")
print(f"Labels: {LABELS}")

# Per-label frequency in the training split (absolute count and share of rows)
print("\nLabel distribution in training set:")
n_train_rows = len(train_df)
for label in LABELS:
    count = train_df[label].sum()
    pct = count / n_train_rows * 100
    print(f"  {label}: {count} ({pct:.1f}%)")

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Dataset sizes - Train: 43410, Val: 5426, Test: 5427

Number of labels: 28
Labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

Label distribution in training set:
  admiration: 4130 (9.5%)
  amusement: 2328 (5.4%)
  anger: 1567 (3.6%)
  annoyance: 2470 (5.7%)
  approval: 2939 (6.8%)
  caring: 1087 (2.5%)
  confusion: 1368 (3.2%)
  curiosity: 2191 (5.0%)
  desire: 641 (1.5%)
  disappointment: 1269 (2.9%)
  disapproval: 2022 (4.7%)
  disgust: 793 (1.8%)
  embarrassment: 303 (0.7%)
  excitement: 853 (2.0%)
  fear: 596 (1.4%)
  gratitude: 2662 (6.1%)
  grief: 77 (0.2%)
  joy: 1452 (3.3%)
  love: 2086 (4.8%)
  nervousness: 164 (0.4%)
  optimism: 1581 (3.6%)
  pride: 111 (0.3%)
  realization: 1110 (2.6%)
  

### Create Dataset Class and DataLoaders

In [5]:
class EmotionDataset(Dataset):
    """Torch dataset that pairs tokenized text with a multi-hot label vector.

    Args:
        dataframe: Frame with a 'text' column and one 0/1 column per label.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` that returns a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_length: Padding/truncation length passed to the tokenizer.
        label_columns: Optional list of label column names. Defaults to the
            module-level `LABELS`, which preserves the original behaviour.
    """

    def __init__(self, dataframe, tokenizer, max_length=128, label_columns=None):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = LABELS if label_columns is None else list(label_columns)

        # Hoist the pandas work out of __getitem__: per-item `.loc` row
        # extraction plus dtype conversion is slow and is repeated for every
        # sample of every epoch. Doing it once here yields the same values.
        self._texts = [str(value) for value in self.data['text']]
        self._label_matrix = torch.tensor(
            self.data[self.labels].to_numpy(dtype=float),
            dtype=torch.float
        )

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' for row `idx`."""
        encoding = self.tokenizer(
            self._texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; drop it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Clone so in-place edits by a caller cannot corrupt the cached matrix.
            'labels': self._label_matrix[idx].clone()
        }

# Tokenizer matching the pre-trained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# One EmotionDataset per split, all sharing the same tokenizer
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(split_frame, tokenizer)
    for split_frame in (train_df, val_df, test_df)
)

# Batch the datasets; only the training loader shuffles
BATCH_SIZE = 16

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE)
    for eval_dataset in (val_dataset, test_dataset)
)

print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

Training batches: 2714
Validation batches: 340
Test batches: 340


### Initialize Model

In [6]:
# Training hyperparameters
EPOCHS = 3
LEARNING_RATE = 2e-5

# Pre-trained BERT with a multi-label classification head, moved to `device`
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
).to(device)

# AdamW over all parameters, with a linear schedule (no warmup) that spans
# every optimizer step of the run
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

print(f"Model loaded on {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on cuda


### Training and Evaluation Functions

In [7]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over `dataloader`.

    Each batch is moved to `device`, the loss reported by the model is
    back-propagated, gradients are clipped to a max norm of 1.0, and both the
    optimizer and the scheduler are stepped.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    tensor_keys = ('input_ids', 'attention_mask', 'labels')

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        # Move just the tensors the model consumes onto the target device
        inputs = {key: batch[key].to(device) for key in tensor_keys}

        loss = model(**inputs).loss
        running_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, device, threshold=0.5):
    """Score `model` on `dataloader` without updating its weights.

    Args:
        model: Model called with `input_ids`, `attention_mask` and `labels`;
            its output must expose `.loss` and `.logits`.
        dataloader: Iterable of batches holding 'input_ids', 'attention_mask'
            and 'labels' tensors.
        device: Device the batch tensors are moved to.
        threshold: A label is predicted when its sigmoid probability is
            strictly greater than this value.

    Returns:
        Dict with the mean batch 'loss', 'f1_micro', 'f1_macro', and the
        stacked 'predictions' and 'labels' arrays (one row per example).

    Raises:
        ValueError: If `dataloader` yields no batches.
    """
    model.eval()
    pred_batches = []
    label_batches = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            total_loss += outputs.loss.item()

            # Apply sigmoid and threshold
            probs = torch.sigmoid(outputs.logits)
            preds = (probs > threshold).float()

            pred_batches.append(preds.cpu().numpy())
            label_batches.append(labels.cpu().numpy())

    if not pred_batches:
        # Fail with a clear message rather than a ZeroDivisionError further down.
        raise ValueError("evaluate() received an empty dataloader")

    # One concatenation per array instead of growing a Python list row by row
    all_preds = np.concatenate(pred_batches, axis=0)
    all_labels = np.concatenate(label_batches, axis=0)

    # Calculate metrics. zero_division=0 matches the classification_report call
    # used for the test set and keeps the same scores while silencing the
    # undefined-metric warning for labels that are never predicted.
    f1_micro = f1_score(all_labels, all_preds, average='micro', zero_division=0)
    f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return {
        'loss': total_loss / len(dataloader),
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'predictions': all_preds,
        'labels': all_labels
    }

### Train the Model

In [8]:
# Training loop
# Start below any attainable score so the first epoch always writes a
# checkpoint. With a start value of 0 and a strict ">" comparison, a run whose
# macro-F1 stays at 0 would never save, and the later load of
# 'best_bert_model.pt' would fail or pick up a stale file from an earlier run.
best_f1 = float("-inf")

for epoch in range(EPOCHS):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print(f"{'='*50}")
    
    # Train
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Training Loss: {train_loss:.4f}")
    
    # Evaluate on validation set
    val_results = evaluate(model, val_loader, device)
    print(f"Validation Loss: {val_results['loss']:.4f}")
    print(f"Validation F1 (micro): {val_results['f1_micro']:.4f}")
    print(f"Validation F1 (macro): {val_results['f1_macro']:.4f}")
    
    # Save best model (selected on validation macro-F1)
    if val_results['f1_macro'] > best_f1:
        best_f1 = val_results['f1_macro']
        torch.save(model.state_dict(), 'best_bert_model.pt')
        print("Saved best model!")

print(f"\nBest Validation F1 (macro): {best_f1:.4f}")


Epoch 1/3


Training: 100%|██████████| 2714/2714 [03:24<00:00, 13.29it/s]
Training: 100%|██████████| 2714/2714 [03:24<00:00, 13.29it/s]


Training Loss: 0.1237


Evaluating: 100%|██████████| 340/340 [00:09<00:00, 34.07it/s]



Validation Loss: 0.0918
Validation F1 (micro): 0.5123
Validation F1 (macro): 0.2567
Saved best model!

Epoch 2/3
Saved best model!

Epoch 2/3


Training: 100%|██████████| 2714/2714 [03:09<00:00, 14.34it/s]
Training: 100%|██████████| 2714/2714 [03:09<00:00, 14.34it/s]


Training Loss: 0.0841


Evaluating: 100%|██████████| 340/340 [00:10<00:00, 32.29it/s]
Evaluating: 100%|██████████| 340/340 [00:10<00:00, 32.29it/s]


Validation Loss: 0.0847
Validation F1 (micro): 0.5686
Validation F1 (macro): 0.3805
Saved best model!

Epoch 3/3
Saved best model!

Epoch 3/3


Training: 100%|██████████| 2714/2714 [03:10<00:00, 14.22it/s]
Training: 100%|██████████| 2714/2714 [03:10<00:00, 14.22it/s]


Training Loss: 0.0728


Evaluating: 100%|██████████| 340/340 [00:10<00:00, 33.33it/s]



Validation Loss: 0.0848
Validation F1 (micro): 0.5748
Validation F1 (macro): 0.4071
Saved best model!

Best Validation F1 (macro): 0.4071
Saved best model!

Best Validation F1 (macro): 0.4071


### Evaluate on Test Set

In [9]:
# Load best model and evaluate on test set
# map_location remaps the saved tensors onto the current `device`, so a
# checkpoint written on a GPU can still be restored on a CPU-only machine.
model.load_state_dict(torch.load('best_bert_model.pt', map_location=device))

test_results = evaluate(model, test_loader, device)

print("="*50)
print("TEST SET RESULTS")
print("="*50)
print(f"Test Loss: {test_results['loss']:.4f}")
print(f"Test F1 (micro): {test_results['f1_micro']:.4f}")
print(f"Test F1 (macro): {test_results['f1_macro']:.4f}")

# Detailed classification report (per-label precision / recall / F1)
print("\nDetailed Classification Report:")
print(classification_report(
    test_results['labels'], 
    test_results['predictions'], 
    target_names=LABELS,
    zero_division=0
))

Evaluating: 100%|██████████| 340/340 [00:11<00:00, 30.40it/s]



TEST SET RESULTS
Test Loss: 0.0835
Test F1 (micro): 0.5859
Test F1 (macro): 0.4194

Detailed Classification Report:
                precision    recall  f1-score   support

    admiration       0.70      0.71      0.70       504
     amusement       0.79      0.84      0.81       264
         anger       0.61      0.41      0.49       198
     annoyance       0.62      0.10      0.17       320
      approval       0.65      0.30      0.41       351
        caring       0.56      0.33      0.41       135
     confusion       0.60      0.29      0.39       153
     curiosity       0.58      0.43      0.49       284
        desire       0.72      0.37      0.49        83
disappointment       0.62      0.05      0.10       151
   disapproval       0.50      0.28      0.36       267
       disgust       0.83      0.31      0.45       123
 embarrassment       0.00      0.00      0.00        37
    excitement       0.80      0.23      0.36       103
          fear       0.73      0.67      0.

### Test with Sample Predictions

In [10]:
def predict_emotions(text, model, tokenizer, threshold=0.3, max_length=128, labels=None):
    """Predict emotions for a single text.

    Args:
        text: Input string to classify.
        model: Model called with `input_ids` and `attention_mask`; its output
            must expose `.logits` with one row of per-label scores.
        tokenizer: Callable producing 'input_ids' and 'attention_mask' tensors.
        threshold: A label is reported when its sigmoid probability is
            strictly greater than this value. Note that this default (0.3)
            is lower than the 0.5 default used by `evaluate`.
        max_length: Padding/truncation length passed to the tokenizer.
        labels: Optional label names aligned with the logits. Defaults to the
            module-level `LABELS`.

    Returns:
        A `(results, predicted)` tuple: `results` maps each label name to its
        probability, `predicted` lists the labels above `threshold`.
    """
    label_names = LABELS if labels is None else list(labels)
    model.eval()

    # Use the device the model actually lives on rather than a global
    # `device`, so the helper also works for a model moved elsewhere.
    model_device = next(model.parameters()).device

    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'].to(model_device),
            attention_mask=encoding['attention_mask'].to(model_device)
        )
        # Single input text, so keep only the first (and only) row
        probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]

    results = dict(zip(label_names, probs))
    predicted = [label for label, prob in results.items() if prob > threshold]

    return results, predicted

# A handful of hand-written sentences to sanity-check the trained model
sample_texts = [
    "I'm so happy today! Everything is going great!",
    "This makes me so angry, I can't believe it!",
    "I'm really scared about what might happen.",
    "That's disgusting, I hate it.",
    "I feel so sad and lonely.",
    "Wow, I didn't expect that at all!",
    "Thank you so much, you're amazing!",
    "I'm curious about how this works.",
    "I love you so much!"
]

print("Sample Predictions:")
print("="*60)
for text in sample_texts:
    probs, predicted = predict_emotions(text, model, tokenizer)
    print(f"\nText: {text}")
    # Fall back to 'neutral' when no label clears the threshold
    print(f"Predicted: {predicted if predicted else ['neutral']}")
    # Rank labels by probability and keep the five strongest
    ranked_labels = sorted(probs, key=probs.get, reverse=True)
    top_probs = [(name, probs[name]) for name in ranked_labels[:5]]
    formatted = ', '.join(f'{k}:{v:.2f}' for k, v in top_probs)
    print(f"Top 5 Probabilities: {formatted}")

Sample Predictions:

Text: I'm so happy today! Everything is going great!
Predicted: ['admiration', 'joy']
Top 5 Probabilities: joy:0.80, admiration:0.32, excitement:0.14, gratitude:0.05, approval:0.03

Text: This makes me so angry, I can't believe it!
Predicted: ['anger']
Top 5 Probabilities: anger:0.80, annoyance:0.27, disgust:0.04, disappointment:0.02, admiration:0.02

Text: I'm really scared about what might happen.
Predicted: ['fear']
Top 5 Probabilities: fear:0.75, nervousness:0.08, disgust:0.04, neutral:0.04, sadness:0.04

Text: That's disgusting, I hate it.
Predicted: ['disgust']
Top 5 Probabilities: disgust:0.72, anger:0.16, annoyance:0.14, fear:0.09, disapproval:0.06

Text: I feel so sad and lonely.
Predicted: ['sadness']
Top 5 Probabilities: sadness:0.85, disappointment:0.14, remorse:0.04, neutral:0.04, fear:0.03

Text: Wow, I didn't expect that at all!
Predicted: ['surprise']
Top 5 Probabilities: surprise:0.84, excitement:0.12, realization:0.05, neutral:0.04, admiration:0.0