# ModernBERT Emotion Classification
This notebook fine-tunes a pretrained ModernBERT model on the GoEmotions dataset for multi-label emotion classification, and then fine-tunes the resulting model further on the presidential speeches dataset.
ModernBERT is a modernized version of BERT with architectural improvements (Rotary Embeddings, Unpadding, etc.).

In [6]:
# Install required packages
# ModernBERT requires a recent version of transformers
%pip install "transformers>=4.48.0" datasets accelerate scikit-learn protobuf flash-attn

Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m40.6 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[23 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File [35m"/home/xiang/miniconda3/envs/nlp/lib/python3.14/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py"[0m, line [35m389[0m, in [35m<module>[0m
  [31m   [0m     [31mmain[0m[1;31m()[0m
  [31m   [0m     [31m~~~~[0m[1;31m^^[0m
  [31m   [0m   File [35m"/home/xiang/miniconda3/envs/nlp/lib/python3.14/site-packages/pip/_vendor/pyproject_hooks/_in_pr

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from datasets import load_dataset
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Seed the global NumPy and PyTorch generators so that the classifier-head
# initialisation and DataLoader shuffling are repeatable after Restart & Run All.
# (Some GPU kernels can still be non-deterministic.)
SEED = 42  # same value the presidential train/test split passes as random_state
np.random.seed(SEED)
torch.manual_seed(SEED)  # seeds the CPU generator and every CUDA device

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


### Load Data and Define Labels

In [2]:
# Fetch the "simplified" configuration of GoEmotions via `datasets.load_dataset`
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")

# Emotion names. The position of a name in this list is the integer id that
# convert_to_df looks for in each example's 'labels' field.
LABELS = [
    'admiration',      # 0
    'amusement',       # 1
    'anger',           # 2
    'annoyance',       # 3
    'approval',        # 4
    'caring',          # 5
    'confusion',       # 6
    'curiosity',       # 7
    'desire',          # 8
    'disappointment',  # 9
    'disapproval',     # 10
    'disgust',         # 11
    'embarrassment',   # 12
    'excitement',      # 13
    'fear',            # 14
    'gratitude',       # 15
    'grief',           # 16
    'joy',             # 17
    'love',            # 18
    'nervousness',     # 19
    'optimism',        # 20
    'pride',           # 21
    'realization',     # 22
    'relief',          # 23
    'remorse',         # 24
    'sadness',         # 25
    'surprise',        # 26
    'neutral',         # 27
]
NUM_LABELS = len(LABELS)

def convert_to_df(split):
    """Turn one split of the global ``dataset`` into a multi-hot DataFrame.

    Args:
        split: Key of the split inside ``dataset`` (e.g. 'train').

    Returns:
        A DataFrame whose first column is 'text' followed by one 0/1 column per
        entry of ``LABELS``; column ``j`` is 1 when ``j`` appears in the example's
        'labels' list.
    """
    rows = []
    # Iterate the split once instead of indexing data[i] twice per row: every
    # index access materialises the whole example again.
    for example in dataset[split]:
        label_ids = set(example['labels'])
        label_vec = [1 if j in label_ids else 0 for j in range(NUM_LABELS)]
        rows.append([example['text']] + label_vec)
    return pd.DataFrame(rows, columns=['text'] + LABELS)

# Build one multi-hot DataFrame per split, in train / validation / test order
train_df, val_df, test_df = (
    convert_to_df(split_name) for split_name in ('train', 'validation', 'test')
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Train: 43410, Val: 5426, Test: 5427


### Initialize ModernBERT Tokenizer and Dataset

In [3]:
# Use ModernBERT base tokenizer
# MODEL_NAME is also the checkpoint id handed to
# AutoModelForSequenceClassification.from_pretrained further down, so the tokenizer
# and the model are loaded from the same checkpoint.
MODEL_NAME = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenized text with a float multi-hot label vector.

    Each item is a dict with 'input_ids', 'attention_mask' (both flattened to 1-D)
    and 'labels' (float tensor built from the columns listed in ``self.labels``).
    Every text is truncated/padded to ``max_length`` tokens.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        # ModernBERT supports up to 8192, but we'll stick to 512 for speed
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = LABELS
        # Re-index to 0..n-1 so positional idx values work as .loc/.at labels
        self.data = dataframe.reset_index(drop=True)

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        text = str(self.data.at[idx, 'text'])
        label_values = self.data.loc[idx, self.labels].to_numpy(dtype=float)

        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each sample is a 1-D tensor
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(label_values, dtype=torch.float)
        return sample

BATCH_SIZE = 16

# One EmotionDataset per split, all sharing the same tokenizer
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(frame, tokenizer) for frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep their row order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=BATCH_SIZE)
    for split_dataset in (val_dataset, test_dataset)
)

### Initialize ModernBERT Model

In [4]:
# Load the checkpoint with a NUM_LABELS-wide classification head and move it to the
# selected device in one expression.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
).to(device)

EPOCHS = 4
LEARNING_RATE = 5e-5 # ModernBERT can often handle slightly higher LRs

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# train_epoch steps the scheduler once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=500, num_training_steps=total_steps
)

print(f"Model {MODEL_NAME} loaded on {device}")

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model answerdotai/ModernBERT-base loaded on cuda


In [5]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over ``dataloader``.

    For every batch: zero the gradients, run the model with labels to get its loss,
    back-propagate, clip the gradient norm to 1.0, then step the optimizer and the
    scheduler.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        # Move the three tensors the model consumes onto the target device
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        loss = model(**model_inputs).loss
        running_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, device, threshold=0.5):
    """Score ``model`` on ``dataloader`` without updating it.

    Args:
        model: Model called with input_ids, attention_mask and labels; its output
            must expose ``.loss`` and ``.logits``.
        dataloader: Yields dict batches with 'input_ids', 'attention_mask', 'labels'.
        device: Device the batch tensors are moved to.
        threshold: A label is predicted when sigmoid(logit) is strictly greater
            than this value.

    Returns:
        dict with 'loss' (sum of batch losses / number of batches), 'f1_micro',
        'f1_macro', and the stacked 'predictions' and 'labels' arrays.
    """
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

            # Independent per-label probabilities, binarised at `threshold`
            probs = torch.sigmoid(outputs.logits)
            preds = (probs > threshold).float()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # zero_division=0 yields the same scores as scikit-learn's default ("warn")
    # but without an UndefinedMetricWarning on every call when a label has no true
    # or predicted samples; it also matches the classification_report calls in
    # this notebook.
    f1_micro = f1_score(all_labels, all_preds, average='micro', zero_division=0)
    f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return {
        'loss': total_loss / len(dataloader),
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'predictions': all_preds,
        'labels': all_labels
    }

### Train ModernBERT

In [6]:
# Start below every attainable F1 so the first epoch always writes a checkpoint.
# With an initial value of 0 and a strict ">" test, a run whose macro-F1 stays at
# 0.0 would never save, and the next cell would load a missing or stale file.
best_f1 = float('-inf')

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Training Loss: {train_loss:.4f}")

    val_results = evaluate(model, val_loader, device)
    print(f"Validation Loss: {val_results['loss']:.4f}")
    print(f"Validation F1 (micro): {val_results['f1_micro']:.4f}")
    print(f"Validation F1 (macro): {val_results['f1_macro']:.4f}")

    # Keep only the weights of the epoch with the highest validation macro-F1
    if val_results['f1_macro'] > best_f1:
        best_f1 = val_results['f1_macro']
        torch.save(model.state_dict(), 'best_modernbert_model.pt')
        print("Saved best model!")

print(f"\nBest Validation F1 (macro): {best_f1:.4f}")


Epoch 1/4


Training: 100%|██████████| 2714/2714 [18:43<00:00,  2.42it/s]


Training Loss: 0.1058


Evaluating: 100%|██████████| 340/340 [00:55<00:00,  6.14it/s]


Validation Loss: 0.0829
Validation F1 (micro): 0.5737
Validation F1 (macro): 0.4184
Saved best model!

Epoch 2/4


Training: 100%|██████████| 2714/2714 [17:10<00:00,  2.63it/s]


Training Loss: 0.0729


Evaluating: 100%|██████████| 340/340 [00:44<00:00,  7.68it/s]


Validation Loss: 0.0812
Validation F1 (micro): 0.5753
Validation F1 (macro): 0.4555
Saved best model!

Epoch 3/4


Training: 100%|██████████| 2714/2714 [16:15<00:00,  2.78it/s]


Training Loss: 0.0473


Evaluating: 100%|██████████| 340/340 [00:40<00:00,  8.37it/s]


Validation Loss: 0.0932
Validation F1 (micro): 0.5909
Validation F1 (macro): 0.4920
Saved best model!

Epoch 4/4


Training: 100%|██████████| 2714/2714 [16:02<00:00,  2.82it/s]


Training Loss: 0.0188


Evaluating: 100%|██████████| 340/340 [00:40<00:00,  8.36it/s]

Validation Loss: 0.1136
Validation F1 (micro): 0.5811
Validation F1 (macro): 0.4903

Best Validation F1 (macro): 0.4920





### Evaluate on Test Set

In [7]:
# Restore the checkpoint saved at the best validation macro-F1.
# map_location=device lets a checkpoint written on a GPU load on a CPU-only kernel;
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict contains.
model.load_state_dict(
    torch.load('best_modernbert_model.pt', map_location=device, weights_only=True)
)
test_results = evaluate(model, test_loader, device)

print("="*50)
print("MODERNBERT TEST RESULTS")
print("="*50)
print(f"Test F1 (micro): {test_results['f1_micro']:.4f}")
print(f"Test F1 (macro): {test_results['f1_macro']:.4f}")

print("\nDetailed Classification Report:")
print(classification_report(
    test_results['labels'], 
    test_results['predictions'], 
    target_names=LABELS,
    zero_division=0
))

Evaluating: 100%|██████████| 340/340 [00:41<00:00,  8.20it/s]

MODERNBERT TEST RESULTS
Test F1 (micro): 0.5928
Test F1 (macro): 0.4940

Detailed Classification Report:
                precision    recall  f1-score   support

    admiration       0.70      0.70      0.70       504
     amusement       0.78      0.82      0.80       264
         anger       0.56      0.41      0.48       198
     annoyance       0.43      0.28      0.34       320
      approval       0.57      0.27      0.37       351
        caring       0.47      0.30      0.36       135
     confusion       0.59      0.33      0.42       153
     curiosity       0.52      0.58      0.55       284
        desire       0.57      0.47      0.51        83
disappointment       0.38      0.12      0.18       151
   disapproval       0.48      0.31      0.37       267
       disgust       0.62      0.42      0.50       123
 embarrassment       0.63      0.32      0.43        37
    excitement       0.55      0.34      0.42       103
          fear       0.72      0.62      0.66        7




## Domain Adaptation: Presidential Speeches
Fine-tune the best ModernBERT model on the presidential speeches dataset.

In [8]:
# Load Presidential Data
pres_df = pd.read_csv("data/presidential_speeches_goemotions_labeled.csv")
print(f"Presidential Dataset Shape: {pres_df.shape}")

# Identify text column: the first candidate name, in priority order, that the CSV has
text_col = next(
    (
        candidate
        for candidate in ('speech', 'Speech', 'transcript', 'Transcript', 'text', 'Text', 'content')
        if candidate in pres_df.columns
    ),
    None,
)

if text_col is None:
    raise ValueError(f"Could not find text column. Available columns: {pres_df.columns.tolist()}")

print(f"Using text column: {text_col}")

# Train-test split (80/20, fixed random_state so the split is repeatable)
pres_train_df, pres_test_df = train_test_split(pres_df, test_size=0.2, random_state=42)

# Define Dataset class for Presidential data
class PresidentialDataset(Dataset):
    """Map-style dataset over the presidential-speech DataFrame.

    Each item is a dict with 'input_ids', 'attention_mask' (both flattened to 1-D)
    and 'labels' (float tensor built from the columns listed in ``self.labels``).
    The text is read from ``text_col``; a missing value is tokenized as the empty
    string.
    """

    def __init__(self, dataframe, tokenizer, text_col, max_length=512):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_col = text_col
        self.labels = LABELS
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Test for a missing value BEFORE converting to str: str(NaN) is the
        # literal "nan", which pd.isna() no longer recognises as missing.
        raw_text = self.data.loc[idx, self.text_col]
        text = "" if pd.isna(raw_text) else str(raw_text)
        labels = self.data.loc[idx, self.labels].values.astype(float)
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

# Create DataLoaders
pres_train_dataset, pres_test_dataset = (
    PresidentialDataset(frame, tokenizer, text_col)
    for frame in (pres_train_df, pres_test_df)
)

# Shuffle only the training portion
pres_train_loader = DataLoader(pres_train_dataset, shuffle=True, batch_size=BATCH_SIZE)
pres_test_loader = DataLoader(pres_test_dataset, batch_size=BATCH_SIZE)

Presidential Dataset Shape: (995, 67)
Using text column: speech


In [9]:
# Load the best ModernBERT model
# map_location=device lets a checkpoint written on a GPU load on a CPU-only kernel;
# weights_only=True restricts unpickling to tensors and plain containers.
model.load_state_dict(
    torch.load('best_modernbert_model.pt', map_location=device, weights_only=True)
)
model.to(device)

# Fine-tuning parameters
FINE_TUNE_LR = 1e-5
FINE_TUNE_EPOCHS = 3

# Fresh optimizer and schedule for the fine-tuning stage (no warmup steps)
optimizer = AdamW(model.parameters(), lr=FINE_TUNE_LR, weight_decay=0.01)
total_steps = len(pres_train_loader) * FINE_TUNE_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

print(f"Fine-tuning ModernBERT on presidential data for {FINE_TUNE_EPOCHS} epochs...")

# Start below every attainable F1 so the first epoch always writes a checkpoint.
# With an initial value of 0 and a strict ">" test, a run whose macro-F1 stays at
# 0.0 would never save, and the next cell would load a missing or stale file.
best_pres_f1 = float('-inf')

# NOTE(review): the checkpoint is selected on pres_test_loader, the same split the
# final results are reported on, so those results are optimistically biased.
# Consider carving a validation split out of pres_train_df for model selection.
for epoch in range(FINE_TUNE_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{FINE_TUNE_EPOCHS}")
    
    train_loss = train_epoch(model, pres_train_loader, optimizer, scheduler, device)
    print(f"Training Loss: {train_loss:.4f}")
    
    val_results = evaluate(model, pres_test_loader, device)
    print(f"Test Loss: {val_results['loss']:.4f}")
    print(f"Test F1 (micro): {val_results['f1_micro']:.4f}")
    print(f"Test F1 (macro): {val_results['f1_macro']:.4f}")
    
    if val_results['f1_macro'] > best_pres_f1:
        best_pres_f1 = val_results['f1_macro']
        torch.save(model.state_dict(), 'best_presidential_modernbert_model.pt')
        print("Saved best presidential model!")

print(f"\nBest Presidential Test F1 (macro): {best_pres_f1:.4f}")

Fine-tuning ModernBERT on presidential data for 3 epochs...

Epoch 1/3


Training: 100%|██████████| 50/50 [00:17<00:00,  2.81it/s]


Training Loss: 0.0537


Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.80it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Test Loss: 0.0494
Test F1 (micro): 0.7120
Test F1 (macro): 0.0522
Saved best presidential model!

Epoch 2/3


Training: 100%|██████████| 50/50 [00:18<00:00,  2.76it/s]


Training Loss: 0.0434


Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.77it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Test Loss: 0.0496
Test F1 (micro): 0.7254
Test F1 (macro): 0.0503

Epoch 3/3


Training: 100%|██████████| 50/50 [00:17<00:00,  2.88it/s]


Training Loss: 0.0385


Evaluating: 100%|██████████| 13/13 [00:03<00:00,  3.31it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Test Loss: 0.0488
Test F1 (micro): 0.7202
Test F1 (macro): 0.0526
Saved best presidential model!

Best Presidential Test F1 (macro): 0.0526


In [10]:
# Load best presidential model
# map_location=device lets a checkpoint written on a GPU load on a CPU-only kernel;
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict contains.
model.load_state_dict(
    torch.load('best_presidential_modernbert_model.pt', map_location=device, weights_only=True)
)

final_results = evaluate(model, pres_test_loader, device)

print("="*60)
print("FINAL PRESIDENTIAL MODERNBERT MODEL RESULTS")
print("="*60)
print(f"Test Loss: {final_results['loss']:.4f}")
print(f"Test F1 (micro): {final_results['f1_micro']:.4f}")
print(f"Test F1 (macro): {final_results['f1_macro']:.4f}")

print("\nDetailed Classification Report:")
print(classification_report(
    final_results['labels'], 
    final_results['predictions'], 
    target_names=LABELS,
    zero_division=0
))

Evaluating: 100%|██████████| 13/13 [00:03<00:00,  3.72it/s]

FINAL PRESIDENTIAL MODERNBERT MODEL RESULTS
Test Loss: 0.0488
Test F1 (micro): 0.7202
Test F1 (macro): 0.0526

Detailed Classification Report:
                precision    recall  f1-score   support

    admiration       0.00      0.00      0.00         3
     amusement       0.00      0.00      0.00         0
         anger       0.00      0.00      0.00         0
     annoyance       0.00      0.00      0.00         0
      approval       0.57      0.11      0.19        36
        caring       0.00      0.00      0.00         1
     confusion       0.00      0.00      0.00         0
     curiosity       0.00      0.00      0.00         1
        desire       0.00      0.00      0.00         2
disappointment       0.00      0.00      0.00         0
   disapproval       0.00      0.00      0.00         0
       disgust       0.00      0.00      0.00         0
 embarrassment       0.00      0.00      0.00         0
    excitement       0.00      0.00      0.00         0
          fear  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
