# 1 - Imports and Loading Data

In [9]:
# Imports
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score

import torch
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import BCEWithLogitsLoss
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm

In [10]:
# Load the data
try:
    df = pd.read_csv('goemotions.csv')
except FileNotFoundError:
    print("Error: goemotions.csv not found.")
    exit()

print("\n--- Data Info ---")
df.info() # Found no missing values

print("\n--- Data Set ---")
df


--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  211225 non-null  object 
 1   id                    211225 non-null  object 
 2   author                211225 non-null  object 
 3   subreddit             211225 non-null  object 
 4   link_id               211225 non-null  object 
 5   parent_id             211225 non-null  object 
 6   created_utc           211225 non-null  float64
 7   rater_id              211225 non-null  int64  
 8   example_very_unclear  211225 non-null  bool   
 9   admiration            211225 non-null  int64  
 10  amusement             211225 non-null  int64  
 11  anger                 211225 non-null  int64  
 12  annoyance             211225 non-null  int64  
 13  approval              211225 non-null  int64  
 14  caring                211225 non-

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1.548381e+09,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1.548084e+09,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1.546428e+09,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1.547965e+09,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1.546669e+09,2,False,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211220,Everyone likes [NAME].,ee6pagw,Senshado,heroesofthestorm,t3_agjf24,t3_agjf24,1.547634e+09,16,False,0,...,1,0,0,0,0,0,0,0,0,0
211221,Well when you’ve imported about a gazillion of...,ef28nod,5inchloser,nottheonion,t3_ak26t3,t3_ak26t3,1.548553e+09,15,False,0,...,0,0,0,0,0,0,0,0,0,0
211222,That looks amazing,ee8hse1,springt1me,shittyfoodporn,t3_agrnqb,t3_agrnqb,1.547684e+09,70,False,1,...,0,0,0,0,0,0,0,0,0,0
211223,The FDA has plenty to criticize. But like here...,edrhoxh,enamedata,medicine,t3_aejqzd,t1_edrgdtx,1.547169e+09,4,False,0,...,0,0,0,0,0,0,0,0,0,0


# 2 - Exploratory Data Analysis

In [11]:
# Using the full dataset (211,225 samples)
# Calculate label distribution
emotion_labels = df.columns[9:37].tolist()  # Emotion columns (admiration to neutral)
label_counts = df.iloc[:, 9:37].sum().values  # Sum each emotion column to get frequency (NumPy array)
label_dist = pd.DataFrame({
    'Emotion': emotion_labels,
    'Count': label_counts,
    'Percentage': (label_counts / len(df)) * 100
}) # Creates a dataframe to display label counts and percentages

# Calculate average number of labels per sample
labels_per_sample = df.iloc[:, 9:37].sum(axis=1)
avg_labels_per_sample = labels_per_sample.mean()

print("\n--- Label Distribution ---")
print(label_dist)

print("\n--- Label Statistics ---")
print(f"Average number of labels per sample: {avg_labels_per_sample:.2f}")
print(f"Min labels per sample: {labels_per_sample.min()}")
print(f"Max labels per sample: {labels_per_sample.max()}")


--- Label Distribution ---
           Emotion  Count  Percentage
0       admiration  17131    8.110309
1        amusement   9245    4.376849
2            anger   8084    3.827198
3        annoyance  13618    6.447154
4         approval  17620    8.341816
5           caring   5999    2.840099
6        confusion   7359    3.483963
7        curiosity   9692    4.588472
8           desire   3817    1.807078
9   disappointment   8469    4.009469
10     disapproval  11424    5.408451
11         disgust   5301    2.509646
12   embarrassment   2476    1.172210
13      excitement   5629    2.664931
14            fear   3197    1.513552
15       gratitude  11625    5.503610
16           grief    673    0.318618
17             joy   7983    3.779382
18            love   8191    3.877855
19     nervousness   1810    0.856906
20        optimism   8715    4.125932
21           pride   1302    0.616404
22     realization   8785    4.159072
23          relief   1289    0.610250
24         remorse   2

# 3 - Data Preprocessing

In [12]:
# Extract text and labels from the full dataset
texts = df['text'].values # Converts to NumPy array for later use
labels = df.iloc[:, 9:37].values # Columns 9 to 36 are emotion labels

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Calculate token lengths for all texts
token_lengths = [len(tokenizer.encode(text, add_special_tokens=True)) for text in df['text']]

# Compute statistics
max_length = max(token_lengths)
median_length = np.median(token_lengths)
mean_length = np.mean(token_lengths)
percentile_95 = np.percentile(token_lengths, 95)
percentile_99 = np.percentile(token_lengths, 99)
truncated_count = sum(1 for length in token_lengths if length > 128)

# Print results
print("\n--- Token Length Distribution ---")
print(f"Maximum token length: {max_length}")
print(f"Mean token length: {mean_length:.2f}")
print(f"Median token length: {median_length:.2f}")
print(f"95th percentile: {percentile_95:.2f}")
print(f"99th percentile: {percentile_99:.2f}")
print(f"Samples truncated at max_length=128: {truncated_count} ({truncated_count/len(token_lengths)*100:.2f}%)")

# Tokenize the texts
encoded_data = tokenizer(texts.tolist(), truncation=True, padding='max_length', max_length=128, return_tensors='pt')

print("\n--- Encoded Data Shape ---")
print(f"Input IDs: {encoded_data['input_ids'].shape}")
print(f"Attention Masks: {encoded_data['attention_mask'].shape}")
print(f"Labels Shape: {labels.shape}")


--- Token Length Distribution ---
Total samples: 211225
Maximum token length: 316
Mean token length: 19.40
Median token length: 19.00
95th percentile: 34.00
99th percentile: 38.00
Samples truncated at max_length=128: 3 (0.00%)

--- Encoded Data Shape ---
Input IDs: torch.Size([211225, 128])
Attention Masks: torch.Size([211225, 128])
Labels Shape: (211225, 28)


# 4 - Data Splitting

In [5]:
# Split into train, validation, and test sets
train_texts, temp_texts, train_masks, temp_masks, train_labels, temp_labels = train_test_split(
    encoded_data['input_ids'], encoded_data['attention_mask'], labels, 
    test_size=0.3, random_state=42  # 70% train, 30% temp (val + test)
)

val_texts, test_texts, val_masks, test_masks, val_labels, test_labels = train_test_split(
    temp_texts, temp_masks, temp_labels, 
    test_size=0.5, random_state=42  # Split 30% into 15% val and 15% test
)

print("\n--- Split Sizes ---")
print(f"Train Input IDs: {train_texts.shape}")
print(f"Validation Input IDs: {val_texts.shape}")
print(f"Test Input IDs: {test_texts.shape}")
print(f"Train Labels: {train_labels.shape}")
print(f"Validation Labels: {val_labels.shape}")
print(f"Test Labels: {test_labels.shape}")

# Create TensorDatasets for PyTorch DataLoader (combines inputs, masks and labels into a dataset object)
train_dataset = TensorDataset(train_texts, train_masks, torch.tensor(train_labels))
val_dataset = TensorDataset(val_texts, val_masks, torch.tensor(val_labels))
test_dataset = TensorDataset(test_texts, test_masks, torch.tensor(test_labels))


--- Split Sizes ---
Train Input IDs: torch.Size([147857, 128])
Validation Input IDs: torch.Size([31684, 128])
Test Input IDs: torch.Size([31684, 128])
Train Labels: (147857, 28)
Validation Labels: (31684, 28)
Test Labels: (31684, 28)


# 5 - Model Setup

In [6]:
# Set device to CUDA if available as it massively speeds up training times
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT model with 28 emotion labels (excluding 'example_very_unclear')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=28)
# This ensures the model uses a sigmoid activation on the outputs and an appropriate loss function (like BCEWithLogitsLoss)
model.config.problem_type = "multi_label_classification"  # Set for multi-label task
model.to(device)

print("\n--- Model Loaded ---")
print(f"Device: {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Model Loaded ---
Device: cuda


# 6 - Training Setup

In [7]:
# DataLoaders (shuffle is set to 'True' to randomize data order for better generalization)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Set up optimizer, AdamW is a common optimizer for transformer models
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Calculate class weights based on label frequencies to adress class imbalance, gives higher weight to less frequent classes
class_weights = 1.0 / torch.tensor(label_counts, dtype=torch.float).to(device)
class_weights = class_weights / class_weights.sum() * 28  # Normalize for 28 classes
criterion = BCEWithLogitsLoss(pos_weight=class_weights) # 'pos_weight' applies the calculated class weights

print("\n--- Training Setup Complete ---")
print(f"Train Loader Size: {len(train_loader)}")
print(f"Validation Loader Size: {len(val_loader)}")
print(f"Test Loader Size: {len(test_loader)}")


--- Training Setup Complete ---
Train Loader Size: 4621
Validation Loader Size: 991
Test Loader Size: 991


# 7 - Model Training

In [None]:
# Set hyperparameters for training
epochs = 50
patience = 5 # Number of epochs to wait before early stopping
best_val_loss = float('inf')
epochs_no_improve = 0
save_path = 'best_emotion_model.pt'

# Training loop
for epoch in range(epochs):
    model.train() # Set model to training mode
    total_train_loss = 0
    # Use a progress bar over the training batches
    with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch") as pbar: # Progress bar
        for batch in pbar:
            batch = [b.to(device) for b in batch] # Move batch to device
            input_ids, attention_mask, labels = batch
            labels = labels.float() # Ensure labels are float for multilabel
            optimizer.zero_grad() # Clears the previous gradients
            # Forward pass: compute model outputs and loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item() # Accumulate loss
            loss.backward() # Backward pass: compute gradients
            optimizer.step() # Update model parameters
    
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_train_loss:.4f}")
    
    # Validation phase
    model.eval() # Set model to evaluate mode
    total_val_loss = 0
    with torch.no_grad(): # Disable gradient calculations for validation
        with tqdm(val_loader, desc=f"Epoch {epoch + 1}/{epochs} (Val)", unit="batch") as pbar:
            for batch in pbar:
                batch = [b.to(device) for b in batch]
                input_ids, attention_mask, labels = batch
                labels = labels.float()
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                total_val_loss += outputs.loss.item()
    
    avg_val_loss = total_val_loss / len(val_loader)
    
    # Early stopping and saving logic
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        try:
            torch.save(model.state_dict(), save_path)
            print(f"Model saved as {save_path} with validation loss: {best_val_loss:.4f}")
        except Exception as e:
            print(f"Error saving model: {e}")
    else:
        epochs_no_improve += 1
        print(f"No improvement in validation loss for {epochs_no_improve} epoch(s).")
    
    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch + 1} epochs.")
        break # Exit training loop

print("\n--- Training Complete ---")

# 8 - Model Evaluation

In [8]:
# Load the best model saved during training
model.load_state_dict(torch.load('best_emotion_model.pt'))  # Load the saved state_dict of the best-performing model
model.eval()

# Evaluation function with optimal threshold tuning
def evaluate_model(loader, dataset_name="Validation", thresholds=None):
    all_probs = []
    all_labels = []
    with torch.no_grad():  # Disable gradient computation for efficiency
        for batch in tqdm(loader, desc=f"Evaluating {dataset_name}", unit="batch"):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]  # Move batch to device
            labels = labels.float()  # Ensure labels are float for compatibility
            outputs = model(input_ids, attention_mask=attention_mask)
            probs = torch.sigmoid(outputs.logits).cpu().numpy()  # Apply sigmoid to logits and convert to numpy
            all_probs.append(probs)  # Store probabilities
            all_labels.append(labels.cpu().numpy())  # Store true labels
    all_probs = np.vstack(all_probs)  # Stack probabilities into a single array
    all_labels = np.vstack(all_labels)  # Stack labels into a single array
    
    # Tune thresholds if not provided
    if thresholds is None:
        thresholds = []
        for i in range(all_probs.shape[1]):  # Iterate over each emotion class
            best_f1 = 0
            best_t = 0.5  # Default threshold
            for t in np.arange(0.1, 0.9, 0.1):  # Test thresholds from 0.1 to 0.9
                preds = all_probs[:, i] > t  # Apply threshold to predictions
                f1 = f1_score(all_labels[:, i], preds)  # Compute F1 score
                if f1 > best_f1:  # Update if better F1 score found
                    best_f1 = f1
                    best_t = t
            thresholds.append(best_t)  # Store optimal threshold for this class
        print(f"\nOptimal Thresholds per Emotion ({dataset_name}):", dict(zip(emotion_labels, thresholds)))  # Display thresholds
    
    # Apply thresholds to get binary predictions
    preds = np.zeros_like(all_probs)  # Initialize prediction array
    for i in range(all_probs.shape[1]):  # Apply threshold for each class
        preds[:, i] = all_probs[:, i] > thresholds[i]
    
    # Compute evaluation metrics
    micro_f1 = f1_score(all_labels, preds, average='micro')
    macro_f1 = f1_score(all_labels, preds, average='macro')
    accuracy = accuracy_score(all_labels, preds)
    report = classification_report(all_labels, preds, target_names=emotion_labels, zero_division=0)
    
    # Display results
    print(f"\n--- {dataset_name} Results ---")
    print(f"Micro F1: {micro_f1:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)
    return micro_f1, macro_f1, accuracy, report, thresholds

# Evaluate on validation and test sets
val_micro_f1, val_macro_f1, val_accuracy, val_report, optimal_thresholds = evaluate_model(val_loader, "Validation")  # Evaluate on validation set and get optimal thresholds
test_micro_f1, test_macro_f1, test_accuracy, test_report, _ = evaluate_model(test_loader, "Test", optimal_thresholds)  # Evaluate on test set using validation thresholds

Evaluating Validation: 100%|██████████| 991/991 [01:34<00:00, 10.50batch/s]



Optimal Thresholds per Emotion (Validation): {'admiration': np.float64(0.30000000000000004), 'amusement': np.float64(0.30000000000000004), 'anger': np.float64(0.2), 'annoyance': np.float64(0.2), 'approval': np.float64(0.2), 'caring': np.float64(0.2), 'confusion': np.float64(0.2), 'curiosity': np.float64(0.2), 'desire': np.float64(0.2), 'disappointment': np.float64(0.2), 'disapproval': np.float64(0.2), 'disgust': np.float64(0.2), 'embarrassment': np.float64(0.2), 'excitement': np.float64(0.1), 'fear': np.float64(0.2), 'gratitude': np.float64(0.6), 'grief': np.float64(0.1), 'joy': np.float64(0.30000000000000004), 'love': np.float64(0.30000000000000004), 'nervousness': np.float64(0.1), 'optimism': np.float64(0.30000000000000004), 'pride': np.float64(0.1), 'realization': np.float64(0.1), 'relief': np.float64(0.1), 'remorse': np.float64(0.2), 'sadness': np.float64(0.30000000000000004), 'surprise': np.float64(0.2), 'neutral': np.float64(0.2)}

--- Validation Results ---
Micro F1: 0.4402
Mac

Evaluating Test: 100%|██████████| 991/991 [01:32<00:00, 10.66batch/s]


--- Test Results ---
Micro F1: 0.4375
Macro F1: 0.3666
Accuracy: 0.2183

Classification Report:
                precision    recall  f1-score   support

    admiration       0.51      0.68      0.58      2587
     amusement       0.53      0.74      0.62      1414
         anger       0.34      0.48      0.40      1235
     annoyance       0.25      0.40      0.30      2002
      approval       0.26      0.32      0.29      2577
        caring       0.26      0.45      0.33       863
     confusion       0.27      0.42      0.33      1114
     curiosity       0.33      0.70      0.45      1445
        desire       0.34      0.33      0.33       596
disappointment       0.21      0.26      0.23      1283
   disapproval       0.26      0.44      0.33      1778
       disgust       0.27      0.29      0.28       751
 embarrassment       0.41      0.26      0.31       372
    excitement       0.21      0.45      0.28       830
          fear       0.45      0.47      0.46       487
     g


