# 1 - Imports and Loading Data

In [1]:
# Imports
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import BCEWithLogitsLoss
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

In [3]:
# Load the data
try:
    df = pd.read_csv('goemotions.csv')
except FileNotFoundError:
    print("Error: goemotions.csv not found.")
    exit()

print("\n--- Data Info ---")
df.info() # Found no missing values

print("\n--- Data Set ---")
df


--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  211225 non-null  object 
 1   id                    211225 non-null  object 
 2   author                211225 non-null  object 
 3   subreddit             211225 non-null  object 
 4   link_id               211225 non-null  object 
 5   parent_id             211225 non-null  object 
 6   created_utc           211225 non-null  float64
 7   rater_id              211225 non-null  int64  
 8   example_very_unclear  211225 non-null  bool   
 9   admiration            211225 non-null  int64  
 10  amusement             211225 non-null  int64  
 11  anger                 211225 non-null  int64  
 12  annoyance             211225 non-null  int64  
 13  approval              211225 non-null  int64  
 14  caring                211225 non-

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1.548381e+09,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1.548084e+09,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1.546428e+09,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1.547965e+09,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1.546669e+09,2,False,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211220,Everyone likes [NAME].,ee6pagw,Senshado,heroesofthestorm,t3_agjf24,t3_agjf24,1.547634e+09,16,False,0,...,1,0,0,0,0,0,0,0,0,0
211221,Well when you’ve imported about a gazillion of...,ef28nod,5inchloser,nottheonion,t3_ak26t3,t3_ak26t3,1.548553e+09,15,False,0,...,0,0,0,0,0,0,0,0,0,0
211222,That looks amazing,ee8hse1,springt1me,shittyfoodporn,t3_agrnqb,t3_agrnqb,1.547684e+09,70,False,1,...,0,0,0,0,0,0,0,0,0,0
211223,The FDA has plenty to criticize. But like here...,edrhoxh,enamedata,medicine,t3_aejqzd,t1_edrgdtx,1.547169e+09,4,False,0,...,0,0,0,0,0,0,0,0,0,0


# 2 - Exploratory Data Analysis

In [5]:
# Using the full dataset (211,225 samples)
# Calculate label distribution
emotion_labels = df.columns[9:37].tolist()  # Emotion columns (admiration to neutral)
label_counts = df.iloc[:, 9:37].sum()  # Sum each emotion column to get frequency
label_dist = pd.DataFrame({
    'Emotion': emotion_labels,
    'Count': label_counts,
    'Percentage': (label_counts / len(df)) * 100
})

# Calculate average number of labels per sample
labels_per_sample = df.iloc[:, 9:37].sum(axis=1)
avg_labels_per_sample = labels_per_sample.mean()

print("\n--- Label Distribution ---")
print(label_dist)

print("\n--- Label Statistics ---")
print(f"Average number of labels per sample: {avg_labels_per_sample:.2f}")
print(f"Min labels per sample: {labels_per_sample.min()}")
print(f"Max labels per sample: {labels_per_sample.max()}")


--- Label Distribution ---
                       Emotion  Count  Percentage
admiration          admiration  17131    8.110309
amusement            amusement   9245    4.376849
anger                    anger   8084    3.827198
annoyance            annoyance  13618    6.447154
approval              approval  17620    8.341816
caring                  caring   5999    2.840099
confusion            confusion   7359    3.483963
curiosity            curiosity   9692    4.588472
desire                  desire   3817    1.807078
disappointment  disappointment   8469    4.009469
disapproval        disapproval  11424    5.408451
disgust                disgust   5301    2.509646
embarrassment    embarrassment   2476    1.172210
excitement          excitement   5629    2.664931
fear                      fear   3197    1.513552
gratitude            gratitude  11625    5.503610
grief                    grief    673    0.318618
joy                        joy   7983    3.779382
love                  

# 3 - Data Preprocessing

In [7]:
# Extract text and labels from the full dataset
texts = df['text'].values
labels = df.iloc[:, 9:].values  # Columns 9 to 36 are emotion labels

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
encoded_data = tokenizer(texts.tolist(), truncation=True, padding='max_length', max_length=128, return_tensors='pt')

print("\n--- Encoded Data Shape ---")
print(f"Input IDs: {encoded_data['input_ids'].shape}")
print(f"Attention Masks: {encoded_data['attention_mask'].shape}")
print(f"Labels Shape: {labels.shape}")


--- Encoded Data Shape ---
Input IDs: torch.Size([211225, 128])
Attention Masks: torch.Size([211225, 128])
Labels Shape: (211225, 28)


# 4 - Data Splitting

In [9]:
# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    encoded_data['input_ids'], labels, test_size=0.2, random_state=42
)
train_masks, val_masks = train_test_split(
    encoded_data['attention_mask'], test_size=0.2, random_state=42
)

print("\n--- Split Sizes ---")
print(f"Train Input IDs: {train_texts.shape}")
print(f"Validation Input IDs: {val_texts.shape}")
print(f"Train Labels: {train_labels.shape}")
print(f"Validation Labels: {val_labels.shape}")


--- Split Sizes ---
Train Input IDs: torch.Size([168980, 128])
Validation Input IDs: torch.Size([42245, 128])
Train Labels: (168980, 28)
Validation Labels: (42245, 28)


# 5 - Model Setup

In [11]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT model with 28 emotion labels (excluding 'example_very_unclear')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=28)
model.config.problem_type = "multi_label_classification"  # Set for multi-label task
model.to(device)

print("\n--- Model Loaded ---")
print(f"Device: {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Model Loaded ---
Device: cuda


# 6 - Training Setup

In [13]:
# Convert to TensorDataset
train_dataset = TensorDataset(
    train_texts.clone().detach(), train_masks.clone().detach(), torch.tensor(train_labels)
)
val_dataset = TensorDataset(
    val_texts.clone().detach(), val_masks.clone().detach(), torch.tensor(val_labels)
)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Set up optimizer
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

print("\n--- Training Setup Complete ---")
print(f"Train Loader Size: {len(train_loader)}")
print(f"Validation Loader Size: {len(val_loader)}")


--- Training Setup Complete ---
Train Loader Size: 10562
Validation Loader Size: 2641


# 7 - Model Training

In [15]:
# Set number of epochs and early stopping parameters
epochs = 50
patience = 5  # Number of epochs with no improvement before early stopping
best_val_loss = float('inf')
epochs_no_improve = 0
save_path = 'best_emotion_model.pt'

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    # Add progress bar for batches in this epoch
    with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch") as pbar:
        for batch in pbar:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            optimizer.zero_grad()
            labels = labels.float()  # Ensure labels are float for multi-label
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            pbar.set_postfix({'loss': loss.item()})  # Show current batch loss
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}")
    
    # Validation and early stopping
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        with tqdm(val_loader, desc=f"Epoch {epoch + 1}/{epochs} (Val)", unit="batch") as pbar_val:
            for batch in pbar_val:
                input_ids, attention_mask, labels = [b.to(device) for b in batch]
                labels = labels.float()  # Ensure labels are float for multi-label
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)  # Pass labels
                val_loss = outputs.loss
                total_val_loss += val_loss.item()
                pbar_val.set_postfix({'loss': val_loss.item()})
    
    avg_val_loss = total_val_loss / len(val_loader)
    
    # Early stopping logic
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # Save the model
        try:
            torch.save(model.state_dict(), save_path)
            print(f"Model saved as {save_path} with validation loss: {best_val_loss:.4f}")
        except Exception as e:
            print(f"Error saving model: {e}")
    else:
        epochs_no_improve += 1
        print(f"No improvement in validation loss for {epochs_no_improve} epoch(s).")
    
    # Check if early stopping is triggered
    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch + 1} epochs.")
        break

print("\n--- Training Complete ---")

Epoch 1/50: 100%|██████████| 10562/10562 [34:38<00:00,  5.08batch/s, loss=0.0955]


Epoch 1/50, Average Loss: 0.1245


Epoch 1/50 (Val): 100%|██████████| 2641/2641 [02:24<00:00, 18.26batch/s, loss=0.0818]


Model saved as best_emotion_model.pt with validation loss: 0.1126


Epoch 2/50: 100%|██████████| 10562/10562 [31:30<00:00,  5.59batch/s, loss=0.0512]


Epoch 2/50, Average Loss: 0.1089


Epoch 2/50 (Val): 100%|██████████| 2641/2641 [02:23<00:00, 18.44batch/s, loss=0.0713]


Model saved as best_emotion_model.pt with validation loss: 0.1110


Epoch 3/50: 100%|██████████| 10562/10562 [34:18<00:00,  5.13batch/s, loss=0.0396]


Epoch 3/50, Average Loss: 0.1032


Epoch 3/50 (Val): 100%|██████████| 2641/2641 [02:22<00:00, 18.54batch/s, loss=0.0757]


No improvement in validation loss for 1 epoch(s).


Epoch 4/50: 100%|██████████| 10562/10562 [42:02<00:00,  4.19batch/s, loss=0.097] 


Epoch 4/50, Average Loss: 0.0981


Epoch 4/50 (Val): 100%|██████████| 2641/2641 [03:08<00:00, 13.99batch/s, loss=0.0694]


No improvement in validation loss for 2 epoch(s).


Epoch 5/50:   6%|▋         | 675/10562 [02:27<36:05,  4.57batch/s, loss=0.0871]


KeyboardInterrupt: 

# 8 - Model Evaluation

In [None]:
# Evaluate the model
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.sigmoid(outputs.logits).cpu().numpy() > 0.5
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

f1 = f1_score(all_labels, all_preds, average='micro')
print("\n--- Evaluation Results ---")
print(f"F1 Score (Micro): {f1:.4f}")

# 9 - Save and Test Model

In [None]:
# Save the model
model.save_pretrained('emotion_model')
tokenizer.save_pretrained('emotion_model')

# Test with a sample text
sample_text = "I am so excited to see my friends!"
inputs = tokenizer(sample_text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
inputs = {k: v.to(device) for k, v in inputs.items()}

model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    preds = torch.sigmoid(outputs.logits).cpu().numpy()[0] > 0.5

emotion_labels = df.columns[9:37].tolist()  # Get emotion column names
predicted_emotions = [emotion_labels[i] for i, pred in enumerate(preds) if pred]
print("\n--- Sample Prediction ---")
print(f"Text: {sample_text}")
print(f"Predicted Emotions: {predicted_emotions}")