In [1]:
import pandas as pd

In [6]:
df = pd.read_csv("go_emotions_dataset.csv")  
df

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211220,ee6pagw,Everyone likes [NAME].,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
211221,ef28nod,Well when you’ve imported about a gazillion of...,False,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
211222,ee8hse1,That looks amazing,False,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211223,edrhoxh,The FDA has plenty to criticize. But like here...,False,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df = df[df['example_very_unclear'] == False]

df = df.drop(columns=['id', 'example_very_unclear'])

emotion_labels = [col for col in df.columns if col not in ['text', 'neutral']]

df = df[df[emotion_labels].sum(axis=1) > 0]

print("Dataset shape after cleaning:", df.shape)
print("\nSample text:\n", df['text'].iloc[0])
print("\nSample label vector:\n", df[emotion_labels].iloc[0].values)
print("\nEmotion Labels:\n", emotion_labels)

Dataset shape after cleaning: (152516, 29)

Sample text:
 That game hurt.

Sample label vector:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]

Emotion Labels:
 ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']


In [8]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

N = 1000  
texts = df['text'].iloc[:N].tolist()
labels = df[emotion_labels].iloc[:N].values

encodings = tokenizer(
    texts,
    truncation=True,
    padding=True,
    max_length=128, 
    return_tensors="pt"
)

print("Sample input_ids shape:", encodings['input_ids'].shape)
print("Sample attention_mask shape:", encodings['attention_mask'].shape)
print("Sample label vector (first):", labels[0])

Sample input_ids shape: torch.Size([1000, 50])
Sample attention_mask shape: torch.Size([1000, 50])
Sample label vector (first): [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class GoEmotionsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

dataset = GoEmotionsDataset(encodings, labels)

train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

for batch in train_loader:
    print("Batch input_ids shape:", batch['input_ids'].shape)
    print("Batch labels shape:", batch['labels'].shape)
    break

Batch input_ids shape: torch.Size([16, 50])
Batch labels shape: torch.Size([16, 27])


In [10]:
import torch.nn as nn
from transformers import DistilBertModel

class DistilBertForMultiLabel(nn.Module):
    def __init__(self, num_labels=27):
        super(DistilBertForMultiLabel, self).__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0, :] 
        logits = self.classifier(hidden_state)
        return logits

In [11]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForMultiLabel().to(device)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.BCEWithLogitsLoss()

model.train()
for batch in train_loader:
    optimizer.zero_grad()
    
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids, attention_mask)
    loss = criterion(outputs, labels)

    loss.backward()
    optimizer.step()

    print("Loss:", loss.item())
    break 

Loss: 0.6955889463424683


In [12]:
from sklearn.metrics import hamming_loss, f1_score

model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        preds = torch.sigmoid(outputs).cpu().numpy()
        labels = labels.cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

import numpy as np
binary_preds = np.array(all_preds) > 0.5

h_loss = hamming_loss(all_labels, binary_preds)

f1_micro = f1_score(all_labels, binary_preds, average='micro')

print("Hamming Loss:", h_loss)
print("Micro F1 Score:", f1_micro)


Hamming Loss: 0.3558148148148148
Micro F1 Score: 0.0718771133223843


In [13]:
def predict_emotions(text):
    model.eval()
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.sigmoid(outputs).cpu().numpy()[0]

    predicted_emotions = [emotion_labels[i] for i, prob in enumerate(probs) if prob > 0.5]

    return predicted_emotions, probs

text = "I can't believe how amazing this product is. I'm so happy!"
emotions, probs = predict_emotions(text)

print("Input:", text)
print("Predicted Emotions:", emotions)

Input: I can't believe how amazing this product is. I'm so happy!
Predicted Emotions: ['approval', 'confusion', 'disgust', 'embarrassment', 'joy', 'pride', 'relief', 'surprise']
