In [1]:
import pandas as pd

In [2]:
# Read the GoEmotions CSV export into a DataFrame and show it as the cell output.
csv_path = "go_emotions_dataset.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211220,ee6pagw,Everyone likes [NAME].,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
211221,ef28nod,Well when you’ve imported about a gazillion of...,False,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
211222,ee8hse1,That looks amazing,False,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211223,edrhoxh,The FDA has plenty to criticize. But like here...,False,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Inspect the column names of the loaded frame (metadata columns vs. per-emotion columns).
df.columns

Index(['id', 'text', 'example_very_unclear', 'admiration', 'amusement',
       'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
       'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral'],
      dtype='object')

In [4]:
text_col = 'text'

# Pick the emotion columns by *name* (everything that is not metadata) instead
# of by position. This cell rebinds `df` to a narrower frame below, so a
# positional slice such as `df.columns[3:]` would silently drop the first two
# emotion labels if the cell were ever executed a second time.
non_label_cols = ['id', text_col, 'example_very_unclear']
label_cols = df.columns[~df.columns.isin(non_label_cols)]

# Keep only the text and the per-emotion indicator columns.
df = df[[text_col] + list(label_cols)]

print("Total rows before removing empty labels:", len(df))

# Drop rows in which no emotion column is set; they carry no training signal.
df = df[df[label_cols].sum(axis=1) > 0]
print("Total rows after removing empty labels:", len(df))

# Number of positive examples per emotion, most frequent first.
label_distribution = df[label_cols].sum().sort_values(ascending=False)
print("\nLabel Distribution:\n", label_distribution)

Total rows before removing empty labels: 211225
Total rows after removing empty labels: 207814

Label Distribution:
 neutral           55298
approval          17620
admiration        17131
annoyance         13618
gratitude         11625
disapproval       11424
curiosity          9692
amusement          9245
realization        8785
optimism           8715
disappointment     8469
love               8191
anger              8084
joy                7983
confusion          7359
sadness            6758
caring             5999
excitement         5629
surprise           5514
disgust            5301
desire             3817
fear               3197
remorse            2525
embarrassment      2476
nervousness        1810
pride              1302
relief             1289
grief               673
dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
texts = df['text'].tolist()
label_matrix = df[label_cols].values
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    label_matrix,
    test_size=0.2,
    random_state=42,
)

print("Train size:", len(train_texts))
print("Test size:", len(test_texts))

Train size: 166251
Test size: 41563


In [6]:
from transformers import BertTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same settings: truncation and padding
# enabled, with max_length set to 128.
encode_options = {"truncation": True, "padding": True, "max_length": 128}
train_encodings = tokenizer(train_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [7]:
import torch

class GoEmotionsDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-example encodings with their label rows.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is an indexable collection with one label
    row per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including float ``labels``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are cast to float32 regardless of how they were stored.
        label_row = torch.tensor(self.labels[idx])
        sample['labels'] = label_row.to(torch.float32)
        return sample

# Wrap each split's encodings and label matrix in a dataset object.
train_dataset = GoEmotionsDataset(encodings=train_encodings, labels=train_labels)
test_dataset = GoEmotionsDataset(encodings=test_encodings, labels=test_labels)

In [8]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

# One output per emotion column, configured as a multi-label problem.
num_emotions = len(label_cols)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    problem_type="multi_label_classification",
    num_labels=num_emotions,
)

# Training configuration collected in one place, then unpacked.
training_config = {
    "output_dir": "./results",
    "logging_dir": "./logs",
    "logging_steps": 100,
    "save_strategy": "epoch",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 32,
}
training_args = TrainingArguments(**training_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from sklearn.metrics import f1_score, hamming_loss
import numpy as np

def compute_metrics(eval_pred, threshold=0.5):
    """Compute micro-averaged F1 and Hamming loss for multi-label predictions.

    The first element of ``eval_pred`` holds the model's raw, unbounded
    logits, not probabilities. A probability cut-off must therefore be
    applied after the sigmoid. Comparing the raw logits with 0.5 would
    correspond to a probability cut-off of about 0.62 and would
    systematically under-predict labels.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the trainer.
        threshold: Probability cut-off, strictly between 0 and 1, above
            which a label counts as predicted. Defaults to 0.5.

    Returns:
        dict with the keys ``"micro_f1"`` and ``"hamming_loss"``.

    Raises:
        ValueError: If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0.0 < threshold < 1.0:
        raise ValueError("threshold must be strictly between 0 and 1")

    logits, labels = eval_pred

    # sigmoid(z) > t  <=>  z > log(t / (1 - t)). Thresholding in logit space
    # gives the same decisions as sigmoid-then-threshold, without evaluating
    # exp() on large-magnitude logits (cut-off is 0 for t = 0.5).
    logit_cutoff = np.log(threshold) - np.log1p(-threshold)
    preds = (np.asarray(logits) > logit_cutoff).astype(int)

    # zero_division=0 keeps the score at 0 (without a warning) when nothing
    # is predicted positive.
    f1 = f1_score(labels, preds, average="micro", zero_division=0)
    h_loss = hamming_loss(labels, preds)
    return {"micro_f1": f1, "hamming_loss": h_loss}

In [10]:
# Work with the first 10,000 training examples only.
subset_size = 10_000
small_train_texts = train_texts[:subset_size]
small_train_labels = train_labels[:subset_size]

# Encode the subset with the same tokenizer settings used for the full splits.
small_train_encodings = tokenizer(
    small_train_texts,
    truncation=True,
    padding=True,
    max_length=128,
)
small_train_dataset = GoEmotionsDataset(small_train_encodings, small_train_labels)


In [None]:
# Train on the reduced training subset; the held-out test split is registered
# as the evaluation dataset and compute_metrics as the metric callback.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()



Step,Training Loss
100,0.2661
200,0.1594
300,0.1576
400,0.1568
500,0.1484
600,0.1439
700,0.1363
800,0.1343
900,0.1334
1000,0.1305


