In [1]:
import os
import re
import string
import emoji
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import torch
from transformers import BertForSequenceClassification
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim


import warnings
warnings.filterwarnings('ignore')

In [2]:
# NLTK pieces: stop-word lists, the WordNet corpus and its lemmatizer.
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Fetch the two corpora the imports above refer to.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the three GoEmotions splits. Each file is a header-less TSV whose three
# columns are named 'Text', 'Class' and 'ID' here. All splits are read from a
# single root so the notebook does not mix relative and absolute locations
# for the same dataset.
DATA_DIR = "../input/goemotions/data"
SPLIT_COLUMNS = ['Text', 'Class', 'ID']


def load_split(filename):
    """Read one split file from DATA_DIR into a DataFrame.

    Parameters
    ----------
    filename : str
        File name inside DATA_DIR, e.g. "train.tsv".

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'Text', 'Class' and 'ID'.
    """
    return pd.read_csv(
        os.path.join(DATA_DIR, filename),
        sep='\t',
        header=None,
        names=SPLIT_COLUMNS,
    )


train_data = load_split("train.tsv")
valid_data = load_split("dev.tsv")
test_data = load_split("test.tsv")

In [4]:
# Preview the first rows to check that the three TSV columns were parsed as expected.
train_data.head()

Unnamed: 0,Text,Class,ID
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


In [5]:
# Summarise column dtypes and non-null counts for the training split.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43410 entries, 0 to 43409
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    43410 non-null  object
 1   Class   43410 non-null  object
 2   ID      43410 non-null  object
dtypes: object(3)
memory usage: 1017.5+ KB


In [6]:
# Break the comma-separated 'Class' string into a list of label ids and record
# how many labels each example carries; every split gets the same two columns.
for split_frame in (train_data, valid_data, test_data):
    split_frame['Class List'] = split_frame['Class'].apply(lambda labels: labels.split(','))
    split_frame['Class Length'] = split_frame['Class List'].apply(len)

In [7]:
# Distribution of the number of labels per training example.
train_data["Class Length"].value_counts()


Class Length
1    36308
2     6541
3      532
4       28
5        1
Name: count, dtype: int64

In [8]:
# Parse the JSON file that groups the fine-grained emotions under broader categories.
with open('../input/goemotions/data/ekman_mapping.json') as mapping_file:
    ekman_mapping = json.load(mapping_file)

# Keep the mapping as the cell's last expression so it is displayed.
ekman_mapping

{'anger': ['anger', 'annoyance', 'disapproval'],
 'disgust': ['disgust'],
 'fear': ['fear', 'nervousness'],
 'joy': ['joy',
  'amusement',
  'approval',
  'excitement',
  'gratitude',
  'love',
  'optimism',
  'relief',
  'pride',
  'admiration',
  'desire',
  'caring'],
 'sadness': ['sadness', 'disappointment', 'embarrassment', 'grief', 'remorse'],
 'surprise': ['surprise', 'realization', 'confusion', 'curiosity']}

In [9]:
# Load the emotion names, one per line; label ids are used as positions in
# this list. The context manager guarantees the file handle is closed, and
# blank lines are skipped so a trailing newline in the file cannot add an
# empty "emotion" and shift the number of labels.
with open("../input/goemotions/data/emotions.txt", "r") as emotion_file:
    emotion_list = [line.strip() for line in emotion_file if line.strip()]
print(emotion_list)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [10]:
def idx2class(idx_list):
    """Translate label ids into emotion names.

    Each entry of ``idx_list`` is converted with ``int`` and used as a
    position in the module-level ``emotion_list``. The names are returned
    as a list in the same order as the ids.
    """
    return [emotion_list[int(label_id)] for label_id in idx_list]
# Attach the emotion names that correspond to each example's label ids, on every split.
for split_frame in (train_data, valid_data, test_data):
    split_frame['Emotions'] = split_frame['Class List'].apply(idx2class)

In [11]:
# Check the derived 'Class List', 'Class Length' and 'Emotions' columns on the first rows.
train_data.head()

Unnamed: 0,Text,Class,ID,Class List,Class Length,Emotions
0,My favourite food is anything I didn't have to...,27,eebbqej,[27],1,[neutral]
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,[27],1,[neutral]
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,[2],1,[anger]
3,To make her feel threatened,14,ed7ypvh,[14],1,[fear]
4,Dirty Southern Wankers,3,ed0bdzj,[3],1,[annoyance]


In [12]:
# Lookup table: emotion name -> its position in emotion_list.
emotion_to_idx = dict(zip(emotion_list, range(len(emotion_list))))


def encode_emotions(emotions):
    """Build a multi-hot list for the given emotion names.

    The result has one slot per entry of ``emotion_to_idx``. A slot is set
    to 1 when its emotion occurs in ``emotions`` and stays 0 otherwise.
    A name that is missing from ``emotion_to_idx`` raises ``KeyError``.
    """
    vector = [0 for _ in emotion_to_idx]
    for name in emotions:
        vector[emotion_to_idx[name]] = 1
    return vector

In [13]:
# Encode each training example's emotion names as a multi-hot label vector.
train_data['emotion_vector'] = train_data['Emotions'].map(encode_emotions)

In [14]:
# Check that the first rows now carry the 'emotion_vector' column.
train_data.head()

Unnamed: 0,Text,Class,ID,Class List,Class Length,Emotions,emotion_vector
0,My favourite food is anything I didn't have to...,27,eebbqej,[27],1,[neutral],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,[27],1,[neutral],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,[2],1,[anger],"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,To make her feel threatened,14,ed7ypvh,[14],1,[fear],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,Dirty Southern Wankers,3,ed0bdzj,[3],1,[annoyance],"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
class EmotionDataset(Dataset):
    """Torch dataset that tokenizes one text per item and pairs it with its label vector.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a 'Text' column and an 'emotion_vector' column.
    tokenizer : callable
        Called as ``tokenizer(text, padding=..., truncation=..., max_length=...)``
        and expected to return a mapping with 'input_ids' and 'attention_mask'.
    max_len : int
        Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = dataframe['Text']
        # Labels are materialised once as a plain Python list.
        self.labels = dataframe['emotion_vector'].tolist()

    def __len__(self):
        """Number of examples, i.e. the length of the 'Text' column."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the tensors for the example at positional ``index``.

        The returned dict holds 'input_ids', 'attention_mask' and a float
        'labels' tensor, in that order.
        """
        # Positional lookup, so the frame's own index labels do not matter.
        encoded = self.tokenizer(
            self.texts.iloc[index],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )
        item = {key: torch.tensor(encoded[key]) for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[index], dtype=torch.float)
        return item

In [16]:
from transformers import BertTokenizer, BertForSequenceClassification

# EmotionDataset pads or truncates every example to MAX_LEN tokens.
# NOTE(review): 512 looks like the model's maximum input length — a smaller
# value may be enough for this data; confirm before changing.
MAX_LEN = 512
BATCH_SIZE = 16

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = EmotionDataset(train_data, tokenizer, MAX_LEN)
# Training batches are drawn in shuffled order.
loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [17]:
# Size of the classification head: one logit per label.
num_labels = 28
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

# Multi-label objective: binary cross-entropy computed directly on the logits.
loss_fn = nn.BCEWithLogitsLoss()

# AdamW over all of the model's parameters.
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Put the model on that device before any DataParallel wrapping happens.
model = model.to(device)

In [19]:
pip install torch_xla

Collecting torch_xla
  Downloading torch_xla-2.6.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (21 kB)
Downloading torch_xla-2.6.0-cp310-cp310-manylinux_2_28_x86_64.whl (93.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.6/93.6 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: torch_xla
Successfully installed torch_xla-2.6.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Fine-tune the BERT classifier that was built and handed to `optimizer`
# earlier in the notebook. The model must NOT be re-created here: the
# optimizer only updates the parameters it was constructed with, so training
# a fresh model instance would leave its weights untouched.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# nn.DataParallel splits each batch across CUDA devices, so it is only useful
# with more than one GPU. The isinstance guard keeps the cell re-runnable
# without wrapping the model twice.
if torch.cuda.device_count() > 1 and not isinstance(model, nn.DataParallel):
    model = nn.DataParallel(model)

NUM_EPOCHS = 10  # number of full passes over `loader`

# Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    epoch_loss = 0.0
    for batch in loader:
        optimizer.zero_grad()

        # Move the input data to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].float().to(device)  # BCEWithLogitsLoss expects float targets

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Loss is computed here (not inside the model) from the raw logits
        loss = loss_fn(outputs.logits, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss for the epoch average
        epoch_loss += loss.item()

    # Mean loss per batch for this epoch
    avg_loss = epoch_loss / len(loader)
    print(f"Epoch {epoch+1} Loss: {avg_loss}")

In [None]:
output_directory = "/kaggle/working/bert_emotion_classifier"

# save_pretrained is a method of the Hugging Face model, not of the
# nn.DataParallel wrapper. Decide whether to unwrap from what `model`
# actually is, rather than from the number of GPUs, so saving works whether
# or not the model was wrapped.
model_to_save = model.module if isinstance(model, nn.DataParallel) else model
model_to_save.save_pretrained(output_directory)

# Save the tokenizer next to the model weights
tokenizer.save_pretrained(output_directory)
print(f"Model saved as: {output_directory}")