Emotion Classifier

# Setup

## Imports 

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 200)

import json
from typing import List, Dict, Union


## Reading in the Data

In [2]:
# Load emotion names from file
with open('../datasets/GoEmotions/emotions.txt', 'r') as f:
    emotion_names = [line.strip() for line in f]

# Function to read and expand one file
def load_go_emotions_split(path: str) -> pd.DataFrame:
    """
    Loads and processes a GoEmotions data split from a TSV file, expanding the emotion labels
    into 28 separate one-hot columns for multi-label classification.

    Parameters:
    ----------
    - path: str
        File path to the GoEmotions split (TSV format, tab-separated, no header).

    Returns:
    ----------
    - df: pd.DataFrame
        DataFrame containing columns for 'text', 'emotion_labels', 'id', and 28 one-hot
        emotion columns (one per GoEmotions category).

    Raises:
    ----------
    - FileNotFoundError
        If the specified path does not exist or is incorrect.
        Check the file path and ensure the dataset is downloaded and placed correctly.
    - pd.errors.ParserError
        If the file format is invalid or incorrectly delimited.
        Ensure the file is tab-separated and matches GoEmotions expected structure.
    """
    df = pd.read_csv(path, sep='\t', header=None, names=['text', 'emotion_labels', 'id'])
    # Create 28 one-hot columns, all default to 0
    for i, emo in enumerate(emotion_names):
        df[emo] = 0
    # Fill columns by parsing emotion_labels
    for idx, row in df.iterrows():
        label_idxs = list(map(int, row['emotion_labels'].split(',')))
        for label in label_idxs:
            df.at[idx, emotion_names[label]] = 1
    return df

# Load all splits
go_emotions_train = load_go_emotions_split('../datasets/GoEmotions/train.tsv')
go_emotions_val   = load_go_emotions_split('../datasets/GoEmotions/dev.tsv')
go_emotions_test  = load_go_emotions_split('../datasets/GoEmotions/test.tsv')

# Read in the Ekman mapping
with open('../datasets/GoEmotions/ekman_mapping.json', 'r') as f:
    ekman_map = json.load(f)

## Helper Functions

### Ekman Mapping

In [3]:
def ekman_category_breakdown(
    df: pd.DataFrame,
    emotion_columns: list,
    ekman_mapping: dict
) -> None:
    """
    Prints a clean percentage breakdown of each Ekman umbrella category in the dataset.

    Args:
        df (pd.DataFrame): DataFrame with one-hot columns for emotions.
        emotion_columns (list): List of the 28 emotion column names.
        ekman_mapping (dict): Dict mapping Ekman categories to emotion names.
    """
    total = len(df)
    print("Ekman category percentage breakdown:")
    for ekman_cat, fine_emotions in ekman_mapping.items():
        present = df[fine_emotions].any(axis=1)
        pct = present.sum() / total * 100
        print(f"{ekman_cat:<9} : {pct:.2f}%")


# Exploring Training Set

In [4]:
print("5 random rows of the training set:\n")
display(go_emotions_train.sample(5).style.set_properties(**{'white-space': 'pre-wrap'}))


5 random rows of the training set:



Unnamed: 0,text,emotion_labels,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
40319,[NAME] and I play with people with 400 plus spm and still play noobs please stop the excuses if you’re getting wrecked you just suck,3,edzhzd8,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21167,I can read the anger in this post.,27,ed9xpvd,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
20996,Lol. That too,1,efdp1lo,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8742,rich people usually tip the worst lol,1,efesrdr,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42023,I'm so proud of you!,21,edjbo3e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [5]:
print("go_emotion_train info:\n")
print(go_emotions_train.info())

go_emotion_train info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43410 entries, 0 to 43409
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            43410 non-null  object
 1   emotion_labels  43410 non-null  object
 2   id              43410 non-null  object
 3   admiration      43410 non-null  int64 
 4   amusement       43410 non-null  int64 
 5   anger           43410 non-null  int64 
 6   annoyance       43410 non-null  int64 
 7   approval        43410 non-null  int64 
 8   caring          43410 non-null  int64 
 9   confusion       43410 non-null  int64 
 10  curiosity       43410 non-null  int64 
 11  desire          43410 non-null  int64 
 12  disappointment  43410 non-null  int64 
 13  disapproval     43410 non-null  int64 
 14  disgust         43410 non-null  int64 
 15  embarrassment   43410 non-null  int64 
 16  excitement      43410 non-null  int64 
 17  fear            43410 non-

# Exploring Validation Set

In [6]:
print("5 random rows of the validation set:\n")
display(go_emotions_val.sample(5).style.set_properties(**{'white-space': 'pre-wrap'}))


5 random rows of the validation set:



Unnamed: 0,text,emotion_labels,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
1035,It really is a brilliant piece of nostalgia and terrible film making. I love everything about it.,0,ee8wzl3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3485,"I’ve been there and it’s above a highway, feel like that might raise some eyebrows haha",1,eezxebz,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3052,Neither is beating people up,327,ed1bml7,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4591,That's the funniest thing I've read. I would have had a hard time keeping back a chuckle.,120,eer6vx5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2067,"This is the first I’m hearing of this. Not on any of the local news, but on Reddit! MASSIVE media bias.",26,eeoq2kd,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [7]:
print("go_emotions_val info:\n")
print(go_emotions_val.info())

go_emotions_val info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            5426 non-null   object
 1   emotion_labels  5426 non-null   object
 2   id              5426 non-null   object
 3   admiration      5426 non-null   int64 
 4   amusement       5426 non-null   int64 
 5   anger           5426 non-null   int64 
 6   annoyance       5426 non-null   int64 
 7   approval        5426 non-null   int64 
 8   caring          5426 non-null   int64 
 9   confusion       5426 non-null   int64 
 10  curiosity       5426 non-null   int64 
 11  desire          5426 non-null   int64 
 12  disappointment  5426 non-null   int64 
 13  disapproval     5426 non-null   int64 
 14  disgust         5426 non-null   int64 
 15  embarrassment   5426 non-null   int64 
 16  excitement      5426 non-null   int64 
 17  fear            5426 non-null

# Exploring Test Set

In [8]:
print("5 random rows of the test set:\n")
display(go_emotions_test.sample(5).style.set_properties(**{'white-space': 'pre-wrap'}))


5 random rows of the test set:



Unnamed: 0,text,emotion_labels,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
413,"Don't listen to the idiot, we love your posts!",5,edv7ojx,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4878,Become friends with people who work night shift. We are always up at crazy hours and need a distraction from our work.,417,ede0xwo,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1762,To match the Mets green st paddy’s jerseys,27,eenkjpw,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2531,"You're my mvp for the night haha. I'll drop by there tonight, thanks!",115,eeynlc0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2314,Or just spend ten seconds around some fire and realize that it's super hot.,22,ee4w0uu,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [9]:
print("go_emotions_test info:\n")
print(go_emotions_test.info())

go_emotions_test info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5427 entries, 0 to 5426
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            5427 non-null   object
 1   emotion_labels  5427 non-null   object
 2   id              5427 non-null   object
 3   admiration      5427 non-null   int64 
 4   amusement       5427 non-null   int64 
 5   anger           5427 non-null   int64 
 6   annoyance       5427 non-null   int64 
 7   approval        5427 non-null   int64 
 8   caring          5427 non-null   int64 
 9   confusion       5427 non-null   int64 
 10  curiosity       5427 non-null   int64 
 11  desire          5427 non-null   int64 
 12  disappointment  5427 non-null   int64 
 13  disapproval     5427 non-null   int64 
 14  disgust         5427 non-null   int64 
 15  embarrassment   5427 non-null   int64 
 16  excitement      5427 non-null   int64 
 17  fear            5427 non-nul

# Building the Classifier

In [10]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-roberta-base")


OSError: j-hartmann/emotion-english-roberta-base is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [None]:
class GoEmotionsDataset(torch.utils.data.Dataset):
    """
    A PyTorch Dataset class for the GoEmotions dataset. Handles input text cleaning, 
    tokenization, and packaging of emotion labels for multi-label classification.

    Parameters:
    ----------
    - texts: List[str]
        List of input text strings (already pre-cleaned, if necessary).
    - labels: List[List[float]] or np.ndarray
        List or array of 28-dimensional binary/multilabel emotion labels per sample.
    - tokenizer: PreTrainedTokenizer
        A HuggingFace tokenizer, e.g., BertTokenizer, to tokenize the input texts.
    - max_len: int
        Maximum length for tokenized input (default: MAX_LEN).

    Returns:
    ----------
    - __getitem__ returns a Dict[str, torch.Tensor]
        Dictionary containing:
            'input_ids': tensor of token indices,
            'attention_mask': tensor of attention mask,
            'labels': tensor of ground truth emotion labels.

    Raises:
    ----------
    - IndexError
        If `idx` is out of bounds for the dataset.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        """
        Initialize the GoEmotionsDataset with text, labels, and tokenizer.

        Parameters:
        ----------
        - texts: List[str]
            List of user input texts.
        - labels: List[List[float]] or np.ndarray
            Corresponding list/array of 28-dim label vectors.
        - tokenizer: PreTrainedTokenizer
            Tokenizer instance (e.g. BertTokenizer).
        - max_len: int
            Maximum length for tokenized sequences.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self) -> int:
        """
        Returns the number of samples in the dataset.

        Returns:
        ----------
        - length: int
            Number of data samples (same as len(texts)).
        """
        return len(self.texts)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        """
        Retrieves and tokenizes the sample at the given index.

        Parameters:
        ----------
        - idx: int
            Index of the sample to retrieve.

        Returns:
        ----------
        - sample: Dict[str, torch.Tensor]
            Dictionary with input_ids, attention_mask, and labels tensors.

        Raises:
        ----------
        - IndexError
            If idx is not within the range of the dataset.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.FloatTensor(self.labels[idx])
        }
