Emotion Classifier

# Setup

## Imports 

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 200)

import json
from typing import List, Dict, Union


## Reading in the Data

In [2]:
# Load emotion names from file
with open('../datasets/GoEmotions/emotions.txt', 'r') as f:
    emotion_names = [line.strip() for line in f]

# Function to read and expand one file
def load_go_emotions_split(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['text', 'emotion_labels', 'id'])
    # Create 28 one-hot columns, all default to 0
    for i, emo in enumerate(emotion_names):
        df[emo] = 0
    # Fill columns by parsing emotion_labels
    for idx, row in df.iterrows():
        label_idxs = list(map(int, row['emotion_labels'].split(',')))
        for label in label_idxs:
            df.at[idx, emotion_names[label]] = 1
    return df

# Load all splits
go_emotions_train = load_go_emotions_split('../datasets/GoEmotions/train.tsv')
go_emotions_val   = load_go_emotions_split('../datasets/GoEmotions/dev.tsv')
go_emotions_test  = load_go_emotions_split('../datasets/GoEmotions/test.tsv')

# Read in the Ekman mapping
with open('../datasets/GoEmotions/ekman_mapping.json', 'r') as f:
    ekman_map = json.load(f)

## Helper Functions

### Ekman Mapping

In [3]:
def ekman_category_breakdown(
    df: pd.DataFrame,
    emotion_columns: list,
    ekman_mapping: dict
) -> None:
    """
    Prints a clean percentage breakdown of each Ekman umbrella category in the dataset.

    Args:
        df (pd.DataFrame): DataFrame with one-hot columns for emotions.
        emotion_columns (list): List of the 28 emotion column names.
        ekman_mapping (dict): Dict mapping Ekman categories to emotion names.
    """
    total = len(df)
    print("Ekman category percentage breakdown:")
    for ekman_cat, fine_emotions in ekman_mapping.items():
        present = df[fine_emotions].any(axis=1)
        pct = present.sum() / total * 100
        print(f"{ekman_cat:<9} : {pct:.2f}%")


### Text Stats

In [4]:
def text_length_stats(
    df: pd.DataFrame,
    text_col: str = "text",
    by: str = "char",
    return_df: bool = False
):
    """
    Print a neatly aligned stats table for lengths of the text column (char or word count).

    Args:
        df (pd.DataFrame): DataFrame with the text data.
        text_col (str): Column name containing text. Default 'text'.
        by (str): 'char' for character count, 'word' for word count.
        return_df (bool): If True, also return the stats as a DataFrame.

    Returns:
        Optional[pd.DataFrame]: Stats DataFrame if requested.
    """
    if by == "char":
        lengths = df[text_col].astype(str).apply(len)
    elif by == "word":
        lengths = df[text_col].astype(str).apply(lambda x: len(x.split()))
    else:
        raise ValueError("`by` must be 'char' or 'word'")

    stats = [
        ("count",        lengths.count()),
        ("min",          lengths.min()),
        ("Q1",           lengths.quantile(0.25)),
        ("median",       lengths.median()),
        ("mean",         lengths.mean()),
        ("Q3",           lengths.quantile(0.75)),
        ("max",          lengths.max()),
        ("mode",         lengths.mode().values[0] if not lengths.mode().empty else None),
        ("std dev",      lengths.std()),
        ("variance",     lengths.var()),
        ("IQR",          lengths.quantile(0.75) - lengths.quantile(0.25)),
    ]
    print(f"\n{'Text length statistics (' + ('characters' if by=='char' else 'words') + ')':^36}")
    print("=" * 36)
    for label, val in stats:
        print(f"{label:<10}: {val:>10.2f}" if isinstance(val, float) else f"{label:<10}: {val:>10}")
    if return_df:
        return pd.DataFrame(stats, columns=["statistic", "value"]).set_index("statistic")


# Exploring Training Set

In [5]:
print("5 random rows of the training set:\n")
display(go_emotions_train.sample(5).style.set_properties(**{'white-space': 'pre-wrap'}))


5 random rows of the training set:



Unnamed: 0,text,emotion_labels,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
8164,"Guys, are you really sure those are men?",6,ef6126n,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7768,and everyone else is worried about themselves not paying attention to you,19,eeq4ja6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
32118,just some freak accident with the medicine he was prescribed. He had a seizure,27,efbvdad,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
24108,😵 when 😤 mommy 🤨 gets 👀 the 🤔 🅱️endies 😫 just 🔥🔥🔥 right 👌,27,ef4h3fk,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
26483,That guy was definitely in the process of starting a cult.,27,edosubc,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [6]:
print("go_emotion_train info:\n")
print(go_emotions_train.info())

go_emotion_train info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43410 entries, 0 to 43409
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            43410 non-null  object
 1   emotion_labels  43410 non-null  object
 2   id              43410 non-null  object
 3   admiration      43410 non-null  int64 
 4   amusement       43410 non-null  int64 
 5   anger           43410 non-null  int64 
 6   annoyance       43410 non-null  int64 
 7   approval        43410 non-null  int64 
 8   caring          43410 non-null  int64 
 9   confusion       43410 non-null  int64 
 10  curiosity       43410 non-null  int64 
 11  desire          43410 non-null  int64 
 12  disappointment  43410 non-null  int64 
 13  disapproval     43410 non-null  int64 
 14  disgust         43410 non-null  int64 
 15  embarrassment   43410 non-null  int64 
 16  excitement      43410 non-null  int64 
 17  fear            43410 non-

In [7]:
ekman_category_breakdown(go_emotions_train, emotion_names, ekman_map)

Ekman category percentage breakdown:
anger     : 12.85%
disgust   : 1.83%
fear      : 1.67%
joy       : 40.11%
sadness   : 7.52%
surprise  : 12.36%


In [8]:
text_length_stats(go_emotions_train, text_col="text", by="char")


Text length statistics (characters) 
count     :      43410
min       :          2
Q1        :      38.00
median    :      65.00
mean      :      68.40
Q3        :      96.00
max       :        703
mode      :         56
std dev   :      36.72
variance  :    1348.50
IQR       :      58.00


# Exploring Validation Set

In [9]:
print("5 random rows of the validation set:\n")
display(go_emotions_val.sample(5).style.set_properties(**{'white-space': 'pre-wrap'}))


5 random rows of the validation set:



Unnamed: 0,text,emotion_labels,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
4107,"Also, AAA",27,ed0kepy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3056,That boy of yours is a real hero,0,edbfhws,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2372,Does it tickle inside your nostril? I always imagined it would be annoying.,3,edkgol2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4818,oh [NAME]...,13,edf1zmd,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2966,[NAME] that was funny too. She was mothering him the whole episode,1,eezttob,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
print("go_emotions_val info:\n")
print(go_emotions_val.info())

go_emotions_val info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            5426 non-null   object
 1   emotion_labels  5426 non-null   object
 2   id              5426 non-null   object
 3   admiration      5426 non-null   int64 
 4   amusement       5426 non-null   int64 
 5   anger           5426 non-null   int64 
 6   annoyance       5426 non-null   int64 
 7   approval        5426 non-null   int64 
 8   caring          5426 non-null   int64 
 9   confusion       5426 non-null   int64 
 10  curiosity       5426 non-null   int64 
 11  desire          5426 non-null   int64 
 12  disappointment  5426 non-null   int64 
 13  disapproval     5426 non-null   int64 
 14  disgust         5426 non-null   int64 
 15  embarrassment   5426 non-null   int64 
 16  excitement      5426 non-null   int64 
 17  fear            5426 non-null

In [11]:
ekman_category_breakdown(go_emotions_val, emotion_names, ekman_map)

Ekman category percentage breakdown:
anger     : 13.21%
disgust   : 1.79%
fear      : 1.94%
joy       : 40.90%
sadness   : 7.19%
surprise  : 11.50%


In [12]:
text_length_stats(go_emotions_val, text_col="text", by="char")


Text length statistics (characters) 
count     :       5426
min       :          5
Q1        :      37.00
median    :      64.00
mean      :      68.24
Q3        :      96.00
max       :        187
mode      :         37
std dev   :      36.91
variance  :    1362.24
IQR       :      59.00


# Exploring Test Set

In [13]:
print("5 random rows of the test set:\n")
display(go_emotions_test.sample(5).style.set_properties(**{'white-space': 'pre-wrap'}))


5 random rows of the test set:



Unnamed: 0,text,emotion_labels,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
3129,When her daughter says she wants to take all the weight from her and put it on her own body so her mom can go outside and play 😢❤️😢,27,eebhxqu,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3006,Rip the guy from psych,16,eedxtl7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2987,There is no such Thing as [NAME] or an Almighty !,10,eepmpf7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4962,I was in awe the whole hour I spent in there.,14,efb7e1k,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
105,it is actually called a mechanical bull,4,edpnfan,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
print("go_emotions_test info:\n")
print(go_emotions_test.info())

go_emotions_test info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5427 entries, 0 to 5426
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            5427 non-null   object
 1   emotion_labels  5427 non-null   object
 2   id              5427 non-null   object
 3   admiration      5427 non-null   int64 
 4   amusement       5427 non-null   int64 
 5   anger           5427 non-null   int64 
 6   annoyance       5427 non-null   int64 
 7   approval        5427 non-null   int64 
 8   caring          5427 non-null   int64 
 9   confusion       5427 non-null   int64 
 10  curiosity       5427 non-null   int64 
 11  desire          5427 non-null   int64 
 12  disappointment  5427 non-null   int64 
 13  disapproval     5427 non-null   int64 
 14  disgust         5427 non-null   int64 
 15  embarrassment   5427 non-null   int64 
 16  excitement      5427 non-null   int64 
 17  fear            5427 non-nul

In [15]:
ekman_category_breakdown(go_emotions_test, emotion_names, ekman_map)

Ekman category percentage breakdown:
anger     : 13.38%
disgust   : 2.27%
fear      : 1.81%
joy       : 38.77%
sadness   : 6.98%
surprise  : 12.47%


In [16]:
text_length_stats(go_emotions_test, text_col="text", by="char")


Text length statistics (characters) 
count     :       5427
min       :          5
Q1        :      37.00
median    :      65.00
mean      :      67.82
Q3        :      95.00
max       :        184
mode      :         24
std dev   :      36.32
variance  :    1319.03
IQR       :      58.00


# Prototype CNN

In [17]:
sample = "UGH srsly my head is POUNDING! got no sleep AGAIN? fUCK this BS :( cant even think. just wanna cry rn 😭"

## Tokenization

In [18]:
import string

# Create a simple character vocabulary (expand as needed)
all_chars = list(string.ascii_lowercase + string.ascii_uppercase + string.digits +
                 string.punctuation + string.whitespace + "😴😭")
char2idx = {c: i for i, c in enumerate(all_chars)}
vocab_size = len(char2idx)

# Tokenize the sample
tokens = [char2idx.get(c, 0) for c in sample]
print(tokens)


[46, 32, 33, 94, 18, 17, 18, 11, 24, 94, 12, 24, 94, 7, 4, 0, 3, 94, 8, 18, 94, 41, 40, 46, 39, 29, 34, 39, 32, 62, 94, 6, 14, 19, 94, 13, 14, 94, 18, 11, 4, 4, 15, 94, 26, 32, 26, 34, 39, 82, 94, 5, 46, 28, 36, 94, 19, 7, 8, 18, 94, 27, 44, 94, 77, 69, 94, 2, 0, 13, 19, 94, 4, 21, 4, 13, 94, 19, 7, 8, 13, 10, 75, 94, 9, 20, 18, 19, 94, 22, 0, 13, 13, 0, 94, 2, 17, 24, 94, 17, 13, 94, 101]


## 16d character vectors

In [19]:
import torch
import torch.nn as nn

# 16d character embeddings
embedding_dim = 16
embeddings = nn.Embedding(vocab_size, embedding_dim)

char_tensor = torch.tensor(tokens).unsqueeze(0)  # shape: (1, seq_len)
char_embedded = embeddings(char_tensor)          # shape: (1, seq_len, 16)
print(char_embedded.shape)
print(char_embedded[0])  # Print the embeddings for each character


torch.Size([1, 103, 16])
tensor([[ 0.8550, -0.0842, -1.2513,  ...,  0.9717, -0.7049, -0.0746],
        [-0.4235, -0.0500, -0.0945,  ..., -1.4501,  1.1996,  0.4694],
        [-0.5339, -1.0215, -0.8197,  ...,  0.0403,  1.2804, -1.3513],
        ...,
        [-1.0323, -0.8736,  0.6751,  ...,  0.2274,  0.3417, -1.0639],
        [-0.8868,  0.2484,  2.3376,  ..., -0.2519, -0.7631, -0.8725],
        [-0.4366, -1.8253,  1.6152,  ..., -0.5620, -2.0106,  0.7814]],
       grad_fn=<SelectBackward0>)


## trigrams

In [20]:
conv3 = nn.Conv1d(in_channels=embedding_dim, out_channels=21, kernel_size=3, padding=1)
x3 = char_embedded.permute(0, 2, 1)  # (batch, channels, seq_len)
trigrams = conv3(x3)                 # shape: (1, 21, seq_len)
print(trigrams.shape)
print(trigrams[0].permute(1, 0))     # (seq_len, 21)


torch.Size([1, 21, 103])
tensor([[ 0.7125, -0.0503, -0.4021,  ...,  0.3132,  0.1968, -0.8263],
        [-0.6589, -0.6125, -0.4134,  ..., -0.4733, -0.3157, -0.2643],
        [-0.5124,  0.1027, -0.1836,  ..., -0.1451, -0.0095,  0.2528],
        ...,
        [ 0.3355,  0.2999, -0.4458,  ..., -0.6516,  0.1231,  0.7894],
        [ 1.1371, -0.0837,  1.3684,  ...,  0.0585,  0.2184, -0.0636],
        [ 0.6794,  0.0751, -0.6729,  ..., -0.4675,  1.0025, -0.6393]],
       grad_fn=<PermuteBackward0>)


## pentagrams

In [21]:
conv5 = nn.Conv1d(in_channels=embedding_dim, out_channels=21, kernel_size=5, padding=2)
pentagrams = conv5(x3)               # shape: (1, 21, seq_len)
print(pentagrams.shape)
print(pentagrams[0].permute(1, 0))   # (seq_len, 21)


torch.Size([1, 21, 103])
tensor([[ 0.3345,  0.1927, -0.5291,  ...,  0.5251, -0.1178, -0.2576],
        [ 0.7174,  0.8411,  0.3307,  ..., -0.2358, -0.0384, -0.3798],
        [ 0.5044, -0.5822, -0.2527,  ...,  1.6129, -0.1530, -0.4539],
        ...,
        [ 0.8198, -0.1547, -0.4959,  ..., -0.5871, -0.3672,  0.3238],
        [ 0.0587, -0.7694,  0.2390,  ..., -0.4443, -0.6162, -0.7574],
        [ 0.0483, -1.2634,  0.4807,  ..., -1.0715, -0.4078,  0.0978]],
       grad_fn=<PermuteBackward0>)


## Heptagrams

In [22]:
conv7 = nn.Conv1d(in_channels=embedding_dim, out_channels=22, kernel_size=7, padding=3)
heptagrams = conv7(x3)               # shape: (1, 22, seq_len)
print(heptagrams.shape)
print(heptagrams[0].permute(1, 0))   # (seq_len, 22)


torch.Size([1, 22, 103])
tensor([[-0.3280, -0.2087,  0.6636,  ..., -0.0744, -0.1211, -0.4341],
        [ 0.1917,  0.5555, -0.0041,  ...,  0.0149,  0.6888,  0.4496],
        [-0.1129, -0.5982, -0.5074,  ...,  0.2457, -0.0248, -0.5470],
        ...,
        [ 0.1284, -0.1166,  0.3225,  ...,  0.1613,  0.5406,  0.1644],
        [-0.4250, -0.7461, -0.1505,  ...,  1.2052,  0.0052,  0.3832],
        [ 0.8287, -1.1149,  0.4158,  ...,  1.0058, -0.1387,  0.1036]],
       grad_fn=<PermuteBackward0>)


## max pooling

In [23]:
# For each set of filters, take the max across the time dimension (seq_len)
trigram_pooled = torch.max(trigrams, dim=2).values  # (1, 21)
pentagram_pooled = torch.max(pentagrams, dim=2).values  # (1, 21)
heptagram_pooled = torch.max(heptagrams, dim=2).values  # (1, 22)

print(trigram_pooled)
print(pentagram_pooled)
print(heptagram_pooled)


tensor([[1.8359, 1.8816, 1.3684, 1.2196, 0.8973, 1.6204, 1.4869, 1.2838, 1.3287,
         1.0230, 1.9242, 1.3378, 1.5327, 1.6773, 1.8859, 1.2154, 1.1534, 1.7209,
         0.9474, 1.7464, 1.2381]], grad_fn=<MaxBackward0>)
tensor([[1.2853, 0.8545, 1.1612, 2.0426, 1.6400, 1.7351, 1.1632, 1.5184, 1.2468,
         1.8105, 0.7385, 1.6442, 1.9000, 1.2503, 1.2655, 1.4325, 0.8988, 1.9212,
         1.6129, 1.4341, 1.7036]], grad_fn=<MaxBackward0>)
tensor([[1.1235, 1.0015, 1.4288, 1.4651, 1.5506, 1.4040, 1.5391, 1.5522, 2.1932,
         1.2006, 1.8512, 2.1120, 1.4071, 1.7684, 1.6070, 1.4090, 1.8539, 1.8590,
         1.9513, 1.6780, 1.3598, 1.1843]], grad_fn=<MaxBackward0>)


## 64d output

In [24]:
final_cnn_output = torch.cat([trigram_pooled, pentagram_pooled, heptagram_pooled], dim=1)  # (1, 64)
print(final_cnn_output.shape)
print(final_cnn_output)


torch.Size([1, 64])
tensor([[1.8359, 1.8816, 1.3684, 1.2196, 0.8973, 1.6204, 1.4869, 1.2838, 1.3287,
         1.0230, 1.9242, 1.3378, 1.5327, 1.6773, 1.8859, 1.2154, 1.1534, 1.7209,
         0.9474, 1.7464, 1.2381, 1.2853, 0.8545, 1.1612, 2.0426, 1.6400, 1.7351,
         1.1632, 1.5184, 1.2468, 1.8105, 0.7385, 1.6442, 1.9000, 1.2503, 1.2655,
         1.4325, 0.8988, 1.9212, 1.6129, 1.4341, 1.7036, 1.1235, 1.0015, 1.4288,
         1.4651, 1.5506, 1.4040, 1.5391, 1.5522, 2.1932, 1.2006, 1.8512, 2.1120,
         1.4071, 1.7684, 1.6070, 1.4090, 1.8539, 1.8590, 1.9513, 1.6780, 1.3598,
         1.1843]], grad_fn=<CatBackward0>)
