Emotion Classifier

# Setup

## Imports 

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 200)

import json
from typing import List, Dict, Union


## Reading in the Data

In [2]:
# Load emotion names from file
with open('../datasets/GoEmotions/emotions.txt', 'r') as f:
    emotion_names = [line.strip() for line in f]

# Function to read and expand one file
def load_go_emotions_split(path):
    df = pd.read_csv(path, sep='\t', header=None, names=['text', 'emotion_labels', 'id'])
    # Create 28 one-hot columns, all default to 0
    for i, emo in enumerate(emotion_names):
        df[emo] = 0
    # Fill columns by parsing emotion_labels
    for idx, row in df.iterrows():
        label_idxs = list(map(int, row['emotion_labels'].split(',')))
        for label in label_idxs:
            df.at[idx, emotion_names[label]] = 1
    return df

# Load all splits
go_emotions_train = load_go_emotions_split('../datasets/GoEmotions/train.tsv')
go_emotions_val   = load_go_emotions_split('../datasets/GoEmotions/dev.tsv')
go_emotions_test  = load_go_emotions_split('../datasets/GoEmotions/test.tsv')

# Read in the Ekman mapping
with open('../datasets/GoEmotions/ekman_mapping.json', 'r') as f:
    ekman_map = json.load(f)

## Helper Functions

### Ekman Mapping

In [3]:
def ekman_category_breakdown(
    df: pd.DataFrame,
    emotion_columns: list,
    ekman_mapping: dict
) -> None:
    """
    Prints a clean percentage breakdown of each Ekman umbrella category in the dataset.

    Args:
        df (pd.DataFrame): DataFrame with one-hot columns for emotions.
        emotion_columns (list): List of the 28 emotion column names.
        ekman_mapping (dict): Dict mapping Ekman categories to emotion names.
    """
    total = len(df)
    print("Ekman category percentage breakdown:")
    for ekman_cat, fine_emotions in ekman_mapping.items():
        present = df[fine_emotions].any(axis=1)
        pct = present.sum() / total * 100
        print(f"{ekman_cat:<9} : {pct:.2f}%")


### Text Stats

In [4]:
def text_length_stats(
    df: pd.DataFrame,
    text_col: str = "text",
    by: str = "char",
    return_df: bool = False
):
    """
    Print a neatly aligned stats table for lengths of the text column (char or word count).

    Args:
        df (pd.DataFrame): DataFrame with the text data.
        text_col (str): Column name containing text. Default 'text'.
        by (str): 'char' for character count, 'word' for word count.
        return_df (bool): If True, also return the stats as a DataFrame.

    Returns:
        Optional[pd.DataFrame]: Stats DataFrame if requested.
    """
    if by == "char":
        lengths = df[text_col].astype(str).apply(len)
    elif by == "word":
        lengths = df[text_col].astype(str).apply(lambda x: len(x.split()))
    else:
        raise ValueError("`by` must be 'char' or 'word'")

    stats = [
        ("count",        lengths.count()),
        ("min",          lengths.min()),
        ("Q1",           lengths.quantile(0.25)),
        ("median",       lengths.median()),
        ("mean",         lengths.mean()),
        ("Q3",           lengths.quantile(0.75)),
        ("max",          lengths.max()),
        ("mode",         lengths.mode().values[0] if not lengths.mode().empty else None),
        ("std dev",      lengths.std()),
        ("variance",     lengths.var()),
        ("IQR",          lengths.quantile(0.75) - lengths.quantile(0.25)),
    ]
    print(f"\n{'Text length statistics (' + ('characters' if by=='char' else 'words') + ')':^36}")
    print("=" * 36)
    for label, val in stats:
        print(f"{label:<10}: {val:>10.2f}" if isinstance(val, float) else f"{label:<10}: {val:>10}")
    if return_df:
        return pd.DataFrame(stats, columns=["statistic", "value"]).set_index("statistic")


# Exploring Training Set

In [5]:
print("5 random rows of the training set:\n")
display(go_emotions_train.sample(5).style.set_properties(**{'white-space': 'pre-wrap'}))


5 random rows of the training set:



Unnamed: 0,text,emotion_labels,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
42050,"I don’t get why he hasn’t brought his insta account back, he did used to use it from time to time",6,eduux4x,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2043,What's the fucking point of this post? Is just baseless claims with no proof to back them up.,2,ee2tvlj,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37775,"Can I comment on the lost opportunity/remarkable restraint by [NAME] to not have a ""MLK I Have a Dream Mattress Sale""",7,eeczpcn,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29410,Reserved for when the wizards blow my 3 team parlay,27,efbcplr,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
29934,Don't cope,310,ed7ehin,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
print("go_emotion_train info:\n")
print(go_emotions_train.info())

go_emotion_train info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43410 entries, 0 to 43409
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            43410 non-null  object
 1   emotion_labels  43410 non-null  object
 2   id              43410 non-null  object
 3   admiration      43410 non-null  int64 
 4   amusement       43410 non-null  int64 
 5   anger           43410 non-null  int64 
 6   annoyance       43410 non-null  int64 
 7   approval        43410 non-null  int64 
 8   caring          43410 non-null  int64 
 9   confusion       43410 non-null  int64 
 10  curiosity       43410 non-null  int64 
 11  desire          43410 non-null  int64 
 12  disappointment  43410 non-null  int64 
 13  disapproval     43410 non-null  int64 
 14  disgust         43410 non-null  int64 
 15  embarrassment   43410 non-null  int64 
 16  excitement      43410 non-null  int64 
 17  fear            43410 non-

In [7]:
ekman_category_breakdown(go_emotions_train, emotion_names, ekman_map)

Ekman category percentage breakdown:
anger     : 12.85%
disgust   : 1.83%
fear      : 1.67%
joy       : 40.11%
sadness   : 7.52%
surprise  : 12.36%


In [8]:
text_length_stats(go_emotions_train, text_col="text", by="char")


Text length statistics (characters) 
count     :      43410
min       :          2
Q1        :      38.00
median    :      65.00
mean      :      68.40
Q3        :      96.00
max       :        703
mode      :         56
std dev   :      36.72
variance  :    1348.50
IQR       :      58.00


# Exploring Validation Set

In [9]:
print("5 random rows of the validation set:\n")
display(go_emotions_val.sample(5).style.set_properties(**{'white-space': 'pre-wrap'}))


5 random rows of the validation set:



Unnamed: 0,text,emotion_labels,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
3469,"If this gets put in a video, hi mom!",27,ef8jhlq,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4586,I basically had no negative side effects. I did switch to IUD because I hated remembering to take a pill everyday. But no issues.,22,ee18n5p,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4414,That's beautiful and wholesome,0,ef5pgq8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
597,"I like this, but it doesn’t quite convey the horrifying rotting smell. It sounds sort of cute! She was not lol",1,ed4tgj7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4011,Oh cool another manufactured awful [NAME] pushing untalented rapper. It’s getting more obvious as time goes by,10,efh70am,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
print("go_emotions_val info:\n")
print(go_emotions_val.info())

go_emotions_val info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            5426 non-null   object
 1   emotion_labels  5426 non-null   object
 2   id              5426 non-null   object
 3   admiration      5426 non-null   int64 
 4   amusement       5426 non-null   int64 
 5   anger           5426 non-null   int64 
 6   annoyance       5426 non-null   int64 
 7   approval        5426 non-null   int64 
 8   caring          5426 non-null   int64 
 9   confusion       5426 non-null   int64 
 10  curiosity       5426 non-null   int64 
 11  desire          5426 non-null   int64 
 12  disappointment  5426 non-null   int64 
 13  disapproval     5426 non-null   int64 
 14  disgust         5426 non-null   int64 
 15  embarrassment   5426 non-null   int64 
 16  excitement      5426 non-null   int64 
 17  fear            5426 non-null

In [11]:
ekman_category_breakdown(go_emotions_val, emotion_names, ekman_map)

Ekman category percentage breakdown:
anger     : 13.21%
disgust   : 1.79%
fear      : 1.94%
joy       : 40.90%
sadness   : 7.19%
surprise  : 11.50%


In [12]:
text_length_stats(go_emotions_val, text_col="text", by="char")


Text length statistics (characters) 
count     :       5426
min       :          5
Q1        :      37.00
median    :      64.00
mean      :      68.24
Q3        :      96.00
max       :        187
mode      :         37
std dev   :      36.91
variance  :    1362.24
IQR       :      59.00


# Exploring Test Set

In [13]:
print("5 random rows of the test set:\n")
display(go_emotions_test.sample(5).style.set_properties(**{'white-space': 'pre-wrap'}))


5 random rows of the test set:



Unnamed: 0,text,emotion_labels,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
3674,thats even scarier haha,114,ed9w7gf,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1680,I know right. It feels really good when someone acknowledges your existence. I hope more moments like these come for you.,420,ef10f3j,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
428,Author thinks that a guy who bounced back and forth between a bench role and AAA will bounce back and forth between a bench role and AAA.,27,efdy6el,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2868,I will! This is a great new excuse for never leaving my house,0,edycbyu,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3814,Yeah this is pretty eye opening. I’ve had pretty weird relationships with my reflection in the past but thankfully it’s not always this way.,15,ef49up7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
print("go_emotions_test info:\n")
print(go_emotions_test.info())

go_emotions_test info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5427 entries, 0 to 5426
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            5427 non-null   object
 1   emotion_labels  5427 non-null   object
 2   id              5427 non-null   object
 3   admiration      5427 non-null   int64 
 4   amusement       5427 non-null   int64 
 5   anger           5427 non-null   int64 
 6   annoyance       5427 non-null   int64 
 7   approval        5427 non-null   int64 
 8   caring          5427 non-null   int64 
 9   confusion       5427 non-null   int64 
 10  curiosity       5427 non-null   int64 
 11  desire          5427 non-null   int64 
 12  disappointment  5427 non-null   int64 
 13  disapproval     5427 non-null   int64 
 14  disgust         5427 non-null   int64 
 15  embarrassment   5427 non-null   int64 
 16  excitement      5427 non-null   int64 
 17  fear            5427 non-nul

In [15]:
ekman_category_breakdown(go_emotions_test, emotion_names, ekman_map)

Ekman category percentage breakdown:
anger     : 13.38%
disgust   : 2.27%
fear      : 1.81%
joy       : 38.77%
sadness   : 6.98%
surprise  : 12.47%


In [16]:
text_length_stats(go_emotions_test, text_col="text", by="char")


Text length statistics (characters) 
count     :       5427
min       :          5
Q1        :      37.00
median    :      65.00
mean      :      67.82
Q3        :      95.00
max       :        184
mode      :         24
std dev   :      36.32
variance  :    1319.03
IQR       :      58.00


# Prototype CNN

In [17]:
sample = "UGH srsly my head is POUNDING! got no sleep AGAIN? fUCK this BS :( cant even think. just wanna cry rn 😭"

## Tokenization

In [18]:
import string

# Create a simple character vocabulary (expand as needed)
all_chars = list(string.ascii_lowercase + string.ascii_uppercase + string.digits +
                 string.punctuation + string.whitespace + "😴😭")
char2idx = {c: i for i, c in enumerate(all_chars)}
vocab_size = len(char2idx)

# Tokenize the sample
tokens = [char2idx.get(c, 0) for c in sample]
print(tokens)


[46, 32, 33, 94, 18, 17, 18, 11, 24, 94, 12, 24, 94, 7, 4, 0, 3, 94, 8, 18, 94, 41, 40, 46, 39, 29, 34, 39, 32, 62, 94, 6, 14, 19, 94, 13, 14, 94, 18, 11, 4, 4, 15, 94, 26, 32, 26, 34, 39, 82, 94, 5, 46, 28, 36, 94, 19, 7, 8, 18, 94, 27, 44, 94, 77, 69, 94, 2, 0, 13, 19, 94, 4, 21, 4, 13, 94, 19, 7, 8, 13, 10, 75, 94, 9, 20, 18, 19, 94, 22, 0, 13, 13, 0, 94, 2, 17, 24, 94, 17, 13, 94, 101]


## 16d character vectors

In [19]:
import torch
import torch.nn as nn

# 16d character embeddings
embedding_dim = 16
embeddings = nn.Embedding(vocab_size, embedding_dim)

char_tensor = torch.tensor(tokens).unsqueeze(0)  # shape: (1, seq_len)
char_embedded = embeddings(char_tensor)          # shape: (1, seq_len, 16)
print(char_embedded.shape)
print(char_embedded[0])  # Print the embeddings for each character


torch.Size([1, 103, 16])
tensor([[-0.2623,  0.0163, -1.1457,  ...,  0.6902,  2.1133, -0.6992],
        [-0.8260, -0.3303,  0.3652,  ..., -1.2517,  1.1951, -1.4648],
        [ 0.6609, -0.7282, -2.1209,  ...,  0.2498, -0.1948,  1.2765],
        ...,
        [-0.1701, -0.1885,  2.2750,  ...,  0.2596,  0.0101,  0.9115],
        [ 0.2483,  1.0729,  0.0862,  ..., -1.0074,  0.0550,  1.2124],
        [-1.2484,  1.0322,  0.4019,  ...,  2.1959,  0.7616, -0.1421]],
       grad_fn=<SelectBackward0>)


## trigrams

In [20]:
conv3 = nn.Conv1d(in_channels=embedding_dim, out_channels=21, kernel_size=3, padding=1)
x3 = char_embedded.permute(0, 2, 1)  # (batch, channels, seq_len)
trigrams = conv3(x3)                 # shape: (1, 21, seq_len)
print(trigrams.shape)
print(trigrams[0].permute(1, 0))     # (seq_len, 21)


torch.Size([1, 21, 103])
tensor([[ 0.8434, -0.3109, -0.1925,  ..., -0.0108, -0.7222, -0.4374],
        [-0.7701,  0.8539,  0.0445,  ..., -0.6808,  0.3612, -1.0118],
        [ 0.4776, -1.4395,  0.0045,  ...,  0.5090, -1.1187,  0.0083],
        ...,
        [ 0.4338,  0.4519,  0.5709,  ..., -0.6020, -0.4900,  0.7648],
        [-0.6286,  0.0753, -0.0394,  ...,  0.4521, -0.0223,  0.3231],
        [-0.0897, -0.2866,  0.7383,  ...,  0.9964, -0.0135,  0.9860]],
       grad_fn=<PermuteBackward0>)


## pentagrams

In [21]:
conv5 = nn.Conv1d(in_channels=embedding_dim, out_channels=21, kernel_size=5, padding=2)
pentagrams = conv5(x3)               # shape: (1, 21, seq_len)
print(pentagrams.shape)
print(pentagrams[0].permute(1, 0))   # (seq_len, 21)


torch.Size([1, 21, 103])
tensor([[ 1.4851e-01, -6.4492e-01,  1.0086e-01,  ..., -1.9575e-01,
         -6.7704e-02,  1.3031e-03],
        [ 4.5662e-01, -1.8607e-01,  8.0824e-02,  ...,  1.1441e-01,
         -2.2896e-01,  4.0470e-01],
        [ 5.4311e-01,  4.8854e-02,  1.2784e-01,  ...,  1.6663e-01,
          9.3681e-01, -5.3980e-01],
        ...,
        [-8.2917e-01, -1.2030e-01, -9.9546e-02,  ..., -9.3146e-04,
          4.8401e-02, -6.1509e-01],
        [-1.9747e-01,  3.0172e-01, -1.0470e-01,  ...,  1.2386e-01,
         -2.2237e-01,  8.5577e-01],
        [-2.3720e-01,  3.2654e-01,  8.3190e-01,  ..., -1.6210e-02,
          3.6513e-01,  3.2864e-01]], grad_fn=<PermuteBackward0>)


## Heptagrams

In [22]:
conv7 = nn.Conv1d(in_channels=embedding_dim, out_channels=22, kernel_size=7, padding=3)
heptagrams = conv7(x3)               # shape: (1, 22, seq_len)
print(heptagrams.shape)
print(heptagrams[0].permute(1, 0))   # (seq_len, 22)


torch.Size([1, 22, 103])
tensor([[-0.9613, -0.0972, -0.0055,  ..., -0.6985,  0.3834, -0.2520],
        [ 0.1834,  0.4453,  0.6848,  ...,  0.2310, -0.3534, -0.4740],
        [-0.3222,  0.0558, -0.1360,  ..., -0.4393,  0.5785, -0.0588],
        ...,
        [ 0.6411,  1.2671,  0.3996,  ..., -0.3538, -0.2380, -0.7555],
        [-0.3721,  0.0162,  0.3107,  ..., -0.3084, -0.0530, -0.2694],
        [-0.1803, -0.2761,  0.2848,  ..., -0.3288,  0.4866, -0.0077]],
       grad_fn=<PermuteBackward0>)


## max pooling

In [23]:
# For each set of filters, take the max across the time dimension (seq_len)
trigram_pooled = torch.max(trigrams, dim=2).values  # (1, 21)
pentagram_pooled = torch.max(pentagrams, dim=2).values  # (1, 21)
heptagram_pooled = torch.max(heptagrams, dim=2).values  # (1, 22)

print(trigram_pooled)
print(pentagram_pooled)
print(heptagram_pooled)


tensor([[1.3013, 1.5427, 2.2468, 1.0036, 1.4046, 0.9150, 1.4890, 1.1949, 0.7565,
         1.2674, 1.2192, 1.1675, 1.5283, 1.2138, 1.4194, 1.2331, 2.1170, 0.8631,
         1.5021, 1.2395, 1.4918]], grad_fn=<MaxBackward0>)
tensor([[1.1590, 1.3543, 1.5108, 1.6406, 1.1641, 1.5700, 1.5450, 1.3899, 1.3388,
         1.0595, 1.4415, 0.9235, 1.4136, 1.1643, 1.2055, 1.2433, 1.4730, 1.2623,
         1.9006, 1.8532, 1.2329]], grad_fn=<MaxBackward0>)
tensor([[1.1715, 1.4626, 1.6549, 1.7557, 1.4528, 1.7816, 1.5882, 0.9951, 1.3683,
         1.3420, 1.3783, 1.4369, 1.5178, 1.1643, 0.8128, 1.3049, 1.1771, 0.7145,
         1.2875, 1.0187, 1.7435, 1.4272]], grad_fn=<MaxBackward0>)


## 64d output

In [24]:
final_cnn_output = torch.cat([trigram_pooled, pentagram_pooled, heptagram_pooled], dim=1)  # (1, 64)
print(final_cnn_output.shape)
print(final_cnn_output)


torch.Size([1, 64])
tensor([[1.3013, 1.5427, 2.2468, 1.0036, 1.4046, 0.9150, 1.4890, 1.1949, 0.7565,
         1.2674, 1.2192, 1.1675, 1.5283, 1.2138, 1.4194, 1.2331, 2.1170, 0.8631,
         1.5021, 1.2395, 1.4918, 1.1590, 1.3543, 1.5108, 1.6406, 1.1641, 1.5700,
         1.5450, 1.3899, 1.3388, 1.0595, 1.4415, 0.9235, 1.4136, 1.1643, 1.2055,
         1.2433, 1.4730, 1.2623, 1.9006, 1.8532, 1.2329, 1.1715, 1.4626, 1.6549,
         1.7557, 1.4528, 1.7816, 1.5882, 0.9951, 1.3683, 1.3420, 1.3783, 1.4369,
         1.5178, 1.1643, 0.8128, 1.3049, 1.1771, 0.7145, 1.2875, 1.0187, 1.7435,
         1.4272]], grad_fn=<CatBackward0>)
