## Config

In [32]:
# --- Dataset selection -------------------------------------------------------
# Key looked up in `dataset_config.dataset` by the load cell below.
dataset_name = "GoEmotions (Ekman 6)"
# Original author's note on this value: "emotion"
# (presumably an alternative column name — TODO confirm).
labels_column = "labels"

# --- Sampling ----------------------------------------------------------------
sample_size = 50    # requested size of the stratified subset
random_seed = 418   # I'm a teacup

# --- Pre-processing switches -------------------------------------------------
limit_columns = False   # NOTE(review): not read by any cell visible in this notebook section
remove_others = False   # when True, rows whose label is 'others' are filtered out

## Load Dataset

In [33]:
import pandas as pd
from config_files import dataset_config

# Metadata entry (relative path, file type, column handling) for the dataset
# selected in the config cell.
dataset_metadata = dataset_config.dataset[dataset_name]

file_location = f"./{dataset_metadata['relpath']}"

# Read the file with the separator that matches its declared type.
if dataset_metadata['filetype'] == 'csv':
    df_full_dataset = pd.read_csv(file_location)
elif dataset_metadata['filetype'] == 'tsv':
    df_full_dataset = pd.read_csv(file_location, sep="\t")
else:
    # Fail loudly: without this branch an unknown file type would leave
    # `df_full_dataset` undefined (or holding a frame from an earlier run).
    raise ValueError(
        f"Unsupported filetype {dataset_metadata['filetype']!r} for dataset "
        f"{dataset_name!r}; expected 'csv' or 'tsv'."
    )

df_full_dataset

Unnamed: 0,labels,text,original labels,source
0,neutral,My favourite food is anything I didn't have to...,['neutral'],train
1,neutral,"Now if he does off himself, everyone will thin...",['neutral'],train
2,anger,WHY THE FUCK IS BAYLESS ISOING,['anger'],train
3,fear,To make her feel threatened,['fear'],train
4,anger,Dirty Southern Wankers,['annoyance'],train
...,...,...,...,...
48861,fear,It's pretty dangerous when the state decides w...,['fear'],dev
48862,joy,I filed for divorce this morning. Hoping he mo...,['optimism'],dev
48863,anger,"The last time it happened I just said, ""No"" an...",['disapproval'],dev
48864,anger,I can’t stand this arrogant prick he’s no bett...,['annoyance'],dev


In [34]:
# Drop the columns the dataset config marks as unused and apply its column
# renames. The result is rebound instead of mutated in place, so the frame
# object displayed by the previous cell is left untouched.
# Note: like the in-place version, this cell is meant to run once per load;
# `drop` raises KeyError if the columns have already been removed.
df_full_dataset = (
    df_full_dataset
    .drop(columns=dataset_metadata['unused_columns'])
    .rename(columns=dataset_metadata['remap_columns'])
)

display(df_full_dataset)

# Number of rows per label in the full dataset (most frequent first).
full_label_count = df_full_dataset['labels'].value_counts()
print(full_label_count)

Unnamed: 0,labels,text
0,neutral,My favourite food is anything I didn't have to...
1,neutral,"Now if he does off himself, everyone will thin..."
2,anger,WHY THE FUCK IS BAYLESS ISOING
3,fear,To make her feel threatened
4,anger,Dirty Southern Wankers
...,...,...
48861,fear,It's pretty dangerous when the state decides w...
48862,joy,I filed for divorce this morning. Hoping he mo...
48863,anger,"The last time it happened I just said, ""No"" an..."
48864,anger,I can’t stand this arrogant prick he’s no bett...


labels
joy         16657
neutral     16021
anger        5420
surprise     4805
sadness      2875
love         1760
fear          693
disgust       635
Name: count, dtype: int64


In [35]:
# Optionally discard rows labelled 'others', then refresh the label counts so
# the percentages below describe the filtered frame.
if remove_others:
    keep_mask = df_full_dataset['labels'].ne('others')
    df_full_dataset = df_full_dataset.loc[keep_mask]
    display(df_full_dataset)

    full_label_count = df_full_dataset['labels'].value_counts()

# Share of each label, as a percentage of all counted rows.
full_label_percentages = full_label_count.div(full_label_count.sum()).mul(100)
print(full_label_percentages)

labels
joy         34.087095
neutral     32.785577
anger       11.091557
surprise     9.833013
sadness      5.883436
love         3.601686
fear         1.418164
disgust      1.299472
Name: count, dtype: float64


In [36]:
import math
# Stratified sample of exactly `sample_size` rows.
#
# Each label gets a share of the sample proportional to its frequency, using
# the largest-remainder method: every label first receives the floor of its
# exact share, then the rows still missing go to the labels with the largest
# remainders. Integer arithmetic keeps the split exact. (The previous code
# floored the sampling fraction to three decimals, which quantised the sample
# size and produced an empty subset whenever sample_size was below 0.1% of
# the dataset.) With the notebook's current settings this allocation matches
# the per-label counts the fraction-based code produced.
label_counts = df_full_dataset['labels'].value_counts()
population = int(label_counts.sum())
if population == 0:
    raise ValueError("Cannot draw a stratified sample: df_full_dataset has no labelled rows.")

# Never ask for more rows than exist (sampling is without replacement).
target_size = max(0, min(int(sample_size), population))

scaled_counts = label_counts * target_size
label_quotas = scaled_counts // population
shortfall = target_size - int(label_quotas.sum())
top_up_labels = (
    (scaled_counts % population)
    .sort_values(ascending=False, kind='stable')   # stable sort => deterministic tie-breaking
    .index[:shortfall]
)
label_quotas.loc[top_up_labels] += 1

# Kept for any later cell that reads it; now the exact (unrounded) fraction.
sample_fraction = target_size / population

# Sample each label group separately, re-seeding per group, and stitch the
# pieces together in group order. This replaces `groupby(...).apply(...)`,
# which is the call the warning in this cell's previous output points at.
stratified_parts = [
    group.sample(n=int(label_quotas.loc[label]), random_state=random_seed)
    for label, group in df_full_dataset.groupby('labels')
]
df_subset = pd.concat(stratified_parts)
df_subset = df_subset.sample(frac=1, random_state=random_seed)  # Shuffle rows
df_subset


  df_subset = df_full_dataset.groupby('labels', group_keys=False).apply(


Unnamed: 0,labels,text
6232,love,I love it. Probably my favorite game mode righ...
30473,joy,Those are some interesting stats. I’m gonna sa...
43446,neutral,Justified or not he'll probably go before [NAME]
23954,neutral,If I'm no good I can still give the 1st round ...
24657,joy,At least it made for hilarious responses by pe...
25589,neutral,"""qt 3.14"" made me retreat into my own face, so..."
27710,joy,This one is by [NAME]! Your suggestion was fun...
46841,neutral,"Assuming you're a dude, yes, just let him watc..."
11235,neutral,Grew up in an early 90's house with door handl...
1634,neutral,"I see your ""making myself undateable"" and rais..."


In [37]:
# Label distribution of the sampled subset: absolute counts first, then the
# same counts expressed as percentages of the subset size.
subset_label_count = df_subset['labels'].value_counts()
print(subset_label_count)

subset_label_percentages = subset_label_count.div(subset_label_count.sum()).mul(100)
print(subset_label_percentages)

labels
joy         17
neutral     16
surprise     5
anger        5
sadness      3
love         2
disgust      1
fear         1
Name: count, dtype: int64
labels
joy         34.0
neutral     32.0
surprise    10.0
anger       10.0
sadness      6.0
love         4.0
disgust      2.0
fear         2.0
Name: count, dtype: float64


## Export

In [38]:
# Write two CSV files for the proof-of-concept subset; both keep the row index:
#   * text column only (no labels)
#   * the full subset frame
unlabeled_csv_path = "datasets/goemotions/ekman_6_single_label/poc_subset/goemotions_6_single_poc_unlabeled.csv"
labeled_csv_path = "datasets/goemotions/ekman_6_single_label/poc_subset/goemotions_6_single_poc.csv"

df_subset['text'].to_csv(unlabeled_csv_path, index=True)
df_subset.to_csv(labeled_csv_path, index=True)