## Config

In [4]:
# --- Dataset selection -------------------------------------------------------
# Key looked up in dataset_config.dataset by the loading cell.
dataset_name = 'EmoEvent (Raw)'
# Name of the label column after renaming (the raw file calls it 'emotion').
labels_column = 'labels'

# --- Column limiting ---------------------------------------------------------
# NOTE(review): neither flag is read by the cells visible in this notebook.
limit_columns = True
limit_column_names = ['text']

# --- Row filtering -----------------------------------------------------------
# When True, rows labelled 'others' are removed before sampling.
remove_others = True

# --- Sampling ----------------------------------------------------------------
# Intended number of rows in the exported subset.
sample_size = 50
# Fixed seed so that the sampling and shuffling steps are reproducible.
random_seed = 418

## Load Dataset

In [5]:
import pandas as pd
from config_files import dataset_config

# Look up the loader settings (path, file type, column handling) for the
# dataset selected in the config cell.
dataset_metadata = dataset_config.dataset[dataset_name]

file_location = f"./{dataset_metadata['relpath']}"

# Both supported formats are delimited text; they differ only in the separator.
separators = {'csv': ',', 'tsv': '\t'}
filetype = dataset_metadata['filetype']
if filetype not in separators:
    # Fail loudly: without this, df_full_dataset would stay undefined on a
    # fresh kernel, or silently keep a stale frame from an earlier run.
    raise ValueError(
        f"Unsupported filetype {filetype!r} for dataset {dataset_name!r}; "
        f"expected one of {sorted(separators)}"
    )

df_full_dataset = pd.read_csv(file_location, sep=separators[filetype])

df_full_dataset

Unnamed: 0,id,event,tweet,offensive,emotion
0,3B9XR6P1WE1U78OX08FW8NXJL93BJG,NotreDame,I know that the Notre Dame is a very important...,NO,others
1,3P4ZBJFX2V96Q90CC9K7G3IC235WFF,Venezuela,#BREAKING: (USER) -- Trump threatens `full an...,NO,others
2,3IQ9O0AYW65Y8JY8ICLHWGO5E1JIT6,LaLiga,#Barcelona will win La Liga with three games t...,NO,others
3,3XEIP58NL0TWKWFD977CAKHEG1ZLZ3,LaLiga,HT: Decent half. A goal would've been good tho...,NO,others
4,3BFNCI9LYKWWKIJIK6BTNEUY6MU37E,GretaThunberg,In the 20th century we had weeping statues of ...,NO,others
...,...,...,...,...,...
7261,3TTPFEFXCTQSAQCCGHV3LUVCUGPH6X,Venezuela,With regime change yet to take hold in #Venezu...,NO,fear
7262,3XT3KXP24Z4S6LNWMZDOL6WRUT46IF,NotreDame,#NotreDameCathedralFire Fantastic the response...,NO,others
7263,3EHIMLB7F75FE4V09WS158R30BIH8M,GameOfThrones,For those of you questioning the Arya thing. P...,NO,others
7264,3MYASTQBG7H48SQU9UQB0EX52P7DQG,NotreDame,Respect for people in france who've lost a his...,NO,sadness


In [6]:
# Drop the columns the dataset config marks as unused and rename the remaining
# ones according to its remap table. The result is reassigned instead of using
# inplace=True, so the frame returned by read_csv is not mutated behind the
# reader's back.
df_full_dataset = (
    df_full_dataset
    .drop(columns=dataset_metadata['unused_columns'])
    .rename(columns=dataset_metadata['remap_columns'])
)

display(df_full_dataset)

# Number of rows per label, using the label column named in the config cell
# rather than a hard-coded attribute.
full_label_count = df_full_dataset[labels_column].value_counts()
print(full_label_count)

Unnamed: 0,event,text,labels
0,NotreDame,I know that the Notre Dame is a very important...,others
1,Venezuela,#BREAKING: (USER) -- Trump threatens `full an...,others
2,LaLiga,#Barcelona will win La Liga with three games t...,others
3,LaLiga,HT: Decent half. A goal would've been good tho...,others
4,GretaThunberg,In the 20th century we had weeping statues of ...,others
...,...,...,...
7261,Venezuela,With regime change yet to take hold in #Venezu...,fear
7262,NotreDame,#NotreDameCathedralFire Fantastic the response...,others
7263,GameOfThrones,For those of you questioning the Arya thing. P...,others
7264,NotreDame,Respect for people in france who've lost a his...,sadness


labels
others      3283
joy         2034
disgust      760
sadness      414
anger        390
surprise     234
fear         151
Name: count, dtype: int64


In [7]:
if remove_others:
    # Keep every row whose label is anything other than 'others'.
    df_full_dataset = df_full_dataset.loc[df_full_dataset['labels'].ne('others')]
    display(df_full_dataset)

    # The label mix changed, so refresh the per-label counts.
    full_label_count = df_full_dataset['labels'].value_counts()

# Share of each label in percent (counts divided by their total, times 100).
full_label_percentages = full_label_count.div(full_label_count.sum()).mul(100)
print(full_label_percentages)

Unnamed: 0,event,text,labels
8,Venezuela,USER People of #Venezuela !! so hilarious movi...,disgust
10,ChampionsLeague,Remarkable match &amp; Incredible performance ...,joy
11,GameOfThrones,That episode of Game Of Thrones was not as gre...,anger
12,GameOfThrones,#AvengersEndGame may have had the visual and w...,joy
14,ChampionsLeague,Lmao. Liverpool are done 😂. 🐐 Makes it 2-0 #U...,joy
...,...,...,...
7255,WorldBookDay,#WorldBookDay in words of #IvoAndric #NobelPri...,joy
7258,GretaThunberg,"Dear Politicians: Next election, you will want...",anger
7261,Venezuela,With regime change yet to take hold in #Venezu...,fear
7264,NotreDame,Respect for people in france who've lost a his...,sadness


labels
joy         51.067035
disgust     19.081095
sadness     10.394175
anger        9.791614
surprise     5.874969
fear         3.791112
Name: count, dtype: float64


In [8]:
def _proportional_allocation(label_counts, total):
    """Split `total` rows across labels in proportion to `label_counts`.

    Largest-remainder rounding is used so that the per-label counts add up to
    exactly `total` (capped at the number of rows available).

    Parameters
    ----------
    label_counts : pd.Series
        Rows available per label (index = label, values = row counts).
    total : int
        Desired overall number of sampled rows.

    Returns
    -------
    pd.Series
        Integer number of rows to draw per label, indexed like `label_counts`.
    """
    available = int(label_counts.sum())
    if available == 0:
        return label_counts.astype(int)
    total = max(0, min(int(total), available))

    quotas = label_counts / available * total
    allocation = quotas.astype(int)  # truncation == floor, quotas are >= 0
    shortfall = max(total - int(allocation.sum()), 0)
    # Hand the leftover rows to the labels with the largest fractional parts.
    remainders = (quotas - allocation).sort_values(ascending=False, kind='stable')
    allocation.loc[remainders.index[:shortfall]] += 1
    return allocation


# Stratified sample sized by `sample_size` from the config cell instead of a
# hand-tuned fraction.
label_counts = df_full_dataset[labels_column].value_counts()
rows_per_label = _proportional_allocation(label_counts, sample_size)

# Iterating over the groups (rather than groupby.apply) keeps the label column
# in every group and avoids the pandas deprecation warning about apply
# operating on the grouping columns. Groups come out in sorted label order.
sampled_groups = [
    group.sample(n=int(rows_per_label.loc[label]), random_state=random_seed)
    for label, group in df_full_dataset.groupby(labels_column)
]
# pd.concat rejects an empty list, so fall back to an empty frame with the
# same columns when there is nothing to sample from.
df_subset = pd.concat(sampled_groups) if sampled_groups else df_full_dataset.head(0)
df_subset = df_subset.sample(frac=1, random_state=random_seed)  # Shuffle rows
df_subset


  df_subset = df_full_dataset.groupby('labels', group_keys=False).apply(


Unnamed: 0,event,text,labels
3948,ChampionsLeague,Sometimes you just have to admire the genius o...,joy
5636,WorldBookDay,"If anyone wants to give me a book as a gift, I...",joy
507,WorldBookDay,On #WorldBookDay I am curled up with a book my...,joy
5454,GretaThunberg,Very wise words USER_arrowsmith ...we should a...,joy
2819,ChampionsLeague,FH underway... Hope's to USER #Liverpool to wi...,joy
5252,WorldBookDay,Happy world book day from Globeprep! What are ...,joy
444,GameOfThrones,Damn....Game of Thrones sounds crazy! Can’t fo...,disgust
3329,WorldBookDay,"As we celebrate world book day, make sure to i...",joy
4879,ChampionsLeague,More #ChampionsLeague action kicking off at 8p...,joy
4625,GameOfThrones,That was the best bit of tv I have ever watche...,joy


In [9]:
# Rows per label in the sampled subset.
subset_label_count = df_subset['labels'].value_counts()
print(subset_label_count)

# The same distribution as percentages, for comparison with the full dataset.
subset_label_percentages = subset_label_count.div(subset_label_count.sum()).mul(100)
print(subset_label_percentages)

labels
joy         25
disgust     10
anger        5
sadness      5
surprise     3
fear         2
Name: count, dtype: int64
labels
joy         50.0
disgust     20.0
anger       10.0
sadness     10.0
surprise     6.0
fear         4.0
Name: count, dtype: float64


## Export

In [10]:
from pathlib import Path

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
output_dir = Path("./subset")
output_dir.mkdir(parents=True, exist_ok=True)

# Text column only (no labels). The index is written as well, so both files
# share the same row identifiers.
df_subset['text'].to_csv(output_dir / "EmoEvents_raw_subset_unlabeled.csv", index=True)
# All remaining columns, including the original labels.
df_subset.to_csv(output_dir / "EmoEvents_raw_subset_original_labels.csv", index=True)