## Config

In [6]:
dataset_name = 'EmoEvent'
labels_column = 'labels'    # emotion

limit_columns = False
remove_others = False
only_others = True

sample_size = 100
random_seed = 0

## Load Dataset

In [4]:
import pandas as pd
from config_files import dataset_config

dataset_metadata = dataset_config.dataset[dataset_name]

file_location = f"./{dataset_metadata['relpath']}"

if dataset_metadata['filetype'] == 'csv':
    df_full_dataset = pd.read_csv(file_location)
elif dataset_metadata['filetype'] == 'tsv':
    df_full_dataset = pd.read_csv(file_location, sep="\t")

df_full_dataset

Unnamed: 0,id,event,tweet,offensive,emotion
0,3B9XR6P1WE1U78OX08FW8NXJL93BJG,NotreDame,I know that the Notre Dame is a very important...,NO,others
1,3P4ZBJFX2V96Q90CC9K7G3IC235WFF,Venezuela,#BREAKING: (USER) -- Trump threatens `full an...,NO,others
2,3IQ9O0AYW65Y8JY8ICLHWGO5E1JIT6,LaLiga,#Barcelona will win La Liga with three games t...,NO,others
3,3XEIP58NL0TWKWFD977CAKHEG1ZLZ3,LaLiga,HT: Decent half. A goal would've been good tho...,NO,others
4,3BFNCI9LYKWWKIJIK6BTNEUY6MU37E,GretaThunberg,In the 20th century we had weeping statues of ...,NO,others
...,...,...,...,...,...
7261,3TTPFEFXCTQSAQCCGHV3LUVCUGPH6X,Venezuela,With regime change yet to take hold in #Venezu...,NO,fear
7262,3XT3KXP24Z4S6LNWMZDOL6WRUT46IF,NotreDame,#NotreDameCathedralFire Fantastic the response...,NO,others
7263,3EHIMLB7F75FE4V09WS158R30BIH8M,GameOfThrones,For those of you questioning the Arya thing. P...,NO,others
7264,3MYASTQBG7H48SQU9UQB0EX52P7DQG,NotreDame,Respect for people in france who've lost a his...,NO,sadness


In [5]:
df_full_dataset.drop(columns = dataset_metadata['unused_columns'], inplace=True)
df_full_dataset.rename(columns = dataset_metadata['remap_columns'], inplace=True)

display(df_full_dataset)

full_label_count =  pd.Series(df_full_dataset.labels).value_counts()
print(full_label_count)

Unnamed: 0,context,text,labels
0,NotreDame,I know that the Notre Dame is a very important...,others
1,Venezuela,#BREAKING: (USER) -- Trump threatens `full an...,others
2,LaLiga,#Barcelona will win La Liga with three games t...,others
3,LaLiga,HT: Decent half. A goal would've been good tho...,others
4,GretaThunberg,In the 20th century we had weeping statues of ...,others
...,...,...,...
7261,Venezuela,With regime change yet to take hold in #Venezu...,fear
7262,NotreDame,#NotreDameCathedralFire Fantastic the response...,others
7263,GameOfThrones,For those of you questioning the Arya thing. P...,others
7264,NotreDame,Respect for people in france who've lost a his...,sadness


labels
others      3283
joy         2034
disgust      760
sadness      414
anger        390
surprise     234
fear         151
Name: count, dtype: int64


In [9]:
if only_others:
    df_full_dataset = df_full_dataset[df_full_dataset['labels'] == 'others']
    display(df_full_dataset)

Unnamed: 0,context,text,labels
0,NotreDame,I know that the Notre Dame is a very important...,others
1,Venezuela,#BREAKING: (USER) -- Trump threatens `full an...,others
2,LaLiga,#Barcelona will win La Liga with three games t...,others
3,LaLiga,HT: Decent half. A goal would've been good tho...,others
4,GretaThunberg,In the 20th century we had weeping statues of ...,others
...,...,...,...
7257,WorldBookDay,Today is #WorldBookDay &amp; we're still celeb...,others
7259,Venezuela,What is happening in #Venezuela should be used...,others
7260,GameOfThrones,I'm hyperventilating. this episode is too inte...,others
7262,NotreDame,#NotreDameCathedralFire Fantastic the response...,others


In [None]:
if remove_others:
    df_full_dataset = df_full_dataset[df_full_dataset['labels'] != 'others']
    display(df_full_dataset)

    full_label_count =  pd.Series(df_full_dataset.labels).value_counts()
full_label_percentages = full_label_count / full_label_count.sum() * 100
print(full_label_percentages)

In [10]:
sample_fraction = (sample_size+1) / df_full_dataset.shape[0]
df_subset = df_full_dataset.groupby('labels', group_keys=False).apply(
    lambda x: x.sample(frac=sample_fraction, random_state = random_seed)     # Get a random, stratified sample
)

df_subset = df_subset.sample(frac=1, random_state=random_seed)  # Shuffle rows
df_subset


  df_subset = df_full_dataset.groupby('labels', group_keys=False).apply(


Unnamed: 0,context,text,labels
3866,NotreDame,#French investigators probing the devastating ...,others
5908,ChampionsLeague,What a special team this Ajax team is and it’s...,others
3934,GameOfThrones,Literally was not ready for this episode 😱 Ary...,others
3069,ChampionsLeague,Liverpool score tomorrow and don’t lose by mor...,others
1524,GameOfThrones,What #GameofThrones does with battle scenes an...,others
...,...,...,...
5434,GretaThunberg,"USER 16 year old Swede #GretaThunberg, has mor...",others
5313,GretaThunberg,"Dear #GretaThunberg, we're not even close to b...",others
3499,WorldBookDay,I like the books which are more critical #Worl...,others
4883,ChampionsLeague,Ref really allowing it physical today. Might b...,others


In [None]:
subset_label_count = pd.Series(df_subset.labels).value_counts()
print(subset_label_count)
subset_label_percentages = subset_label_count / subset_label_count.sum() * 100
print(subset_label_percentages)

## Export

In [12]:
df_subset['text'].to_csv(f"datasets/emoevent/raw/full_subset/emoevents_others_unlabeled.csv", index=True)
df_subset.to_csv(f"datasets/emoevent/raw/full_subset/emoevents_others.csv", index=True)