The following code converts each individual single-label dataset into the unified format by mapping its labels to the common label set. See the data folder in this GitHub repo for the dataset information and links.

Note: Whenever the original dataset has training and test sets, we use those for experiments. However, there is code to merge the two sets into one file should that be desired.

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


#CARER

In [None]:
# Load the CARER dataset from its pickled DataFrame.
# NOTE(review): pd.read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load this file from a trusted source.
carer_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/CARER.pkl'
carer = pd.read_pickle(carer_path)
display(carer)

Unnamed: 0,text,emotions
27383,i feel awful about it too because it s my job ...,sadness
110083,im alone i feel awful,sadness
140764,ive probably mentioned this before but i reall...,joy
100071,i was feeling a little low few days back,sadness
2837,i beleive that i am much more sensitive to oth...,love
...,...,...
566,that was what i felt when i was finally accept...,joy
36236,i take every day as it comes i m just focussin...,fear
76229,i just suddenly feel that everything was fake,sadness
131640,im feeling more eager than ever to claw back w...,joy


In [None]:
# Rename 'emotions' to 'labels'
carer = carer.rename(columns={'emotions': 'labels'})

In [None]:
# Ensure the DataFrame is formatted with 'text' and 'labels' columns only
carer = carer[['labels', 'text']]

In [None]:
# Sample 15% of entries from each label category.
# GroupBy.sample draws within each group directly, which avoids the
# groupby.apply deprecation warning about operating on the grouping column.
# The fixed random_state (an arbitrary seed) makes the saved sample
# reproducible across notebook runs.
carer_samp = carer.groupby('labels').sample(frac=0.15, random_state=42)

  carer_samp = carer.groupby('labels', group_keys=False).apply(lambda x: x.sample(frac=0.15))


In [None]:
# Save the full processed dataset
carer_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/CARER_unified.csv'
carer.to_csv(carer_save, index=False)

# Save the sampled dataset
carer_samp_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/CARER_unified_sample.csv'
carer_samp.to_csv(carer_samp_save, index=False)

In [None]:
display(carer)

Unnamed: 0,labels,text
27383,sadness,i feel awful about it too because it s my job ...
110083,sadness,im alone i feel awful
140764,joy,ive probably mentioned this before but i reall...
100071,sadness,i was feeling a little low few days back
2837,love,i beleive that i am much more sensitive to oth...
...,...,...
566,joy,that was what i felt when i was finally accept...
36236,fear,i take every day as it comes i m just focussin...
76229,sadness,i just suddenly feel that everything was fake
131640,joy,im feeling more eager than ever to claw back w...


In [None]:
# Display the summary of counts
print(carer['labels'].value_counts())

labels
joy         141067
sadness     121187
anger        57317
fear         47712
love         34554
surprise     14972
Name: count, dtype: int64


#Covid Worry

In [None]:
# This file merges the results from all three phases of the Covid Worry survey.
# Refer to the data folder in this GitHub repo for further explanation.
covid_worry_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/covid_all_phases_merged.csv'
covid_worry = pd.read_csv(covid_worry_path)
display(covid_worry)

Unnamed: 0,emotion,essay
0,1,Frustrated at the lack of clear guidelines to ...
1,3,I’m deeply concerned about the affect of the C...
2,3,I feel fed up with waking up and doing the exa...
3,3,"feel anxious about how its going to spread, ho..."
4,0,I am very worried that I might contract Corona...
...,...,...
5345,1,I feel a little frustrated that restrictions h...
5346,1,The mask mandate has just lifted where I am (W...
5347,2,Angry at how the government have handled the w...
5348,5,The government are trying to kill us all off a...


In [None]:
# Map numeric emotion values to emotion labels
# NOTE(review): the sample rows displayed for code 1 start with "Frustrated at
# the lack of clear guidelines ..." and "I feel a little frustrated ...", yet
# code 1 is mapped to 'Relaxation' here. Verify this code -> emotion table
# against the codebook of the merged survey file before relying on it.
inv_map = {
    0: 'Fear',
    1: 'Relaxation',
    2: 'Anger',
    3: 'Anxiety',
    4: 'Sadness',
    5: 'Disgust',
    6: 'Happiness',
    7: 'Desire'
}
# replace() leaves any value that is not a key of inv_map unchanged.
covid_worry['emotion'] = covid_worry['emotion'].replace(inv_map)

In [None]:
# Rename columns and convert labels to lowercase
covid_worry = covid_worry.rename(columns={'essay': 'text', 'emotion': 'labels'})
covid_worry['labels'] = covid_worry['labels'].str.lower()

In [None]:
# Map similar emotions to a unified set of labels
covid_unified_map = {
    'anxiety': 'fear',
    'relaxation': 'joy',
    'desire': 'joy',
    'happiness': 'joy'
}
covid_worry['labels'] = covid_worry['labels'].replace(covid_unified_map)

In [None]:
# Ensure the DataFrame contains only 'text' and 'labels' columns
covid_worry = covid_worry[['text', 'labels']]

In [None]:
# Save the processed dataset
# NOTE(review): this is the only dataset in the visible part of the notebook
# that is written outside '.../Emotion Data Analysis Project/data/formatted data/'
# and without the '<name>_unified.csv' naming used elsewhere -- confirm the
# target path is intentional.
covid_worry_save = '/content/drive/My Drive/Colab Notebooks/datasets/covid_worry_ufd_single.csv'
covid_worry.to_csv(covid_worry_save, index=False)

In [None]:
display(covid_worry)

Unnamed: 0,text,labels
0,Frustrated at the lack of clear guidelines to ...,joy
1,I’m deeply concerned about the affect of the C...,fear
2,I feel fed up with waking up and doing the exa...,fear
3,"feel anxious about how its going to spread, ho...",fear
4,I am very worried that I might contract Corona...,fear
...,...,...
5345,I feel a little frustrated that restrictions h...,joy
5346,The mask mandate has just lifted where I am (W...,joy
5347,Angry at how the government have handled the w...,anger
5348,The government are trying to kill us all off a...,disgust


In [None]:
print(covid_worry['labels'].value_counts())

labels
fear       2629
joy        1490
sadness     842
anger       305
disgust      84
Name: count, dtype: int64


#EmoEvent

In [None]:
emoevent_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/emoevent_en.csv'
emoevent = pd.read_csv(emoevent_path)
display(emoevent)

Unnamed: 0,id,tweet,emotion,offensive
0,1,Something to keep in mind... When situations l...,disgust,0
1,2,I'm really sorry about a whole history of 850 ...,sadness,0
2,3,The kid in me is sobbing at all the Hunchback ...,sadness,0
3,4,HASHTAG April30 New thread to share. A large g...,others,0
4,5,It’s World Book Day and a great day to visit t...,joy,0
...,...,...,...,...
7298,7299,In support of HASHTAG and all the kids (and ad...,sadness,0
7299,7300,A functioning coalition government in Spain is...,others,0
7300,7301,Apparently there was paparazzi in my living ro...,others,0
7301,7302,Your 2018/19 HASHTAG champions....⚽⚽⚽. Booooo...,joy,0


In [None]:
# Rename columns for consistency with the unified format ('emotion' -> 'labels', 'tweet' -> 'text')
emoevent = emoevent.rename(columns={'emotion': 'labels', 'tweet': 'text'})

In [None]:
# Map specific emotion categories to a unified set
emoevent_unified_map = {
    'others': 'neutral'
}
emoevent['labels'] = emoevent['labels'].replace(emoevent_unified_map)

In [None]:
# Ensure the DataFrame has only 'text' and 'labels' columns
emoevent = emoevent[['labels', 'text']]

In [None]:
# Save the processed dataset
emoevent_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/emoevent_unified.csv'
emoevent.to_csv(emoevent_save, index=False)

In [None]:
display(emoevent)

Unnamed: 0,labels,text
0,disgust,Something to keep in mind... When situations l...
1,sadness,I'm really sorry about a whole history of 850 ...
2,sadness,The kid in me is sobbing at all the Hunchback ...
3,neutral,HASHTAG April30 New thread to share. A large g...
4,joy,It’s World Book Day and a great day to visit t...
...,...,...
7298,sadness,In support of HASHTAG and all the kids (and ad...
7299,neutral,A functioning coalition government in Spain is...
7300,neutral,Apparently there was paparazzi in my living ro...
7301,joy,Your 2018/19 HASHTAG champions....⚽⚽⚽. Booooo...


In [None]:
print(emoevent['labels'].value_counts())

labels
neutral     3305
joy         2039
disgust      765
sadness      416
anger        392
surprise     235
fear         151
Name: count, dtype: int64


#enISEAR

In [None]:
enISEAR_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/enISEAR.tsv'
enISEAR = pd.read_csv(enISEAR_path, sep='\t')
display(enISEAR)

Unnamed: 0,Sentence_id,Prior_Emotion,Sentence,Temporal_Distance,Intensity,Duration,Gender,City,Country,Worker_id,Time,Anger,Disgust,Fear,Guilt,Joy,Sadness,Shame
0,271,Fear,"I felt ... when my 2 year old broke her leg, a...",Y,Vi,Dom,Ml,Bristol,GBR,87,11/28/2018 00:58:52,0,0,0,1,0,3,1
1,597,Shame,I felt ... one Christmas as one of our patient...,Y,I,Dom,Fl,Dulwich,GBR,86,11/26/2018 06:52:02,1,0,0,4,0,0,0
2,282,Guilt,I felt ... because I could not help a friend w...,M,Mi,Dom,Fl,Linlithgow,GBR,83,11/21/2018 18:45:00,0,0,0,4,0,1,0
3,171,Disgust,I felt ... when I read that hunters had killed...,Y,Mi,H,Ml,Bristol,GBR,87,11/28/2018 00:55:11,3,0,0,0,0,2,0
4,509,Sadness,I felt ... when my Gran passed away.,Y,Vi,Dom,Fl,Stoke-on-trent,GBR,92,11/26/2018 09:23:38,0,0,0,0,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,579,Shame,I felt ... that the neighbours in the small vi...,Y,Vi,Dom,Fl,Dulwich,GBR,86,11/25/2018 16:32:23,1,1,0,0,0,0,3
997,593,Shame,I feel ... because I behave in a way that I am...,W,Mi,H,Fl,Tunbridge Wells,GBR,122,11/24/2018 11:11:15,0,0,0,3,0,0,2
998,605,Shame,I felt ... because I fell over in public.,W,N,Fm,Fl,Sheffield,GBR,56,11/26/2018 17:28:12,0,0,0,0,0,0,5
999,606,Shame,I felt ... giving a cheque to the managing age...,W,I,H,Fl,Shepherds Bush,GBR,90,11/26/2018 21:26:45,0,0,0,2,0,0,3


In [None]:
# Select and rename columns and convert labels to lowercase for consistency
enISEAR = (
    enISEAR[['Prior_Emotion', 'Sentence']]
    .rename(columns={'Prior_Emotion': 'labels', 'Sentence': 'text'})
)
# Use the vectorised .str.lower() (as for the other datasets): it passes
# missing values through, whereas .apply(str.lower) raises TypeError on them.
enISEAR['labels'] = enISEAR['labels'].str.lower()

In [None]:
# Map emotion labels to a unified set
# 'shame' is folded into 'anger' and 'guilt' into 'sadness', which is why
# those two unified labels end up with twice the count of the others.
enISEAR_unified_map = {
    'shame': 'anger',
    'guilt': 'sadness'
}
# Keep the pre-mapping label next to the unified one.
# NOTE(review): this column is named 'original labels' (with a space), while
# remap_go_emo in this notebook uses 'original_labels' -- consider aligning.
enISEAR['original labels'] = enISEAR['labels']
enISEAR['labels'] = enISEAR['labels'].replace(enISEAR_unified_map)

In [None]:
# Save the processed dataset
enISEAR_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/enISEAR_unified.csv'
enISEAR.to_csv(enISEAR_save, index=False)

In [None]:
display(enISEAR)

Unnamed: 0,labels,text,original labels
0,fear,"I felt ... when my 2 year old broke her leg, a...",fear
1,anger,I felt ... one Christmas as one of our patient...,shame
2,sadness,I felt ... because I could not help a friend w...,guilt
3,disgust,I felt ... when I read that hunters had killed...,disgust
4,sadness,I felt ... when my Gran passed away.,sadness
...,...,...,...
996,anger,I felt ... that the neighbours in the small vi...,shame
997,anger,I feel ... because I behave in a way that I am...,shame
998,anger,I felt ... because I fell over in public.,shame
999,anger,I felt ... giving a cheque to the managing age...,shame


In [None]:
print(enISEAR['labels'].value_counts())

labels
anger      286
sadness    286
fear       143
disgust    143
joy        143
Name: count, dtype: int64


#GitHub Love

In [None]:
git_love_train_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/github-emotion-love-train.csv'
git_love_train = pd.read_csv(git_love_train_path)
display(git_love_train)

git_love_test_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/github-emotion-love-test.csv'
git_love_test = pd.read_csv(git_love_test_path)
display(git_love_test)

Unnamed: 0,id,modified_comment,Anger,Love,Fear,Joy,Sadness,Surprise
0,704844644,This change doesn't affect anything but makes ...,0,0,0,1,0,0
1,886568180,Thanks very much for your feedback [USER] Your...,0,1,0,0,0,0
2,950480127,[BLOCK QUOTE].\n\nWorks perfectly that way! Th...,0,1,0,0,0,0
3,897560867,[USER] Thanks for the quick answers! It is goo...,0,1,0,0,0,0
4,878517138,[USER] I don't think that it would be possible...,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
1595,958353427,[USER] Any updates on this PR? Is it still nee...,0,0,0,0,0,1
1596,958721169,"Hi [USER], can you explain more what exactly y...",0,0,0,0,0,1
1597,524603224,I understand why we skip the work if there are...,0,0,0,0,0,1
1598,920311878,"I can't wait for this <3 \nI am curious, what ...",0,1,0,0,0,1


Unnamed: 0,id,modified_comment,Anger,Love,Fear,Joy,Sadness,Surprise
0,949058424,Assuming this is not an issue any more. Please...,0,0,0,0,0,0
1,121870320,I guess it's because when creating the module ...,0,0,0,0,0,0
2,156956746,It's ok. You don't have to change it.,0,0,0,0,0,0
3,644409171,I didn't look at what it would take to remove ...,0,0,0,0,0,0
4,657568251,I've reverted this change since it's not neede...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
395,152840278,That's strange,0,0,0,0,0,1
396,950365195,yes i am really surprised that it's only a p4 ...,0,0,0,0,0,1
397,921044961,I was quite surprised when my prebaked webpack...,0,1,0,0,0,1
398,733155987,Thanks! That's what I was curious about,0,1,0,0,0,1


In [None]:
# Define a function to convert binary multi-label data to single-label per sample
def binary_multi_to_single(df, left_bound, right_bound):
    """Expand a binary multi-label frame into one row per (sample, label).

    The columns from ``left_bound`` to ``right_bound`` (inclusive, by label)
    are treated as 0/1 indicator columns. A row with k indicators set to 1 is
    repeated k times, once per active label, in column order. A row with no
    indicator set is kept once.

    Args:
        df: Input DataFrame. It is not modified.
        left_bound: Name of the first indicator column.
        right_bound: Name of the last indicator column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the original
        columns plus ``y``. ``y`` holds the active label's column name, or the
        integer 0 for rows without any label. The indicator columns are
        rewritten as a one-hot encoding of ``y``. Missing values in the other
        columns are left untouched.
    """
    label_cols = list(df.loc[:, left_bound:right_bound].columns)

    # Work on a copy so the caller's frame does not gain a stray 'y' column.
    out = df.copy()

    # One list of active label names per row. An empty list is replaced by
    # [0] so that explode() yields the "no label" marker 0 directly, with no
    # need for a frame-wide fillna that would also overwrite missing text.
    active = out[label_cols].eq(1).to_numpy()
    out['y'] = pd.Series(
        [[name for name, hit in zip(label_cols, row) if hit] or [0]
         for row in active],
        index=out.index,
        dtype=object,
    )

    out = out.explode('y').reset_index(drop=True)

    # Rewrite the indicator columns so each row flags exactly its own label.
    for name in label_cols:
        out[name] = (out['y'] == name).astype(int)
    return out

In [None]:
# Process the data
new_git_tr = binary_multi_to_single(git_love_train, "Anger", "Surprise")
new_git_tst = binary_multi_to_single(git_love_test, "Anger", "Surprise")

In [None]:
# Drop unnecessary columns
new_git_tr = new_git_tr.drop(columns=git_love_train.loc[:, "Anger":"Surprise"].columns)
new_git_tr = new_git_tr.drop(columns=['id'])
new_git_tst = new_git_tst.drop(columns=git_love_test.loc[:, "Anger":"Surprise"].columns)
new_git_tst = new_git_tst.drop(columns=['id'])

In [None]:
# Rename columns to 'text' and 'labels'
new_git_tr = new_git_tr.rename(columns={"modified_comment": "text", "y": "labels"})
new_git_tst = new_git_tst.rename(columns={"modified_comment": "text", "y": "labels"})

In [None]:
# Select only 'labels' and 'text' columns
new_git_tr = new_git_tr[['labels', 'text']]
new_git_tst = new_git_tst[['labels', 'text']]

In [None]:
# Remove rows where labels are 0 (comments with no emotion flagged).
# .copy() makes each result an independent frame, so the later column
# assignments (lowercasing, adding 'source') do not raise SettingWithCopyWarning.
new_git_tr = new_git_tr[new_git_tr['labels'] != 0].copy()
new_git_tst = new_git_tst[new_git_tst['labels'] != 0].copy()

In [None]:
# Convert labels to lowercase
new_git_tr['labels'] = new_git_tr['labels'].str.lower()
new_git_tst['labels'] = new_git_tst['labels'].str.lower()

In [None]:
# Save the processed training and test datasets
git_tr_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/github_love_train_unified.csv'
git_tst_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/github_love_test_unified.csv'
new_git_tr.to_csv(git_tr_save, index=False)
new_git_tst.to_csv(git_tst_save, index=False)

In [None]:
display(new_git_tr)
display(new_git_tst)

Unnamed: 0,labels,text
0,joy,This change doesn't affect anything but makes ...
1,love,Thanks very much for your feedback [USER] Your...
2,love,[BLOCK QUOTE].\n\nWorks perfectly that way! Th...
3,love,[USER] Thanks for the quick answers! It is goo...
4,love,[USER] I don't think that it would be possible...
...,...,...
1604,surprise,"Hi [USER], can you explain more what exactly y..."
1605,surprise,I understand why we skip the work if there are...
1606,love,"I can't wait for this <3 \nI am curious, what ..."
1607,surprise,"I can't wait for this <3 \nI am curious, what ..."


Unnamed: 0,labels,text
50,anger,I'm sure there was a good reason not to allow ...
51,anger,[BLOCK QUOTE]. \n\n[USER] Can't we setup a han...
52,anger,"I disagree, JIT/AOT doesn't make any differenc..."
53,anger,"No, those are actually different bugs (they hi..."
54,anger,The callback doesn't make sense here.
...,...,...
401,surprise,I was quite surprised when my prebaked webpack...
402,love,Thanks! That's what I was curious about
403,surprise,Thanks! That's what I was curious about
404,joy,It’s very strange why there is a display error...


In [None]:
print(new_git_tr['labels'].value_counts())
print(new_git_tst['labels'].value_counts())

labels
joy         338
anger       272
sadness     219
surprise    200
love        176
fear        158
Name: count, dtype: int64
labels
joy         84
anger       68
surprise    65
sadness     55
love        44
fear        40
Name: count, dtype: int64


##Merged

In [None]:
# Add 'source' column to distinguish train and test data
new_git_tr['source'] = 'train'
new_git_tst['source'] = 'test'

In [None]:
# Merge the training and test datasets
git_love_merged = pd.concat([new_git_tr, new_git_tst])

In [None]:
# Save the merged dataset
git_love_merged_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/github_love_merged_unified.csv'
git_love_merged.to_csv(git_love_merged_save, index=False)

In [None]:
display(git_love_merged)

Unnamed: 0,labels,text,source
0,joy,This change doesn't affect anything but makes ...,train
1,love,Thanks very much for your feedback [USER] Your...,train
2,love,[BLOCK QUOTE].\n\nWorks perfectly that way! Th...,train
3,love,[USER] Thanks for the quick answers! It is goo...,train
4,love,[USER] I don't think that it would be possible...,train
...,...,...,...
401,surprise,I was quite surprised when my prebaked webpack...,test
402,love,Thanks! That's what I was curious about,test
403,surprise,Thanks! That's what I was curious about,test
404,joy,It’s very strange why there is a display error...,test


In [None]:
# Display the summary of counts
label_counts_merged = git_love_merged['labels'].value_counts()
print(label_counts_merged)

labels
joy         422
anger       340
sadness     274
surprise    265
love        220
fear        198
Name: count, dtype: int64


#GoEmotion

In [None]:
go_emo_train_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/goemotion/train.tsv'
go_emo_train = pd.read_csv(go_emo_train_path, sep='\t', header=None)
go_emo_train.columns = ['text', 'labels', 'id']
display(go_emo_train)

go_emo_test_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/goemotion/test.tsv'
go_emo_test = pd.read_csv(go_emo_test_path, sep='\t', header=None)
go_emo_test.columns = ['text', 'labels', 'id']
display(go_emo_test)

go_emo_dev_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/goemotion/dev.tsv'
go_emo_dev = pd.read_csv(go_emo_dev_path, sep='\t', header=None)
go_emo_dev.columns = ['text', 'labels', 'id']
display(go_emo_dev)

Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj
...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,18,edsb738
43406,Always thought that was funny but is it a refe...,6,ee7fdou
43407,What are you talking about? Anything bad that ...,3,efgbhks
43408,"More like a baptism, with sexy results!",13,ed1naf8


Unnamed: 0,text,labels,id
0,I’m really sorry about your situation :( Altho...,25,eecwqtt
1,It's wonderful because it's awful. At not with.,0,ed5f85d
2,"Kings fan here, good luck to you guys! Will be...",13,een27c3
3,"I didn't know that, thank you for teaching me ...",15,eelgwd1
4,They got bored from haunting earth for thousan...,27,eem5uti
...,...,...,...
5422,Thanks. I was diagnosed with BP 1 after the ho...,15,efeeasc
5423,Well that makes sense.,4,ef9c7s3
5424,Daddy issues [NAME],27,efbiugo
5425,So glad I discovered that subreddit a couple m...,0,efbvgp9


Unnamed: 0,text,labels,id
0,Is this in New Orleans?? I really feel like th...,27,edgurhb
1,"You know the answer man, you are programmed to...",427,ee84bjg
2,I've never been this sad in my life!,25,edcu99z
3,The economy is heavily controlled and subsidiz...,427,edc32e2
4,He could have easily taken a real camera from ...,20,eepig6r
...,...,...,...
5421,It's pretty dangerous when the state decides w...,14,edyrazk
5422,I filed for divorce this morning. Hoping he mo...,20,edi2z3y
5423,"The last time it happened I just said, ""No"" an...",10,eewbqtx
5424,I can’t stand this arrogant prick he’s no bett...,3,eefx57m


In [None]:
# Define emotion mappings
ekman_emo_map = {
    "anger": ["anger", "annoyance", "disapproval"],
    "disgust": ["disgust"],
    "fear": ["fear", "nervousness"],
    "joy": ["joy", "amusement", "approval", "excitement", "gratitude",
            "optimism", "relief", "pride", "admiration", "desire", "caring"],
    "sadness": ["sadness", "disappointment", "embarrassment", "grief", "remorse"],
    "surprise": ["surprise", "realization", "confusion", "curiosity"],
    "love": ["love"],
}

In [None]:
# Label index -> emotion name, taken from the original GoEmotions label mapping
text_emotions = {
    "0": "admiration",
    "1": "amusement",
    "2": "anger",
    "3": "annoyance",
    "4": "approval",
    "5": "caring",
    "6": "confusion",
    "7": "curiosity",
    "8": "desire",
    "9": "disappointment",
    "10": "disapproval",
    "11": "disgust",
    "12": "embarrassment",
    "13": "excitement",
    "14": "fear",
    "15": "gratitude",
    "16": "grief",
    "17": "joy",
    "18": "love",
    "19": "nervousness",
    "20": "optimism",
    "21": "pride",
    "22": "realization",
    "23": "relief",
    "24": "remorse",
    "25": "sadness",
    "26": "surprise",
    "27": "neutral",
}

In [None]:
# Define helper functions for converting the dataset from multi-label to single-label
def invert_dict(d):
    """Turn a ``{key: [values]}`` mapping into a flat ``{value: key}`` mapping.

    If the same value is listed under several keys, the key that comes last
    in iteration order wins.
    """
    return {member: group for group, members in d.items() for member in members}

def check_list_repeat(lst):
    """Return True when every element of ``lst`` equals its first element.

    An empty list counts as True. Elements that are not equal to themselves
    (such as NaN) make the result False.
    """
    if not lst:
        return True
    first = lst[0]
    for item in lst:
        if not (item == first):
            return False
    return True

def remap_go_emo(df):
    """Convert a GoEmotions split to the unified single-label format.

    Each row's comma-separated label ids are translated to emotion names via
    ``text_emotions`` and then to unified labels via the inverse of
    ``ekman_emo_map`` (plus 'neutral' -> 'neutral'). A row is kept only when
    all of its labels map to the same unified label, as decided by
    ``check_list_repeat``.

    Args:
        df: DataFrame with at least 'text' and 'labels' columns, where
            'labels' holds comma-separated label ids. It is not modified.

    Returns:
        A new DataFrame with columns ['labels', 'text', 'original_labels'],
        where 'labels' is the single unified label and 'original_labels' is
        the list of original emotion names. The surviving rows keep their
        index values from ``df``.
    """
    # Copy first: assigning into the caller's frame would overwrite its raw
    # 'labels' column and make a second call on the same frame fail.
    df = df.copy()

    # Split labels into lists of label-id strings. astype(str) keeps this
    # working when every row has a single id and the column was read as int.
    df['labels'] = df['labels'].astype(str).str.split(',')

    # Map label indices to emotion text (regrouped per row by index level).
    df['labels'] = df['labels'].explode().map(text_emotions).groupby(level=0).agg(list)

    # Save original labels
    df['original_labels'] = df['labels'].copy()

    # Invert emotion mapping: fine-grained emotion -> unified label
    emo_map = invert_dict(ekman_emo_map)
    emo_map['neutral'] = 'neutral'

    # Map emotions to unified labels
    df['labels'] = df['labels'].explode().map(emo_map).groupby(level=0).agg(list)

    # Keep only records whose labels all collapse to one unified label.
    is_single_label = df['labels'].apply(check_list_repeat).astype(bool)

    # Select and reorder in one step; .copy() gives an independent frame so
    # the assignment below does not raise SettingWithCopyWarning.
    df = df.loc[is_single_label, ['labels', 'text', 'original_labels']].copy()

    # Convert label lists to single label strings
    df['labels'] = df['labels'].apply(lambda unified: unified[0])

    return df

In [None]:
# Preprocess the datasets
new_emo_train = remap_go_emo(go_emo_train)
new_emo_test = remap_go_emo(go_emo_test)
new_emo_dev = remap_go_emo(go_emo_dev)

In [None]:
# Save the processed datasets
emo_tr_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/go_emotions_train_unified.csv'
new_emo_train.to_csv(emo_tr_save, index=False)

emo_tst_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/go_emotions_test_unified.csv'
new_emo_test.to_csv(emo_tst_save, index=False)

emo_dev_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/go_emotions_dev_unified.csv'
new_emo_dev.to_csv(emo_dev_save, index=False)

In [None]:
display(new_emo_train)
display(new_emo_test)
display(new_emo_dev)

Unnamed: 0,labels,text,original_labels
0,neutral,My favourite food is anything I didn't have to...,[neutral]
1,neutral,"Now if he does off himself, everyone will thin...",[neutral]
2,anger,WHY THE FUCK IS BAYLESS ISOING,[anger]
3,fear,To make her feel threatened,[fear]
4,anger,Dirty Southern Wankers,[annoyance]
...,...,...,...
43405,love,Added you mate well I’ve just got the bow and ...,[love]
43406,surprise,Always thought that was funny but is it a refe...,[confusion]
43407,anger,What are you talking about? Anything bad that ...,[annoyance]
43408,joy,"More like a baptism, with sexy results!",[excitement]


Unnamed: 0,labels,text,original_labels
0,sadness,I’m really sorry about your situation :( Altho...,[sadness]
1,joy,It's wonderful because it's awful. At not with.,[admiration]
2,joy,"Kings fan here, good luck to you guys! Will be...",[excitement]
3,joy,"I didn't know that, thank you for teaching me ...",[gratitude]
4,neutral,They got bored from haunting earth for thousan...,[neutral]
...,...,...,...
5422,joy,Thanks. I was diagnosed with BP 1 after the ho...,[gratitude]
5423,joy,Well that makes sense.,[approval]
5424,neutral,Daddy issues [NAME],[neutral]
5425,joy,So glad I discovered that subreddit a couple m...,[admiration]


Unnamed: 0,labels,text,original_labels
0,neutral,Is this in New Orleans?? I really feel like th...,[neutral]
2,sadness,I've never been this sad in my life!,[sadness]
4,joy,He could have easily taken a real camera from ...,[optimism]
5,joy,"Thank you for your vote of confidence, but we ...",[gratitude]
6,anger,Wah Mum other people call me on my bullshit an...,[anger]
...,...,...,...
5421,fear,It's pretty dangerous when the state decides w...,[fear]
5422,joy,I filed for divorce this morning. Hoping he mo...,[optimism]
5423,anger,"The last time it happened I just said, ""No"" an...",[disapproval]
5424,anger,I can’t stand this arrogant prick he’s no bett...,[annoyance]


In [None]:
print(new_emo_train['labels'].value_counts())
print(new_emo_test['labels'].value_counts())
print(new_emo_dev['labels'].value_counts())

labels
joy         13295
neutral     12823
anger        4293
surprise     3858
sadness      2326
love         1427
fear          541
disgust       498
Name: count, dtype: int64
labels
joy         1649
neutral     1606
anger        572
surprise     488
sadness      283
love         160
fear          80
disgust       76
Name: count, dtype: int64
labels
joy         1713
neutral     1592
anger        555
surprise     459
sadness      266
love         173
fear          72
disgust       61
Name: count, dtype: int64


##Merged

In [None]:
new_emo_train['source'] = 'train'
new_emo_test['source'] = 'test'
new_emo_dev['source'] = 'dev'
go_emo_merged = pd.concat([new_emo_train, new_emo_test, new_emo_dev])

In [None]:
# Save the merged dataset
emo_merged_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/go_emotions_merged_unified.csv'
go_emo_merged.to_csv(emo_merged_save, index=False)

In [None]:
display(go_emo_merged)

Unnamed: 0,labels,text,original_labels,source
0,neutral,My favourite food is anything I didn't have to...,[neutral],train
1,neutral,"Now if he does off himself, everyone will thin...",[neutral],train
2,anger,WHY THE FUCK IS BAYLESS ISOING,[anger],train
3,fear,To make her feel threatened,[fear],train
4,anger,Dirty Southern Wankers,[annoyance],train
...,...,...,...,...
5421,fear,It's pretty dangerous when the state decides w...,[fear],dev
5422,joy,I filed for divorce this morning. Hoping he mo...,[optimism],dev
5423,anger,"The last time it happened I just said, ""No"" an...",[disapproval],dev
5424,anger,I can’t stand this arrogant prick he’s no bett...,[annoyance],dev


In [None]:
print(go_emo_merged['labels'].value_counts())

labels
joy         16657
neutral     16021
anger        5420
surprise     4805
sadness      2875
love         1760
fear          693
disgust       635
Name: count, dtype: int64


# Good News Everyone

In [None]:
# Load the GoodNewsEveryone annotations (tab-separated file) and show them.
gne_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/goodnewseveryone.tsv'
gne = pd.read_csv(gne_path, sep='\t')
display(gne)

Unnamed: 0,id,headline,url,bias_horizontal,bias_vertical,country,source,dominant_emotion,intensity,cause,experiencer,target,cue,other_emotions,reader_emotions
0,7d3fe468,Cops in One Village Have Been Convicted of 70 ...,http://tracking.feedpress.it/link/9499/12677255,-5.0,46.0,US,ProPublica,anger,medium,"[[""cops in one village have been convicted of ...","[[""cops""]]","[[""cops in one village have been convicted of ...","[[""convicted""]]",annoyance,annoyance
1,86693d59,DIY penis enlargements are a 'nationwide probl...,https://www.dailymail.co.uk/news/article-69149...,13.0,19.0,UK,dailymail,negative_surprise,medium,"[[""diy penis enlargements are a 'nationwide pr...","[[""papua new guinea""]]","[[""diy penis enlargements are a 'nationwide pr...","[[""problem""]]",negative_surprise,no_i_did_not_feel_any_emotion
2,0fb40e90,Dam breaking: New Epstein accuser comes forward,https://hotair.com/archives/ed-morrissey/2019/...,,,US,HotAir,anger,medium,"[[""new epstein accuser comes forward""]]",[[]],"[[""new epstein accuser comes forward""]]","[[""accuser""]]",no_other_additional_emotions_are_expressed,no_i_did_not_feel_any_emotion
3,fa7750d6,David Beckham gets six-month driving ban for u...,https://www.theguardian.com/football/2019/may/...,-6.0,48.0,UK,TheGuardian,negative_surprise,medium,"[[""gets six-month driving ban for using phone ...","[[""david beckham""]]","[[""driving ban""]]","[[""ban""]]",no_other_additional_emotions_are_expressed,no_i_did_not_feel_any_emotion
4,695ea7a2,Dead sea turtle found with spear through head ...,https://www.foxnews.com/science/sea-turtle-spe...,27.0,20.0,US,FoxNews,sadness,medium,"[[""dead sea turtle found with spear through he...","[[""florida national park""]]","[[""dead sea turtle found with spear through he...","[[""dead""]]",no_other_additional_emotions_are_expressed,sadness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,3cb09e5b,‘NOW do abortion. We’ll wait.’ Ilhan Omar doub...,https://twitchy.com/samj-3930/2019/06/16/now-d...,29.0,14.0,US,Twitchy,negative_surprise,medium,"[[""on her gun violence claim and only makes he...","[[""ilhan omar""]]","[[""ilhan omar""]]","[[""violence"",""doubles down""]]",negative_surprise,no_i_did_not_feel_any_emotion
4996,86f400a6,‘Triad’ Thugs Use Clubs to Punish Hong Kong’s ...,https://www.thedailybeast.com/in-hong-kong-tri...,-21.0,41.0,US,dailybeast,negative_surprise,medium,"[[""‘triad’ thugs use clubs to punish hong kong...",[[]],"[[""‘triad’ thugs use clubs to punish hong kong...","[[""punish""]]",no_other_additional_emotions_are_expressed,no_i_did_not_feel_any_emotion
4997,99ca38ca,'For real': High school football season begins...,https://www.theadvocate.com/baton_rouge/sports...,-23.0,40.0,US,TheAdvocate,positive_anticipation_including_optimism,medium,"[[""season begins this week with expectations, ...","[[""high school football""]]","[[""season begins this week with expectations, ...","[[""excitement""]]",positive_surprise,no_i_did_not_feel_any_emotion
4998,f4ae5a03,100 years on: the picture that changed our vie...,https://www.theguardian.com/science/2019/may/1...,-6.0,48.0,UK,TheGuardian,positive_surprise,medium,"[[""the picture that changed our view of the un...",[[]],"[[""the picture that changed our view of the un...","[[""changed""]]",annoyance,annoyance


In [None]:
# Keep only the headline and its dominant emotion, renamed to the unified
# 'text' / 'labels' column names.
gne_columns = {'headline': 'text', 'dominant_emotion': 'labels'}
gne = gne[list(gne_columns)].rename(columns=gne_columns)

In [None]:
# Map emotions to a unified set.
# The mapping is written grouped by unified label: each list holds the source
# labels that are folded into that label.
gne_sources_by_label = {
    'surprise': ['negative_surprise', 'positive_surprise'],
    'anger': ['annoyance', 'shame'],
    'anticipation': [
        'negative_anticipation_including_pessimism',
        'positive_anticipation_including_optimism',
    ],
    'sadness': ['guilt'],
    'joy': ['pride'],
    'love': ['love_including_like'],
    'trust': ['trust'],
}
# Invert the grouping into the flat {source label: unified label} lookup.
gne_map = {
    source: unified
    for unified, sources in gne_sources_by_label.items()
    for source in sources
}
# Labels that are not keys of gne_map are left unchanged by replace().
gne['labels'] = gne['labels'].replace(gne_map)

In [None]:
# Discard headlines whose 'labels' value is missing (NaN).
gne = gne[gne['labels'].notna()]

In [None]:
# Unified layout: exactly two columns, 'labels' first and 'text' second.
gne = gne.loc[:, ['labels', 'text']]

In [None]:
# Save the processed DataFrame
# NOTE(review): this writes under 'John/john ufd', whereas the GoEmotions and
# StackOV cells in this notebook write under 'data/formatted data' - confirm
# that this is the intended output folder.
gne_save = '/content/drive/My Drive/Emotion Data Analysis Project/John/john ufd/goodnewseveryone_ufd_single.csv'
gne.to_csv(gne_save, index=False)

In [None]:
# Preview the processed GoodNewsEveryone frame.
display(gne)

Unnamed: 0,labels,text
0,anger,Cops in One Village Have Been Convicted of 70 ...
1,surprise,DIY penis enlargements are a 'nationwide probl...
2,anger,Dam breaking: New Epstein accuser comes forward
3,surprise,David Beckham gets six-month driving ban for u...
4,sadness,Dead sea turtle found with spear through head ...
...,...,...
4995,surprise,‘NOW do abortion. We’ll wait.’ Ilhan Omar doub...
4996,surprise,‘Triad’ Thugs Use Clubs to Punish Hong Kong’s ...
4997,anticipation,'For real': High school football season begins...
4998,surprise,100 years on: the picture that changed our vie...


In [None]:
# Count the rows per unified label and keep the result in label_counts.
label_counts = gne['labels'].value_counts()

print(label_counts)

labels
surprise        1475
anger           1005
anticipation     642
sadness          561
fear             419
disgust          382
joy              329
trust            124
love              63
Name: count, dtype: int64


In [None]:
# Print the label distribution of gne (recomputed here rather than reusing label_counts).
print(gne['labels'].value_counts())

labels
surprise        1475
anger           1005
anticipation     642
sadness          561
fear             419
disgust          382
joy              329
trust            124
love              63
Name: count, dtype: int64


# StackOV-GS

In [None]:
# This file is given as an Excel spreadsheet with multiple sheets/tabs, one
# per emotion. Passing a list to sheet_name parses the workbook once and
# returns a dict of DataFrames keyed by sheet name, instead of re-opening the
# file for every sheet.

stackOV_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/Emotions_GoldStandard_andAnnotation.xlsx'
stackOV_sheets = pd.read_excel(
    stackOV_path,
    sheet_name=['Love_all', 'Joy_all', 'Surprise_all', 'Anger_all', 'Sadness_all', 'Fear_all'],
)
stackOV_love = stackOV_sheets['Love_all']
stackOV_joy = stackOV_sheets['Joy_all']
stackOV_surprise = stackOV_sheets['Surprise_all']
stackOV_anger = stackOV_sheets['Anger_all']
stackOV_sadness = stackOV_sheets['Sadness_all']
stackOV_fear = stackOV_sheets['Fear_all']
display(stackOV_love)

  warn(msg)


Unnamed: 0,Group,Set,Unnamed: 2,Text,rater 1,rater 2,rater 3,Gold Label,Unnamed: 8,Unnamed: 9
0,A,Second,1,SVG transform on text attribute works excellen...,X,,X,LOVE,,
1,A,Second,2,Excellent! This is exactly what I needed. Thanks!,X,x,X,LOVE,,
2,A,Second,3,Have added a modern solution as of May 2014 in...,,,,,,
3,A,Second,4,Have you tried removing 'preload' attribute? (...,,,,,,
4,A,Second,5,"A smarter, entirely C++-way of doing what you ...",,,,,,
...,...,...,...,...,...,...,...,...,...,...
4795,D,Third,496,Yes - that feature is extremely useful for wri...,,,x,,,
4796,D,Third,497,"Works great! And you can add ""desc"" after the ...",x,,x,LOVE,,
4797,D,Third,498,"Yeah, I didn't know about the non-greedy thing...",x,,,,,
4798,D,Third,499,Fortunately I'm doing *very* little with Offic...,x,X,x,LOVE,,


In [None]:
# Pair each emotion label with the sheet that holds its annotations, giving a
# list of (label, DataFrame) tuples.
stackOV_labels = ['love', 'joy', 'surprise', 'anger', 'sadness', 'fear']
stackOV_frames = [
    stackOV_love,
    stackOV_joy,
    stackOV_surprise,
    stackOV_anger,
    stackOV_sadness,
    stackOV_fear,
]
dataframes = list(zip(stackOV_labels, stackOV_frames))

In [None]:
# Accumulator for the per-emotion frames produced by the loop below.
processed_dfs = []

In [None]:
# Start from an empty list every time this cell runs, so that re-running it
# does not append the same rows to processed_dfs a second time.
processed_dfs = []
for label, df in dataframes:
    # Keep only rows whose 'Gold Label' equals this sheet's emotion in upper
    # case (e.g. 'LOVE'), and only the 'Text' column.
    df = df.loc[df['Gold Label'] == label.upper(), ['Text']]
    # Rename to the unified schema and tag every row with the emotion label.
    df = df.rename(columns={'Text': 'text'}).assign(labels=label)
    # Reorder columns to have 'labels' first
    df = df[['labels', 'text']]
    # Append the processed DataFrame to the list
    processed_dfs.append(df)

In [None]:
# Concatenate all processed DataFrames into one
# ignore_index=True renumbers the rows 0..n-1 across the per-emotion frames.
stackOV = pd.concat(processed_dfs, ignore_index=True)

In [None]:
# Save the processed dataset
# index=False keeps the row index out of the CSV.
stackOV_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/stackOV_GoldLabels_unified.csv'
stackOV.to_csv(stackOV_save, index=False)

In [None]:
# Preview the combined StackOV frame.
display(stackOV)

Unnamed: 0,labels,text
0,love,SVG transform on text attribute works excellen...
1,love,Excellent! This is exactly what I needed. Thanks!
2,love,Customise toolbar above iPad keyboard adding a...
3,love,"Excellent tips, thanks! Additionally you can t..."
4,love,Excellent! Works exactly like I wanted. Thanks.
...,...,...
2969,fear,Is this valid ? what I'm really worried about ...
2970,fear,One of my tables in my SQL database has a grow...
2971,fear,I'm working on importing data from our applica...
2972,fear,Its not One is it? I have a method that gets f...


In [None]:
# Number of rows per label in the combined frame.
print(stackOV['labels'].value_counts())

labels
love        1220
anger        882
joy          491
sadness      230
fear         106
surprise      45
Name: count, dtype: int64


# TweetEval

In [None]:
# Load the raw text of the label mapping file; it is turned into a
# dictionary in a later step.
tweeteval_mapping_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/tweeteval/mapping.txt'
with open(tweeteval_mapping_path) as mapping_file:
    mapping_content = mapping_file.read()

In [None]:
# Each line of the mapping file is "<key><TAB><value>"; build the lookup
# dictionary from those pairs in a single pass.
tweeteval_map = dict(
    entry.strip().split('\t')
    for entry in mapping_content.strip().split('\n')
)

In [None]:
# Locations of the text and label files for the train, test, and validation
# splits; all six live in the same directory.
tweeteval_dir = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/tweeteval'

train_text_path = f'{tweeteval_dir}/train_text.txt'
train_labels_path = f'{tweeteval_dir}/train_labels.txt'

test_text_path = f'{tweeteval_dir}/test_text.txt'
test_labels_path = f'{tweeteval_dir}/test_labels.txt'

val_text_path = f'{tweeteval_dir}/val_text.txt'
val_labels_path = f'{tweeteval_dir}/val_labels.txt'

In [None]:
# Read each text file, splitting each line into its own record.
def _read_lines(path):
    """Return the contents of the file at *path* as a list of lines.

    The file is decoded as UTF-8 explicitly, because the tweets contain
    non-ASCII characters and the default encoding of open() depends on the
    platform. The text is split on the newline character only, so the text
    and label files are cut at the same places; str.splitlines() would also
    break on other Unicode line boundaries that may appear inside a tweet.
    A file ending in a newline yields a final empty string.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().split('\n')


train_text = _read_lines(train_text_path)
train_labels = _read_lines(train_labels_path)

test_text = _read_lines(test_text_path)
test_labels = _read_lines(test_labels_path)

val_text = _read_lines(val_text_path)
val_labels = _read_lines(val_labels_path)

In [None]:
# Build one frame per split: row i pairs entry i of the text list with
# entry i of the labels list.
tweeteval_train = pd.DataFrame(dict(text=train_text, labels=train_labels))
tweeteval_test = pd.DataFrame(dict(text=test_text, labels=test_labels))
tweeteval_val = pd.DataFrame(dict(text=val_text, labels=val_labels))

In [None]:
# Show the first rows of every split, in train / test / validation order.
for tweeteval_split in (tweeteval_train, tweeteval_test, tweeteval_val):
    display(tweeteval_split.head())

Unnamed: 0,text,labels
0,“Worry is a down payment on a problem you may ...,2
1,My roommate: it's okay that we can't spell bec...,0
2,No but that's so cute. Atsu was probably shy a...,1
3,Rooneys fucking untouchable isn't he? Been fuc...,0
4,it's pretty depressing when u hit pan on ur fa...,3


Unnamed: 0,text,labels
0,#Deppression is real. Partners w/ #depressed p...,3
1,@user Interesting choice of words... Are you c...,0
2,My visit to hospital for care triggered #traum...,3
3,@user Welcome to #MPSVT! We are delighted to h...,1
4,What makes you feel #joyful?,1


Unnamed: 0,text,labels
0,"@user @user Oh, hidden revenge and anger...I r...",0
1,if not then #teamchristine bc all tana has don...,0
2,Hey @user #Fields in #skibbereen give your onl...,0
3,Why have #Emmerdale had to rob #robron of havi...,0
4,@user I would like to hear a podcast of you go...,0


In [None]:
# Translate the label codes of every split into emotion names using
# tweeteval_map; each frame is updated in place.
for tweeteval_split in (tweeteval_train, tweeteval_test, tweeteval_val):
    tweeteval_split['labels'] = tweeteval_split['labels'].replace(tweeteval_map)

In [None]:
# Keep only the rows whose label is a non-empty string.
tweeteval_train = tweeteval_train.loc[tweeteval_train['labels'].ne('')]
tweeteval_test = tweeteval_test.loc[tweeteval_test['labels'].ne('')]
tweeteval_val = tweeteval_val.loc[tweeteval_val['labels'].ne('')]

In [None]:
# Fold the dataset's labels into the unified label set: 'optimism' becomes
# 'joy', every other label is left as it is.
tweeteval_unified_map = dict(optimism='joy')
for tweeteval_split in (tweeteval_train, tweeteval_test, tweeteval_val):
    # Keep the pre-mapping label in its own column before overwriting it.
    tweeteval_split['original labels'] = tweeteval_split['labels']
    tweeteval_split['labels'] = tweeteval_split['labels'].replace(tweeteval_unified_map)

In [None]:
# Output locations for the three unified splits.
train_save_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/tweeteval_train_unified.csv'
test_save_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/tweeteval_test_unified.csv'
val_save_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/tweeteval_val_unified.csv'

# Write each split to its CSV without the row index.
for tweeteval_split, tweeteval_split_path in (
    (tweeteval_train, train_save_path),
    (tweeteval_test, test_save_path),
    (tweeteval_val, val_save_path),
):
    tweeteval_split.to_csv(tweeteval_split_path, index=False)

In [None]:
# Show the first rows of every split after the unified mapping, in
# train / test / validation order.
for tweeteval_split in (tweeteval_train, tweeteval_test, tweeteval_val):
    display(tweeteval_split.head())

Unnamed: 0,text,labels,original labels
0,“Worry is a down payment on a problem you may ...,joy,optimism
1,My roommate: it's okay that we can't spell bec...,anger,anger
2,No but that's so cute. Atsu was probably shy a...,joy,joy
3,Rooneys fucking untouchable isn't he? Been fuc...,anger,anger
4,it's pretty depressing when u hit pan on ur fa...,sadness,sadness


Unnamed: 0,text,labels,original labels
0,#Deppression is real. Partners w/ #depressed p...,sadness,sadness
1,@user Interesting choice of words... Are you c...,anger,anger
2,My visit to hospital for care triggered #traum...,sadness,sadness
3,@user Welcome to #MPSVT! We are delighted to h...,joy,joy
4,What makes you feel #joyful?,joy,joy


Unnamed: 0,text,labels,original labels
0,"@user @user Oh, hidden revenge and anger...I r...",anger,anger
1,if not then #teamchristine bc all tana has don...,anger,anger
2,Hey @user #Fields in #skibbereen give your onl...,anger,anger
3,Why have #Emmerdale had to rob #robron of havi...,anger,anger
4,@user I would like to hear a podcast of you go...,anger,anger


In [None]:
# Print the label distribution of each split under its own heading.
tweeteval_count_reports = (
    ("Train label counts:", tweeteval_train),
    ("\nTest label counts:", tweeteval_test),
    ("\nValidation label counts:", tweeteval_val),
)
for report_heading, report_frame in tweeteval_count_reports:
    print(report_heading)
    print(report_frame['labels'].value_counts())

Train label counts:
labels
anger      1400
joy        1002
sadness     855
Name: count, dtype: int64

Test label counts:
labels
anger      558
joy        481
sadness    382
Name: count, dtype: int64

Validation label counts:
labels
anger      160
joy        125
sadness     89
Name: count, dtype: int64


## Merged

In [None]:
# Record, in a new 'source' column, which split every row belongs to.
for tweeteval_split, tweeteval_split_name in (
    (tweeteval_train, 'train'),
    (tweeteval_test, 'test'),
    (tweeteval_val, 'val'),
):
    tweeteval_split['source'] = tweeteval_split_name

In [None]:
# Merge the datasets
# Rows are stacked in train, test, val order; ignore_index=True renumbers
# them 0..n-1.
tweeteval_merged = pd.concat([tweeteval_train, tweeteval_test, tweeteval_val], ignore_index=True)

In [None]:
# Save the merged frame as CSV; index=False keeps the row index out of the file.
merged_save_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/formatted data/tweeteval_merged_unified.csv'
tweeteval_merged.to_csv(merged_save_path, index=False)

In [None]:
# Preview the first rows of the merged frame, then print its label distribution.
display(tweeteval_merged.head())
print("\nMerged label counts:")
print(tweeteval_merged['labels'].value_counts())

Unnamed: 0,text,labels,original labels,source
0,“Worry is a down payment on a problem you may ...,joy,optimism,train
1,My roommate: it's okay that we can't spell bec...,anger,anger,train
2,No but that's so cute. Atsu was probably shy a...,joy,joy,train
3,Rooneys fucking untouchable isn't he? Been fuc...,anger,anger,train
4,it's pretty depressing when u hit pan on ur fa...,sadness,sadness,train



Merged label counts:
labels
anger      2118
joy        1608
sadness    1326
Name: count, dtype: int64


# Universal Joy

In [None]:
# Load the Universal Joy CSV and show it.
uni_joy_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/universal_joy_eng.csv'
uni_joy = pd.read_csv(uni_joy_path)
display(uni_joy)

Unnamed: 0,text,emotion,language
0,"[PERSON] , is one of my favorite people. [PHOT...",joy,en
1,[PHOTO] STARTED FROM THE BOTTOM NOW I'M HERE !...,anticipation,en
2,UNWRAPPED TOY is all you need to get in! OPEN ...,anticipation,en
3,"If anyone's in Coburg tonight, my old friend [...",anticipation,en
4,So super proud of my mom [PERSON] I could scre...,joy,en
...,...,...,...
282308,[PHOTO] After my interview at panda :) excited...,anticipation,en
282309,[PHOTO] NEW Ink I just got today. Done by my e...,joy,en
282310,[PHOTO] Ce sooooooir ! Unchained Mind � [WITH].,anticipation,en
282311,[PHOTO] THIS FRIDAY 00.00.000 (Rome,joy,en


In [None]:
# Rename 'emotion' to 'labels' and drop the 'language' column.
# The column to drop is passed as a list rather than a set literal: a list is
# the documented list-like form for drop(columns=...) and keeps a defined order.
uni_joy = uni_joy.rename(columns={"emotion": "labels"}).drop(columns=["language"])

In [None]:
# Unified layout: exactly two columns, 'labels' first and 'text' second.
uni_joy = uni_joy.loc[:, ['labels', 'text']]

In [None]:
# Save the processed dataset
# NOTE(review): this writes under 'John/john ufd', whereas the GoEmotions,
# StackOV and TweetEval cells in this notebook write under
# 'data/formatted data' - confirm that this is the intended output folder.
uni_joy_save = '/content/drive/My Drive/Emotion Data Analysis Project/John/john ufd/universal_joy_ufd_single.csv'
uni_joy.to_csv(uni_joy_save, index=False)

In [None]:
# Preview the processed Universal Joy frame.
display(uni_joy)

Unnamed: 0,labels,text
0,joy,"[PERSON] , is one of my favorite people. [PHOT..."
1,anticipation,[PHOTO] STARTED FROM THE BOTTOM NOW I'M HERE !...
2,anticipation,UNWRAPPED TOY is all you need to get in! OPEN ...
3,anticipation,"If anyone's in Coburg tonight, my old friend [..."
4,joy,So super proud of my mom [PERSON] I could scre...
...,...,...
282308,anticipation,[PHOTO] After my interview at panda :) excited...
282309,joy,[PHOTO] NEW Ink I just got today. Done by my e...
282310,anticipation,[PHOTO] Ce sooooooir ! Unchained Mind � [WITH].
282311,joy,[PHOTO] THIS FRIDAY 00.00.000 (Rome


In [None]:
# Number of rows per label in the full dataset.
print(uni_joy['labels'].value_counts())

labels
anticipation    115000
joy             110446
sadness          36905
anger            16726
fear              3236
Name: count, dtype: int64


## Sample

In [None]:
# Sample 30% of the entries from each label category.
# DataFrameGroupBy.sample draws the per-label sample directly, so the
# groupby(...).apply(...) call that triggered a pandas warning (visible in
# this cell's earlier output) is no longer needed. The fixed random_state
# makes the draw repeatable, so re-running the cell writes the same sample
# instead of overwriting the CSV with a different one.
uni_joy_samp = uni_joy.groupby('labels').sample(frac=0.3, random_state=42)

# Save the sampled dataset
uni_samp_save = '/content/drive/My Drive/Emotion Data Analysis Project/John/john ufd/universal_joy_sample_ufd_single.csv'
uni_joy_samp.to_csv(uni_samp_save, index=False)

  uni_joy_samp = uni_joy.groupby('labels', group_keys=False).apply(lambda x: x.sample(frac=0.3))


In [None]:
# Preview the sampled frame.
display(uni_joy_samp)

Unnamed: 0,labels,text
126966,anger,THIS! I HATE THIS! I HATE THIS!
225367,anger,[PHOTO] hey friends .... I just wanted to anno...
24716,anger,[PHOTO] What u bro...... u no everyone boiling...
121277,anger,"it and destroyed all the Gods, and Goddess (00..."
17648,anger,of your choice after your recovery time is up....
...,...,...
124834,sadness,[PHOTO] BREAKING NEWS : EGOISTIC RACIST CAPITA...
124126,sadness,fare not at all fare... Please don't scroll do...
238576,sadness,[PHOTO] Rajasthan assembelly 0000 Congress [PE...
156279,sadness,do we Sri Lankans trying so hard to prove that...


In [None]:
# Number of rows per label in the sample.
print(uni_joy_samp['labels'].value_counts())

labels
anticipation    34500
joy             33134
sadness         11072
anger            5018
fear              971
Name: count, dtype: int64


# WASSA-21

In [None]:
# Source CSVs for the train, test, and dev splits.
wassa_train_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/WASSA_train_all.csv'
wassa_test_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/WASSA_test_all.csv'
wassa_dev_path = '/content/drive/My Drive/Emotion Data Analysis Project/data/english/WASSA_dev_all.csv'

# Load every split into its own frame.
wassa_train = pd.read_csv(wassa_train_path)
wassa_test = pd.read_csv(wassa_test_path)
wassa_dev = pd.read_csv(wassa_dev_path)

# Show the raw splits in train / test / dev order.
for wassa_split in (wassa_train, wassa_test, wassa_dev):
    display(wassa_split)

Unnamed: 0,essay,emotion
0,it is really diheartening to read about these ...,0
1,the phone lines from the suicide prevention li...,0
2,"no matter what your heritage, you should be ab...",6
3,it is frightening to learn about all these sha...,2
4,the eldest generation of russians aren't being...,0
...,...,...
1855,"These days, women are winning in sports. There...",1
1856,I hate ISIS. They are a group full of hate and...,3
1857,This is just disgusting. I cannot believe that...,4
1858,I feel like the world is so corrupt that it no...,0


Unnamed: 0,essay,emotion,Unnamed: 2,Unnamed: 3
0,"Hello Friend, i am writing to you as regards a...",0,,
1,Hello friend i will like to tell you that Indi...,1,,
2,Hello friend I will like to let you know Leona...,6,,
3,"Hello friend, I will like to tell you Qatar lo...",6,,
4,"Dear friend, I will like to know that Trump’s ...",6,,
...,...,...,...,...
520,"Hey, I've always liked Billy Bob Thornton and ...",1,,
521,The fact that Donald Trump just peaced out of ...,3,,
522,I'm not okay! How can anyone harm something so...,0,,
523,I don't know anything outside of this article ...,0,,


Unnamed: 0,essay,emotion
0,The story about the air strikes is very sadden...,0
1,It is clear that climate change is something t...,0
2,I did not know this comedian but thinking abou...,2
3,I am an affirmed believer that your punishment...,3
4,"Okay, I hate hearing about disease outbreaks. ...",3
...,...,...
265,"While I feel bad for the arrest charges, I don...",3
266,This is such a tragedy. All I can think about...,2
267,I'm still on the fence regarding climate chang...,6
268,I can understand why the Jewish community woul...,1


In [None]:
# Bring every split to the unified layout: 'emotion' -> 'labels',
# 'essay' -> 'text', and nothing but those two columns (any other column in
# a split is discarded by the selection).
wassa_columns = {'emotion': 'labels', 'essay': 'text'}
wassa_keep = ['labels', 'text']
wassa_train = wassa_train.rename(columns=wassa_columns).loc[:, wassa_keep]
wassa_test = wassa_test.rename(columns=wassa_columns).loc[:, wassa_keep]
wassa_dev = wassa_dev.rename(columns=wassa_columns).loc[:, wassa_keep]

In [None]:
# Emotion names listed in label-code order: position i is the name for code i
# (0 = sadness ... 6 = joy).
wassa_dict = dict(enumerate([
    'sadness',
    'neutral',
    'fear',
    'anger',
    'disgust',
    'surprise',
    'joy',
]))
# Replace the codes with their names in every split, in place.
for wassa_split in (wassa_train, wassa_test, wassa_dev):
    wassa_split['labels'] = wassa_split['labels'].replace(wassa_dict)

In [None]:
# Output locations for the three processed splits.
wassa_train_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/unified formatted data/wassa_train_unified.csv'
wassa_test_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/unified formatted data/wassa_test_unified.csv'
wassa_dev_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/unified formatted data/wassa_dev_unified.csv'

# Write each split to its CSV without the row index.
for wassa_split, wassa_split_save in (
    (wassa_train, wassa_train_save),
    (wassa_test, wassa_test_save),
    (wassa_dev, wassa_dev_save),
):
    wassa_split.to_csv(wassa_split_save, index=False)

In [None]:
# Show the processed splits in train / test / dev order.
for wassa_split in (wassa_train, wassa_test, wassa_dev):
    display(wassa_split)

Unnamed: 0,labels,text
0,sadness,it is really diheartening to read about these ...
1,sadness,the phone lines from the suicide prevention li...
2,joy,"no matter what your heritage, you should be ab..."
3,fear,it is frightening to learn about all these sha...
4,sadness,the eldest generation of russians aren't being...
...,...,...
1855,neutral,"These days, women are winning in sports. There..."
1856,anger,I hate ISIS. They are a group full of hate and...
1857,disgust,This is just disgusting. I cannot believe that...
1858,sadness,I feel like the world is so corrupt that it no...


Unnamed: 0,labels,text
0,sadness,"Hello Friend, i am writing to you as regards a..."
1,neutral,Hello friend i will like to tell you that Indi...
2,joy,Hello friend I will like to let you know Leona...
3,joy,"Hello friend, I will like to tell you Qatar lo..."
4,joy,"Dear friend, I will like to know that Trump’s ..."
...,...,...
520,neutral,"Hey, I've always liked Billy Bob Thornton and ..."
521,anger,The fact that Donald Trump just peaced out of ...
522,sadness,I'm not okay! How can anyone harm something so...
523,sadness,I don't know anything outside of this article ...


Unnamed: 0,labels,text
0,sadness,The story about the air strikes is very sadden...
1,sadness,It is clear that climate change is something t...
2,fear,I did not know this comedian but thinking abou...
3,anger,I am an affirmed believer that your punishment...
4,anger,"Okay, I hate hearing about disease outbreaks. ..."
...,...,...
265,anger,"While I feel bad for the arrest charges, I don..."
266,fear,This is such a tragedy. All I can think about...
267,joy,I'm still on the fence regarding climate chang...
268,neutral,I can understand why the Jewish community woul...


In [None]:
# Print the label distribution of each split, in train / test / dev order.
for wassa_split in (wassa_train, wassa_test, wassa_dev):
    print(wassa_split['labels'].value_counts())

labels
sadness     647
anger       349
joy         275
fear        194
surprise    164
disgust     149
neutral      82
Name: count, dtype: int64
labels
sadness     177
anger       122
fear         70
joy          55
surprise     40
neutral      33
disgust      28
Name: count, dtype: int64
labels
sadness     98
anger       76
fear        31
joy         25
neutral     14
surprise    14
disgust     12
Name: count, dtype: int64


## Merged

In [None]:
# Record, in a new 'source' column, which split every row belongs to.
for wassa_split, wassa_split_name in (
    (wassa_train, 'train'),
    (wassa_test, 'test'),
    (wassa_dev, 'dev'),
):
    wassa_split['source'] = wassa_split_name

In [None]:
# Concatenate the datasets into one DataFrame
# Rows are stacked in train, test, dev order; ignore_index=True renumbers
# them 0..n-1.
wassa_merged = pd.concat([wassa_train, wassa_test, wassa_dev], ignore_index=True)

In [None]:
# Save the merged dataset
# index=False keeps the row index out of the CSV.
wassa_merged_save = '/content/drive/My Drive/Emotion Data Analysis Project/data/unified formatted data/wassa_merged_unified.csv'
wassa_merged.to_csv(wassa_merged_save, index=False)

In [None]:
# Preview the merged WASSA frame.
display(wassa_merged)

Unnamed: 0,labels,text,source
0,sadness,it is really diheartening to read about these ...,train
1,sadness,the phone lines from the suicide prevention li...,train
2,joy,"no matter what your heritage, you should be ab...",train
3,fear,it is frightening to learn about all these sha...,train
4,sadness,the eldest generation of russians aren't being...,train
...,...,...,...
2650,anger,"While I feel bad for the arrest charges, I don...",dev
2651,fear,This is such a tragedy. All I can think about...,dev
2652,joy,I'm still on the fence regarding climate chang...,dev
2653,neutral,I can understand why the Jewish community woul...,dev


In [None]:
# Number of rows per label in the merged frame.
print(wassa_merged['labels'].value_counts())

labels
sadness     922
anger       547
joy         355
fear        295
surprise    218
disgust     189
neutral     129
Name: count, dtype: int64
