In [47]:
import os

# Base folder of the StackOverflow dataset and the sub-folder holding the
# per-label CSV exports.  Both keep their trailing "/" because paths are built
# from them by plain string concatenation.
relative_path = "../original_datasets/StackOverflow/"
labels_path = "exports_from_xlsx/"
labels_dir = relative_path + labels_path

# Collect the per-label CSV files.
# - os.scandir() yields entries in arbitrary order, so the names are sorted to
#   make the processing order identical on every run and filesystem.
# - Anything that is not a regular ".csv" file (sub-folders, OS/editor
#   artefacts, ...) is skipped instead of breaking the label parsing below.
# - The "with" block releases the directory handle deterministically.
with os.scandir(labels_dir) as entries:
    filenames = sorted(
        entry.name
        for entry in entries
        if entry.is_file() and entry.name.lower().endswith(".csv")
    )

# Map each label to the path of its CSV file.
# File names follow "<word>_<word>_<label>.csv" (e.g. "stack_overflow_anger.csv"):
# the label is everything after the second underscore, without the extension.
label_files = {}
for filename in filenames:
    stem = os.path.splitext(filename)[0]
    name_parts = stem.split("_", 2)
    if len(name_parts) < 3 or not name_parts[2]:
        raise ValueError(
            f"Cannot derive a label from {filename!r}: "
            "expected a name like 'stack_overflow_<label>.csv'"
        )
    label_files[name_parts[2]] = labels_dir + filename

for label, filename in label_files.items():
    print(f"{label}: {filename}")

anger: ../original_datasets/StackOverflow/records_by_label/stack_overflow_anger.csv
fear: ../original_datasets/StackOverflow/records_by_label/stack_overflow_fear.csv
joy: ../original_datasets/StackOverflow/records_by_label/stack_overflow_joy.csv
love: ../original_datasets/StackOverflow/records_by_label/stack_overflow_love.csv
sadness: ../original_datasets/StackOverflow/records_by_label/stack_overflow_sadness.csv
surprise: ../original_datasets/StackOverflow/records_by_label/stack_overflow_surprise.csv


# Load datasets and remove duplicates

In [48]:
import pandas as pd

# label -> dataframe of that label's rows (labeled only, duplicates removed)
label_dataframes = {}
# Per-label frames are collected here and concatenated once after the loop;
# growing a dataframe with pd.concat inside the loop re-copies all previous
# rows on every iteration and starts from an empty frame, which pandas warns
# about.
deduplicated_frames = []
total = 0

for label, filename in label_files.items():
    # Get the data into a dataframe
    raw_df = pd.read_csv(filename, encoding='ISO-8859-1')

    # Get rid of unlabeled entries
    labeled_df = raw_df.dropna(subset=['Gold Label'])

    # Get label count (taken BEFORE de-duplication so it can be compared with
    # the counts reported in the paper, listed below)
    print(f"{label}: {len(labeled_df)}")
    total += len(labeled_df)

    # Remove second+ copies of an entry in a label
    label_dataframes[label] = labeled_df.drop_duplicates(subset=['Text'], keep='first')

    # Queue for the combined labels dataframe
    deduplicated_frames.append(label_dataframes[label])

# Combined dataframe of all labels; the original row indices are preserved.
# pd.concat() rejects an empty list, so fall back to an empty frame when no
# label files were found.
stack_overflow_df = pd.concat(deduplicated_frames) if deduplicated_frames else pd.DataFrame()

print(f"TOTAL: {total}")
# Per paper, label counts should be:
# Love :   1220
# Joy :     491
# Surprise : 45
# Anger :   882
# Sadness : 230
# Fear :    106
# TOTAL :  2974

anger: 882
fear: 106
joy: 491
love: 1220
sadness: 230
surprise: 45
TOTAL: 2974


In [49]:
# Tidy the combined dataframe: name the columns, discard the two placeholder
# columns and order the rows by Group / Set / Id.

# Positional rename: the list has to supply one name per existing column.
# 'Empty1' and 'Empty2' only exist so the two trailing columns can be dropped
# by name right below.
column_names = [
    'Group', 'Set', 'Id', 'Text',
    'Rater 1', 'Rater 2', 'Rater 3',
    'Gold Label',
    'Empty1', 'Empty2',
]
stack_overflow_df.columns = column_names

stack_overflow_df = (
    stack_overflow_df
    .drop(columns=['Empty1', 'Empty2'])
    .sort_values(by=['Group', 'Set', 'Id'], ascending=True)
)

In [50]:
# A text that still occurs more than once at this point was filed under
# different labels, so none of its copies can be trusted: discard every one of
# them (keep=False marks all occurrences, not just the repeats).
has_conflicting_copy = stack_overflow_df.duplicated(subset=['Text'], keep=False)
stack_overflow_df = stack_overflow_df[~has_conflicting_copy].copy()

stack_overflow_df

Unnamed: 0,Group,Set,Id,Text,Rater 1,Rater 2,Rater 3,Gold Label
1100,A,Pilot,1.0,"Vineet, what you are trying to do is a terribl...",X,X,x,ANGER
1101,A,Pilot,2.0,"'Course I do, corrected.",X,X,x,JOY
1103,A,Pilot,4.0,@DrabJay: excellent suggestion! Code changed. :-),X,X,x,JOY
1106,A,Pilot,7.0,"It is specific, I'm not asking for a discussio...",X,,x,ANGER
1107,A,Pilot,8.0,@talnicolas I'm using it a few dozen times in ...,X,x,,ANGER
...,...,...,...,...,...,...,...,...
4792,D,Third,493.0,Nick Rigby wrote an excellent article for A Li...,x,X,x,LOVE
4794,D,Third,495.0,"+1 from me, I loved the leap from MFC to Qt ;)",x,X,x,LOVE
4795,D,Third,496.0,Yes - that feature is extremely useful for wri...,x,X,,JOY
4796,D,Third,497.0,"Works great! And you can add ""desc"" after the ...",x,,x,LOVE


# Exporting

In [51]:
# Save the combined dataframe as a single CSV in the dataset's base folder;
# the dataframe index is not written to the file.
export_path = relative_path + "stack_overflow.csv"
stack_overflow_df.to_csv(export_path, index=False)

# Other cleaning? Exploring tokenization, very short texts, and colon usage

In [52]:
# Tokenizer used by the following cells to split each text into tokens.
# strip_handles=True makes NLTK's TweetTokenizer remove "@username" handles,
# so mentions such as "@DrabJay" do not show up in the token lists.
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)

In [53]:
# Tokenize every text and store the resulting token list in a 'Tokenized'
# column at position 4, i.e. directly after 'Text'.
# Series.map runs the tokenizer on the 'Text' column only; a row-wise
# DataFrame.apply(axis=1) builds a Series object for every row and returns a
# DataFrame (not a Series) when the frame is empty.
tokenized_texts = stack_overflow_df['Text'].map(tknzr.tokenize)

if 'Tokenized' in stack_overflow_df.columns:
    # The cell has been run before: refresh the column in place, because
    # insert() raises ValueError when the column already exists.
    stack_overflow_df['Tokenized'] = tokenized_texts
else:
    stack_overflow_df.insert(4, 'Tokenized', tokenized_texts)


stack_overflow_df

Unnamed: 0,Group,Set,Id,Text,Tokenized,Rater 1,Rater 2,Rater 3,Gold Label
1100,A,Pilot,1.0,"Vineet, what you are trying to do is a terribl...","[Vineet, ,, what, you, are, trying, to, do, is...",X,X,x,ANGER
1101,A,Pilot,2.0,"'Course I do, corrected.","[', Course, I, do, ,, corrected, .]",X,X,x,JOY
1103,A,Pilot,4.0,@DrabJay: excellent suggestion! Code changed. :-),"[:, excellent, suggestion, !, Code, changed, ....",X,X,x,JOY
1106,A,Pilot,7.0,"It is specific, I'm not asking for a discussio...","[It, is, specific, ,, I'm, not, asking, for, a...",X,,x,ANGER
1107,A,Pilot,8.0,@talnicolas I'm using it a few dozen times in ...,"[I'm, using, it, a, few, dozen, times, in, my,...",X,x,,ANGER
...,...,...,...,...,...,...,...,...,...
4792,D,Third,493.0,Nick Rigby wrote an excellent article for A Li...,"[Nick, Rigby, wrote, an, excellent, article, f...",x,X,x,LOVE
4794,D,Third,495.0,"+1 from me, I loved the leap from MFC to Qt ;)","[+, 1, from, me, ,, I, loved, the, leap, from,...",x,X,x,LOVE
4795,D,Third,496.0,Yes - that feature is extremely useful for wri...,"[Yes, -, that, feature, is, extremely, useful,...",x,X,,JOY
4796,D,Third,497.0,"Works great! And you can add ""desc"" after the ...","[Works, great, !, And, you, can, add, "", desc,...",x,,x,LOVE


In [54]:
# Store the number of tokens of each text in a 'Tokens' column at position 5,
# i.e. directly after 'Tokenized'.
# Series.map(len) measures the token lists column-wise instead of building a
# Series object per row with DataFrame.apply(axis=1).
token_counts = stack_overflow_df['Tokenized'].map(len)

if 'Tokens' in stack_overflow_df.columns:
    # The cell has been run before: refresh the column in place, because
    # insert() raises ValueError when the column already exists.
    stack_overflow_df['Tokens'] = token_counts
else:
    stack_overflow_df.insert(5, 'Tokens', token_counts)

stack_overflow_df

Unnamed: 0,Group,Set,Id,Text,Tokenized,Tokens,Rater 1,Rater 2,Rater 3,Gold Label
1100,A,Pilot,1.0,"Vineet, what you are trying to do is a terribl...","[Vineet, ,, what, you, are, trying, to, do, is...",61,X,X,x,ANGER
1101,A,Pilot,2.0,"'Course I do, corrected.","[', Course, I, do, ,, corrected, .]",7,X,X,x,JOY
1103,A,Pilot,4.0,@DrabJay: excellent suggestion! Code changed. :-),"[:, excellent, suggestion, !, Code, changed, ....",8,X,X,x,JOY
1106,A,Pilot,7.0,"It is specific, I'm not asking for a discussio...","[It, is, specific, ,, I'm, not, asking, for, a...",23,X,,x,ANGER
1107,A,Pilot,8.0,@talnicolas I'm using it a few dozen times in ...,"[I'm, using, it, a, few, dozen, times, in, my,...",47,X,x,,ANGER
...,...,...,...,...,...,...,...,...,...,...
4792,D,Third,493.0,Nick Rigby wrote an excellent article for A Li...,"[Nick, Rigby, wrote, an, excellent, article, f...",21,x,X,x,LOVE
4794,D,Third,495.0,"+1 from me, I loved the leap from MFC to Qt ;)","[+, 1, from, me, ,, I, loved, the, leap, from,...",14,x,X,x,LOVE
4795,D,Third,496.0,Yes - that feature is extremely useful for wri...,"[Yes, -, that, feature, is, extremely, useful,...",12,x,X,,JOY
4796,D,Third,497.0,"Works great! And you can add ""desc"" after the ...","[Works, great, !, And, you, can, add, "", desc,...",43,x,,x,LOVE


In [55]:
# Inspect the entries whose text tokenizes to fewer than 3 tokens.
is_very_short = stack_overflow_df['Tokens'] < 3
stack_overflow_df[is_very_short]

Unnamed: 0,Group,Set,Id,Text,Tokenized,Tokens,Rater 1,Rater 2,Rater 3,Gold Label
258,A,Second,259.0,#ERROR!,"[#ERROR, !]",2,X,x,X,LOVE
784,A,Third,185.0,Excellent answer,"[Excellent, answer]",2,X,x,X,LOVE
838,A,Third,239.0,totally awesome,"[totally, awesome]",2,X,x,X,LOVE
2389,B,Third,490.0,Congratulations!,"[Congratulations, !]",2,X,x,x,LOVE
4195,D,Second,496.0,Beaaaaaaautiful!,"[Beaaaaaaautiful, !]",2,x,,x,LOVE
4467,D,Third,168.0,goooooooooooooood!,"[goooooooooooooood, !]",2,x,X,x,JOY


In [56]:
# Inspect the entries whose text contains a colon but never the sequence
# colon + space (": ") anywhere in the text.
text_column = stack_overflow_df['Text']
has_colon = text_column.str.contains(":")
has_colon_then_space = text_column.str.contains(": ")
stack_overflow_df[has_colon & ~has_colon_then_space]

Unnamed: 0,Group,Set,Id,Text,Tokenized,Tokens,Rater 1,Rater 2,Rater 3,Gold Label
1138,A,Pilot,39.0,"Not for me, but I can suggest some possibiliti...","[Not, for, me, ,, but, I, can, suggest, some, ...",189,,X,x,ANGER
1145,A,Pilot,46.0,"Here's a secret. In the chrome dev tools, do a...","[Here's, a, secret, ., In, the, chrome, dev, t...",128,X,X,x,JOY
1168,A,Pilot,69.0,"I'm starting my first Drupal project, pretty e...","[I'm, starting, my, first, Drupal, project, ,,...",101,X,X,x,JOY
1170,A,Pilot,71.0,"It's not a homework, and since there is nowher...","[It's, not, a, homework, ,, and, since, there,...",27,X,X,x,JOY
1180,A,Pilot,81.0,Excellent point! :),"[Excellent, point, !, :)]",4,X,X,x,JOY
...,...,...,...,...,...,...,...,...,...,...
4682,D,Third,383.0,"+1 for the docbkx Maven plugin, it provides ev...","[+, 1, for, the, docbkx, Maven, plugin, ,, it,...",37,x,X,x,LOVE
4689,D,Third,390.0,Great question! There have already been a few ...,"[Great, question, !, There, have, already, bee...",15,x,,x,LOVE
4726,D,Third,427.0,pr0n for sure :))))))) (kidding Jonathan),"[pr0n, for, sure, :), ), ), (, kidding, Jonath...",10,x,X,x,JOY
4786,D,Third,487.0,@StoneFree welcome to the future brave time tr...,"[welcome, to, the, future, brave, time, travel...",9,,X,x,JOY


In [57]:
# Inspect every entry whose text contains a colon anywhere.
has_colon = stack_overflow_df['Text'].str.contains(":")
stack_overflow_df[has_colon]

Unnamed: 0,Group,Set,Id,Text,Tokenized,Tokens,Rater 1,Rater 2,Rater 3,Gold Label
1103,A,Pilot,4.0,@DrabJay: excellent suggestion! Code changed. :-),"[:, excellent, suggestion, !, Code, changed, ....",8,X,X,x,JOY
1116,A,Pilot,17.0,When I refactor the following line: using Resh...,"[When, I, refactor, the, following, line, :, u...",51,X,X,x,ANGER
1120,A,Pilot,21.0,"To push it to Heroku, you'll have to add it to...","[To, push, it, to, Heroku, ,, you'll, have, to...",71,,X,x,ANGER
1130,A,Pilot,31.0,"@njahnke: to me, it does not matter what they ...","[:, to, me, ,, it, does, not, matter, what, th...",43,X,X,x,ANGER
1134,A,Pilot,35.0,I absolutely hate cluttered seed files. Admitt...,"[I, absolutely, hate, cluttered, seed, files, ...",109,X,X,x,ANGER
...,...,...,...,...,...,...,...,...,...,...
4754,D,Third,455.0,I have a small WPF application that hosts a Ri...,"[I, have, a, small, WPF, application, that, ho...",84,x,X,x,LOVE
4777,D,Third,478.0,I have the following code for searching multip...,"[I, have, the, following, code, for, searching...",93,,X,x,JOY
4780,D,Third,481.0,"Dave, very nice!! I didn't know that was in th...","[Dave, ,, very, nice, !, !, I, didn't, know, t...",57,x,,x,LOVE
4786,D,Third,487.0,@StoneFree welcome to the future brave time tr...,"[welcome, to, the, future, brave, time, travel...",9,,X,x,JOY
