In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Read in the database and csv file
file_path = "./01-419FinalProjectDataset.csv"
df = pd.read_csv(file_path, index_col='ID')

#df

In [3]:
#Split the dataset into half for testing to see results.
#df = df.head(106)  #5 participant labelled data
#df = df.tail(167)  #2 participant labelled data

#df

In [4]:
#Reformat the dataframe
df.drop("Research Paper ID", axis=1, inplace=True) #Research Paper ID column is unneeded

#Rename column names to shorten their names
df = df.rename(
    columns={
        "Research Paper Intensity": "ori_i",
        "Researcher 1 Classified Emotion": "r1_e",
        "Researcher 1 Classified Intensity": "r1_i",
        "Researcher 2 Classified Emotion": "r2_e",
        "Researcher 2 Classified Intensity": "r2_i",
        "Participant 1 Classified Emotion": "p1_e",
        "Participant 1 Classified Intensity" : "p1_i",
        "Participant 2 Classified Emotion": "p2_e",
        "Participant 2 Classified Intensity": "p2_i",
        "Participant 3 Classified Emotion": "p3_e",
        "Participant 3 Classified Intensity": "p3_i",
        "Participant 4 Classified Emotion": "p4_e",
        "Participant 4 Classified Intensity": "p4_i",
        "Participant 5 Classified Emotion": "p5_e",
        "Participant 5 Classified Intensity": "p5_i",
        "Participant 6 Classified Emotion": "p6_e",
        "Participant 6 Classified Intensity": "p6_i"
    }
)

#For easy reference later
emotion_cols = ['r1_e', 'r2_e', 'p1_e', 'p2_e', 'p3_e', 'p4_e', 'p5_e', 'p6_e']
intensity_cols = ['r1_i', 'r2_i', 'p1_i', 'p2_i', 'p3_i', 'p4_i', 'p5_i', 'p6_i']

In [5]:
#Convert all annotated emotions into lowercase
for col in emotion_cols:
    df[col] = df[col].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [6]:
#df

In [7]:
df.to_csv("./02-Cleaned_Dataset.csv")

In [8]:
#Deprecated functions. 

def is_num(x):
    if type(x) == type(int()) or type(x) == type(float()): return True
    else: return False

#An override of np.nan that accepts non-numbers
def is_nan(x):
    if is_num(x): return np.isnan(x)
    else: return False

def combine_annotations(anno1, anno2):
    #Both annotations are NaN
    if is_nan(anno1) and is_nan(anno2): return np.nan
    
    #If only 1 annotation is empty, return the non-empty annotation
    elif is_nan(anno1) or is_nan(anno2):
        return (anno2 if is_nan(anno1) else anno1)
        
    else:
        return "ERROR"
        #raise Exception("Combining two annotations!")
        
#nan, nan (ok)   
#nan, emotion (ok)
#emotion, nan (ok)
#emotion, emotion
#nan, intensity (ok)
#intensity, nan (ok)
#intensity, intensity

## 2-Annotation Datasets

Reducing to only 2 annotations per tweet. Some annotation information is lost.

### Research Focussed 2-Annotation Set (anno2_r)
Annotation 1 = Researcher annotations <br>
Annotation 2 = The most common participant emotion and intensity annotations

In [9]:
#Annotation 1: the researcher annotations
anno1_e = df['r1_e'].combine_first(df['r2_e'])
anno1_i = df['r1_i'].combine_first(df['r2_i'])

#Annotation 2: the most common emotion and intensity from entire dataset (excluding r1+r2)
temp_e_df = df.copy(deep=True)
temp_e_df = temp_e_df.drop(columns=['Tweet', 'ori_i', 'r1_e', 'r2_e']+intensity_cols)
anno2_e = temp_e_df.mode(axis='columns')[0]

temp_i_df = df.copy(deep=True)
temp_i_df = df.drop(columns=['Tweet', 'ori_i', 'r1_i', 'r2_i']+emotion_cols)
anno2_i = temp_i_df.mode(axis='columns')[0]

#Rename the new combined series
anno1_e = anno1_e.rename('anno1_e')
anno1_i = anno1_i.rename('anno1_i')
anno2_e = anno2_e.rename('anno2_e')
anno2_i = anno2_i.rename('anno2_i')

#pd.options.display.max_rows = 999
#display(anno2_i)
#pd.options.display.max_rows = 10

In [10]:
anno2_r = pd.concat([df['Tweet'], df['ori_i'], anno1_e, anno1_i, anno2_e, anno2_i],
                    axis=1)
anno2_r

Unnamed: 0_level_0,Tweet,ori_i,anno1_e,anno1_i,anno2_e,anno2_i
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Depression sucks! #depression,0.958,depressed,3.0,depressed,2.0
1,Feeling worthless as always #depression,0.958,depressed,4.0,depressed,1.0
2,Feeling worthless as always,0.958,depressed,4.0,depressed,2.0
3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,5.0,depressed,4.0
4,So when I try I fail... and when I don't try.....,0.917,depressed,4.0,displeased,4.0
...,...,...,...,...,...,...
269,"Pops are joyless, soulless toys which look nea...",0.354,displeased,3.0,displeased,1.0
270,Why is it that we rejoice at a birth and griev...,0.354,depressed,3.0,displeased,2.0
271,Regret for the things we did can be tempered b...,0.354,hurt,1.0,guilty,3.0
272,In serious need of a nap,0.354,depressed,4.0,depressed,1.0


In [11]:
anno2_r.to_csv("./02-anno2_r.csv")

### Participant Focussed 2-Annotation Set (anno2_p)
Annotation 1 = Combined Annotations from Participant 1,2,3 <br>
Annotation 2 = The most common participant4,5,6 and researcher emotion and intensity annotations

In [12]:
#annotation 1: p1 + p2 + top half of p3
anno1_e = df['p1_e'].combine_first(df['p2_e'])
anno1_e = anno2_e.combine_first(df['p3_e'])
anno1_i = df['p1_i'].combine_first(df['p2_i'])
anno1_i = anno2_i.combine_first(df['p3_i'])

#annotation 2: the most common emotion and most common intensity from entire dataset (excluding p1,p2,p3)
temp_e_df = df.copy(deep=True)
temp_e_df = temp_e_df.drop(columns=['Tweet', 'ori_i', 'p1_e', 'p2_e', 'p3_e']+intensity_cols)
anno2_e = temp_e_df.mode(axis='columns')[0]

temp_i_df = df.copy(deep=True)
temp_i_df = df.drop(columns=['Tweet', 'ori_i', 'p1_i', 'p2_i', 'p3_i']+emotion_cols)
anno2_i = temp_i_df.mode(axis='columns')[0]

#Rename the new combined series
anno1_e = anno1_e.rename('anno1_e')
anno1_i = anno1_i.rename('anno1_i')
anno2_e = anno2_e.rename('anno2_e')
anno2_i = anno2_i.rename('anno2_i')

In [13]:
anno2_p = pd.concat([df['Tweet'], df['ori_i'], anno1_e, anno1_i, anno2_e, anno2_i],
                    axis=1)
anno2_p

Unnamed: 0_level_0,Tweet,ori_i,anno1_e,anno1_i,anno2_e,anno2_i
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Depression sucks! #depression,0.958,depressed,2.0,depressed,2.0
1,Feeling worthless as always #depression,0.958,depressed,1.0,depressed,1.0
2,Feeling worthless as always,0.958,depressed,2.0,depressed,2.0
3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,4.0,depressed,4.0
4,So when I try I fail... and when I don't try.....,0.917,displeased,4.0,depressed,4.0
...,...,...,...,...,...,...
269,"Pops are joyless, soulless toys which look nea...",0.354,displeased,1.0,displeased,3.0
270,Why is it that we rejoice at a birth and griev...,0.354,displeased,2.0,depressed,3.0
271,Regret for the things we did can be tempered b...,0.354,guilty,3.0,hurt,1.0
272,In serious need of a nap,0.354,depressed,1.0,depressed,4.0


In [14]:
anno2_p.to_csv("./02-anno2_p.csv")

## Triple Annotation Dataset

In [15]:
#Combine the researcher annotations together
anno1_e = df['r1_e'].combine_first(df['r2_e'])
anno1_i = df['r1_i'].combine_first(df['r2_i'])

#Combine participant 1, 2, 3 annotations together
anno2_e = df['p1_e'].combine_first(df['p2_e'])
anno2_e = anno2_e.combine_first(df['p3_e'])
anno2_i = df['p1_i'].combine_first(df['p2_i'])
anno2_i = anno2_i.combine_first(df['p3_i'])

#Combine participant 4's top half annotations with participant3's bottom half
anno3_e = df['p4_e'].combine_first(df['p3_e'])
anno3_i = df['p4_i'].combine_first(df['p3_i'])

#Rename the new combined series
anno1_e = anno1_e.rename('anno1_e')
anno1_i = anno1_i.rename('anno1_i')
anno2_e = anno2_e.rename('anno2_e')
anno2_i = anno2_i.rename('anno2_i')
anno3_e = anno3_e.rename('anno3_e')
anno3_i = anno3_i.rename('anno3_i')

In [16]:
anno3 = pd.concat([df['Tweet'], df['ori_i'], anno1_e, anno1_i, anno2_e, anno2_i, anno3_e, anno3_i],
                    axis=1)
anno3

Unnamed: 0_level_0,Tweet,ori_i,anno1_e,anno1_i,anno2_e,anno2_i,anno3_e,anno3_i
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Depression sucks! #depression,0.958,depressed,3.0,depressed,5.0,displeased,2.0
1,Feeling worthless as always #depression,0.958,depressed,4.0,hurt,4.0,displeased,1.0
2,Feeling worthless as always,0.958,depressed,4.0,hurt,4.0,displeased,3.0
3,Im think ima lay in bed all day and sulk. Life...,0.934,depressed,5.0,displeased,2.0,depressed,4.0
4,So when I try I fail... and when I don't try.....,0.917,depressed,4.0,displeased,3.0,depressed,5.0
...,...,...,...,...,...,...,...,...
269,"Pops are joyless, soulless toys which look nea...",0.354,displeased,3.0,displeased,3.0,displeased,1.0
270,Why is it that we rejoice at a birth and griev...,0.354,depressed,3.0,grief,3.0,displeased,2.0
271,Regret for the things we did can be tempered b...,0.354,hurt,1.0,guilty,3.0,guilty,4.0
272,In serious need of a nap,0.354,depressed,4.0,depressed,2.0,depressed,1.0


In [17]:
anno3.to_csv("./02-anno3.csv")