# Descriptive evaluation kappa

Comparison of demographic groups using Cohen's kappa. 

In [None]:
import pandas as pd
import matplotlib
import numpy as np
import plotly.express as px
from sklearn.metrics import cohen_kappa_score

In [None]:
# Hex colour codes for the graphs, and the sentiment classes that are
# passed as `labels` to Cohen's kappa further down.
colors = [
    "#7dc491",
    "#7dc4be",
    "#7d91c4",
    "#8b7dc4",
    "#a57dc4",
    "#6d0f70",
]
classes = [
    'positive',
    'negative',
    'neutral',
    'unclear',
]

## Data

In [None]:
# Load the tweet table used for the inter-annotator-agreement (IAA) analysis.
iaa_tweets = pd.read_excel(io="tweets.xlsx")

### Annotations

In [None]:
# Load the annotations: one row per annotated tweet, with the annotator id.
df_annotations = pd.read_excel(io="annotations.xlsx")

### Annotator demographics

In [None]:
# Load the annotator table (joined to the annotations on 'Annotator_id' below;
# presumably holds the demographic attributes per annotator).
df_annotators = pd.read_excel(io="annotators.xlsx")

In [None]:
# Attach annotator attributes and tweet-level information to every annotation.
# Both joins are inner joins, so annotations without a matching annotator or
# tweet are dropped.
df_annotations = df_annotations.merge(df_annotators, how='inner', on='Annotator_id')
df_annotations = df_annotations.merge(
    iaa_tweets,
    how='inner',
    on=['Tweet_id', 'text_clean'],
)
df_annotations

Unnamed: 0,Sentiment,text_clean,Tweet_id,Annotator_id,Tweets_total,Tweets_correct,Acc,Gender,Flemish_Provinces,Age,Age_cat,Age_cat_two,Education,Education_cat,GR,iaa
0,neutral,ik heb 13u aan een stuk gewerkt come at me,1,67,300,252,0.840000,male,VLAAMS-BRABANT,30,30 to 39,above 23,doctorate,university,neutral,0.57927
1,neutral,ik heb 13u aan een stuk gewerkt come at me,1,82,1100,864,0.785455,female,OOST-VLAANDEREN,22,20 to 29,below 23,professional bachelor,no university,neutral,0.57927
2,neutral,ik heb 13u aan een stuk gewerkt come at me,1,18,300,199,0.663333,male,OOST-VLAANDEREN,55,50 to 59,above 23,high school,no university,neutral,0.57927
3,neutral,ik heb 13u aan een stuk gewerkt come at me,1,64,1500,1192,0.794667,male,VLAAMS-BRABANT,23,20 to 29,below 23,master,university,neutral,0.57927
4,negative,ik heb 13u aan een stuk gewerkt come at me,1,38,300,216,0.720000,female,OOST-VLAANDEREN,56,50 to 59,above 23,associate degree (HBO5),no university,neutral,0.57927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,neutral,Vandaag gepresenteerd door @BeMobileHQ: #CITRU...,2600,26,200,159,0.795000,male,VLAAMS-BRABANT,23,20 to 29,below 23,academic bachelor,university,neutral,1.00000
14996,neutral,Vandaag gepresenteerd door @BeMobileHQ: #CITRU...,2600,32,100,73,0.730000,male,WEST-VLAANDEREN,32,30 to 39,above 23,professional bachelor,no university,neutral,1.00000
14997,neutral,Vandaag gepresenteerd door @BeMobileHQ: #CITRU...,2600,5,100,71,0.710000,male,VLAAMS-BRABANT,24,20 to 29,above 23,master,university,neutral,1.00000
14998,neutral,Vandaag gepresenteerd door @BeMobileHQ: #CITRU...,2600,94,100,72,0.720000,female,WEST-VLAANDEREN,36,30 to 39,above 23,professional bachelor,no university,neutral,1.00000


## Gender

### Data

In [None]:
# Build annotation-level boolean flags for the gender comparison:
#   <gender>_<sentiment>  annotation has that sentiment AND that annotator gender
#   <sentiment>           annotation has that sentiment
#   <gender>              annotation was made by an annotator of that gender
#
# Work on a copy: assigning `df_annotations` directly would only create an
# alias, so every helper column would be written into the shared frame and
# the sections of this notebook would silently depend on execution order.
df_annotations_gender = df_annotations.copy()

sentiment_by_suffix = {
    'pos': 'positive',
    'neu': 'neutral',
    'neg': 'negative',
    'unc': 'unclear',
}
sentiment_masks = {
    suffix: df_annotations_gender['Sentiment'] == sentiment
    for suffix, sentiment in sentiment_by_suffix.items()
}
gender_masks = {
    gender: df_annotations_gender['Gender'] == gender
    for gender in ('male', 'female')
}

# Combined flags, e.g. 'male_pos', 'female_unc'.
for gender, gender_mask in gender_masks.items():
    for suffix, sentiment_mask in sentiment_masks.items():
        df_annotations_gender[gender + '_' + suffix] = sentiment_mask & gender_mask

# Marginal flags per sentiment and per gender.
for suffix, sentiment_mask in sentiment_masks.items():
    df_annotations_gender[suffix] = sentiment_mask
for gender, gender_mask in gender_masks.items():
    df_annotations_gender[gender] = gender_mask

In [None]:
# Collapse the annotation-level flags to one row per tweet: keep the first
# value of the tweet-level columns and sum the boolean flags, which yields the
# number of annotations per gender / sentiment combination.
tweet_level_cols = ['text_clean', 'GR', 'iaa']
gender_count_cols = [
    'male_pos', 'male_neu', 'male_neg', 'male_unc',
    'female_pos', 'female_neu', 'female_neg', 'female_unc',
    'male', 'female',
    'pos', 'neu', 'neg', 'unc',
]
gender_aggs = {col: (col, 'first') for col in tweet_level_cols}
gender_aggs.update({col: (col, 'sum') for col in gender_count_cols})

df_tweet_gender = (
    df_annotations_gender
    .groupby('Tweet_id')
    .agg(**gender_aggs)
    .reset_index()
)

df_tweet_gender

Unnamed: 0,Tweet_id,text_clean,GR,iaa,male_pos,male_neu,male_neg,male_unc,female_pos,female_neu,female_neg,female_unc,male,female,pos,neu,neg,unc
0,1,ik heb 13u aan een stuk gewerkt come at me,neutral,0.579270,0,3,0,0,0,1,1,0,3,2,0,4,1,0
1,2,Olive Garden - SNL,neutral,0.556808,0,2,0,1,0,2,0,0,3,2,0,4,0,1
2,3,"Jozef De Kesel wordt zondag kardinaal: ""Ik kij...",positive,0.613758,2,1,0,0,2,0,0,0,3,2,4,1,0,0
3,4,mijn wifi SUCKT echt en ik kant niemeer aan,negative,1.000000,0,0,3,0,0,0,2,0,3,2,0,0,5,0
4,5,Gedraag je maar als een hoe,negative,0.464437,0,0,2,1,0,2,0,0,3,2,0,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Kheb mij gisterenavond best wel goe gehad 😌,positive,1.000000,2,0,0,0,3,0,0,0,2,3,5,0,0,0
2996,2997,Blij dat ze zich aan de charter houden! #homo...,positive,0.518596,2,0,0,0,2,0,0,1,2,3,4,0,0,1
2997,2998,Ik mis mijn fiets,negative,0.575348,0,1,1,0,0,0,3,0,2,3,0,1,4,0
2998,2999,Weer veel te laat naar bed. Dat ga ik morgenvr...,negative,1.000000,0,0,2,0,0,0,3,0,2,3,0,0,5,0


In [None]:
# For each gender group, store the name of the count column with the most
# votes (idxmax returns the first such column when several are tied).
male_vote_cols = ['male_pos', 'male_neu', 'male_neg', 'male_unc']
female_vote_cols = ['female_pos', 'female_neu', 'female_neg', 'female_unc']

df_tweet_gender['Majority_Label_Male'] = (
    df_tweet_gender.loc[:, male_vote_cols].idxmax(axis='columns')
)
df_tweet_gender['Majority_Label_Female'] = (
    df_tweet_gender.loc[:, female_vote_cols].idxmax(axis='columns')
)

In [None]:
# Resolve ties in each group's majority vote.
#
# When a group has no single majority label (two or more labels share the
# highest count), one of the *tied* labels is drawn at random. Labels with a
# lower count are never candidates: drawing from every non-zero label would
# let a minority label win (e.g. votes 2/2/1). The generator is seeded so the
# kappa reported below is reproducible across runs.
TIE_BREAK_SEED = 42


def _break_majority_ties(frame, count_columns, label_column, rng):
    """Replace tied majority labels in `frame[label_column]` by a random pick.

    Args:
        frame: per-tweet table containing `count_columns` and `label_column`.
        count_columns: names of the vote-count columns of one group.
        label_column: column holding the group's majority label (a name out
            of `count_columns`); modified in place.
        rng: `numpy.random.Generator` used for the random draw.
    """
    counts = frame[count_columns].to_numpy()
    row_max = counts.max(axis=1, keepdims=True)
    # Candidate = reaches the row maximum and received at least one vote
    # (rows without any vote of this group are left untouched).
    is_top = (counts == row_max) & (counts > 0)
    column_names = np.asarray(count_columns, dtype=object)
    labels = frame[label_column].to_numpy(dtype=object).copy()
    for row in np.flatnonzero(is_top.sum(axis=1) > 1):
        # Store the column name (not a positional code) so the column keeps
        # a single type; the label map in the next cell handles the names.
        labels[row] = rng.choice(column_names[is_top[row]])
    frame[label_column] = labels


tie_rng = np.random.default_rng(TIE_BREAK_SEED)
_break_majority_ties(
    df_tweet_gender,
    ['male_pos', 'male_neu', 'male_neg', 'male_unc'],
    'Majority_Label_Male',
    tie_rng,
)
_break_majority_ties(
    df_tweet_gender,
    ['female_pos', 'female_neu', 'female_neg', 'female_unc'],
    'Majority_Label_Female',
    tie_rng,
)




In [None]:
# Translate the stored majority labels into sentiment names. Two kinds of
# keys occur: positional codes (0-3) written by the tie-breaking step and the
# count-column names produced by idxmax.
label_map = dict(enumerate(['positive', 'neutral', 'negative', 'unclear']))
label_map.update({
    group + '_' + suffix: sentiment
    for suffix, sentiment in (
        ('neg', 'negative'),
        ('pos', 'positive'),
        ('neu', 'neutral'),
        ('unc', 'unclear'),
    )
    for group in ('male', 'female')
})

for majority_col in ('Majority_Label_Male', 'Majority_Label_Female'):
    df_tweet_gender[majority_col] = df_tweet_gender[majority_col].map(label_map)



In [None]:
# Cohen's kappa doesn't work with NaN, so only tweets that were labelled by
# both groups can be taken into consideration.
has_male_votes = df_tweet_gender['male'] != 0
has_female_votes = df_tweet_gender['female'] != 0
df_tweet_gender = df_tweet_gender[has_male_votes & has_female_votes]

In [None]:
# Display the per-tweet gender table after mapping and filtering.
df_tweet_gender

Unnamed: 0,Tweet_id,text_clean,GR,iaa,male_pos,male_neu,male_neg,male_unc,female_pos,female_neu,female_neg,female_unc,male,female,pos,neu,neg,unc,Majority_Label_Male,Majority_Label_Female
0,1,ik heb 13u aan een stuk gewerkt come at me,neutral,0.579270,0,3,0,0,0,1,1,0,3,2,0,4,1,0,neutral,neutral
1,2,Olive Garden - SNL,neutral,0.556808,0,2,0,1,0,2,0,0,3,2,0,4,0,1,neutral,neutral
2,3,"Jozef De Kesel wordt zondag kardinaal: ""Ik kij...",positive,0.613758,2,1,0,0,2,0,0,0,3,2,4,1,0,0,positive,positive
3,4,mijn wifi SUCKT echt en ik kant niemeer aan,negative,1.000000,0,0,3,0,0,0,2,0,3,2,0,0,5,0,negative,negative
4,5,Gedraag je maar als een hoe,negative,0.464437,0,0,2,1,0,2,0,0,3,2,0,2,2,1,negative,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Kheb mij gisterenavond best wel goe gehad 😌,positive,1.000000,2,0,0,0,3,0,0,0,2,3,5,0,0,0,positive,positive
2996,2997,Blij dat ze zich aan de charter houden! #homo...,positive,0.518596,2,0,0,0,2,0,0,1,2,3,4,0,0,1,positive,positive
2997,2998,Ik mis mijn fiets,negative,0.575348,0,1,1,0,0,0,3,0,2,3,0,1,4,0,neutral,negative
2998,2999,Weer veel te laat naar bed. Dat ga ik morgenvr...,negative,1.000000,0,0,2,0,0,0,3,0,2,3,0,0,5,0,negative,negative


### Cohen's Kappa
Inter-rater agreement between the two groups


In [None]:
# Agreement between the male and the female majority label per tweet.
male_majority = df_tweet_gender['Majority_Label_Male']
female_majority = df_tweet_gender['Majority_Label_Female']
cohen_kappa_score(male_majority, female_majority, labels=classes)

0.44606023682622575

## Age

### Data

In [None]:
# Build annotation-level boolean flags for the age comparison, where
# 'b' = annotator 'below 23' and 'a' = annotator 'above 23':
#   <group>_<sentiment>  annotation has that sentiment AND that age group
#   <sentiment>          annotation has that sentiment
#   <group>              annotation was made by an annotator of that age group
#
# Work on a copy: assigning `df_annotations` directly would only create an
# alias, and the generic 'b_*' / 'a_*' column names are reused by the
# education section, so the shared frame would be overwritten depending on
# the order in which cells are run.
df_tweet_age = df_annotations.copy()

sentiment_by_suffix = {
    'pos': 'positive',
    'neu': 'neutral',
    'neg': 'negative',
    'unc': 'unclear',
}
age_by_group = {'b': 'below 23', 'a': 'above 23'}

sentiment_masks = {
    suffix: df_tweet_age['Sentiment'] == sentiment
    for suffix, sentiment in sentiment_by_suffix.items()
}
age_masks = {
    group: df_tweet_age['Age_cat_two'] == category
    for group, category in age_by_group.items()
}

# Combined flags, e.g. 'b_pos', 'a_unc'.
for group, age_mask in age_masks.items():
    for suffix, sentiment_mask in sentiment_masks.items():
        df_tweet_age[group + '_' + suffix] = sentiment_mask & age_mask

# Marginal flags per sentiment and per age group.
for suffix, sentiment_mask in sentiment_masks.items():
    df_tweet_age[suffix] = sentiment_mask
for group, age_mask in age_masks.items():
    df_tweet_age[group] = age_mask

In [None]:
# Collapse the annotation-level flags to one row per tweet: keep the first
# value of the tweet-level columns and sum the boolean flags, which yields the
# number of annotations per age group / sentiment combination.
tweet_level_cols = ['text_clean', 'GR', 'iaa']
age_count_cols = [
    'b_pos', 'b_neu', 'b_neg', 'b_unc',
    'a_pos', 'a_neu', 'a_neg', 'a_unc',
    'b', 'a',
    'pos', 'neu', 'neg', 'unc',
]
age_aggs = {col: (col, 'first') for col in tweet_level_cols}
age_aggs.update({col: (col, 'sum') for col in age_count_cols})

df_tweet_age = (
    df_tweet_age
    .groupby('Tweet_id')
    .agg(**age_aggs)
    .reset_index()
)

df_tweet_age

Unnamed: 0,Tweet_id,text_clean,GR,iaa,b_pos,b_neu,b_neg,b_unc,a_pos,a_neu,a_neg,a_unc,b,a,pos,neu,neg,unc
0,1,ik heb 13u aan een stuk gewerkt come at me,neutral,0.579270,0,2,0,0,0,2,1,0,2,3,0,4,1,0
1,2,Olive Garden - SNL,neutral,0.556808,0,2,0,0,0,2,0,1,2,3,0,4,0,1
2,3,"Jozef De Kesel wordt zondag kardinaal: ""Ik kij...",positive,0.613758,1,1,0,0,3,0,0,0,2,3,4,1,0,0
3,4,mijn wifi SUCKT echt en ik kant niemeer aan,negative,1.000000,0,0,2,0,0,0,3,0,2,3,0,0,5,0
4,5,Gedraag je maar als een hoe,negative,0.464437,0,1,1,0,0,1,1,1,2,3,0,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Kheb mij gisterenavond best wel goe gehad 😌,positive,1.000000,3,0,0,0,2,0,0,0,3,2,5,0,0,0
2996,2997,Blij dat ze zich aan de charter houden! #homo...,positive,0.518596,3,0,0,0,1,0,0,1,3,2,4,0,0,1
2997,2998,Ik mis mijn fiets,negative,0.575348,0,1,2,0,0,0,2,0,3,2,0,1,4,0
2998,2999,Weer veel te laat naar bed. Dat ga ik morgenvr...,negative,1.000000,0,0,3,0,0,0,2,0,3,2,0,0,5,0


In [None]:
# For each age group, store the name of the count column with the most votes
# (idxmax returns the first such column when several are tied).
below_vote_cols = ['b_pos', 'b_neu', 'b_neg', 'b_unc']
above_vote_cols = ['a_pos', 'a_neu', 'a_neg', 'a_unc']

df_tweet_age['Majority_Label_B'] = (
    df_tweet_age.loc[:, below_vote_cols].idxmax(axis='columns')
)
df_tweet_age['Majority_Label_A'] = (
    df_tweet_age.loc[:, above_vote_cols].idxmax(axis='columns')
)

In [None]:
# Resolve ties in each age group's majority vote.
#
# When a group has no single majority label (two or more labels share the
# highest count), one of the *tied* labels is drawn at random. Labels with a
# lower count are never candidates: drawing from every non-zero label would
# let a minority label win (e.g. votes 2/2/1). The generator is seeded so the
# kappa reported below is reproducible across runs.
TIE_BREAK_SEED = 42


def _break_majority_ties(frame, count_columns, label_column, rng):
    """Replace tied majority labels in `frame[label_column]` by a random pick.

    Args:
        frame: per-tweet table containing `count_columns` and `label_column`.
        count_columns: names of the vote-count columns of one group.
        label_column: column holding the group's majority label (a name out
            of `count_columns`); modified in place.
        rng: `numpy.random.Generator` used for the random draw.
    """
    counts = frame[count_columns].to_numpy()
    row_max = counts.max(axis=1, keepdims=True)
    # Candidate = reaches the row maximum and received at least one vote
    # (rows without any vote of this group are left untouched).
    is_top = (counts == row_max) & (counts > 0)
    column_names = np.asarray(count_columns, dtype=object)
    labels = frame[label_column].to_numpy(dtype=object).copy()
    for row in np.flatnonzero(is_top.sum(axis=1) > 1):
        # Store the column name (not a positional code) so the column keeps
        # a single type; the label map in the next cell handles the names.
        labels[row] = rng.choice(column_names[is_top[row]])
    frame[label_column] = labels


tie_rng = np.random.default_rng(TIE_BREAK_SEED)
_break_majority_ties(
    df_tweet_age,
    ['b_pos', 'b_neu', 'b_neg', 'b_unc'],
    'Majority_Label_B',
    tie_rng,
)
_break_majority_ties(
    df_tweet_age,
    ['a_pos', 'a_neu', 'a_neg', 'a_unc'],
    'Majority_Label_A',
    tie_rng,
)




In [None]:
# Translate the stored majority labels into sentiment names. Two kinds of
# keys occur: positional codes (0-3) written by the tie-breaking step and the
# count-column names produced by idxmax.
label_map = dict(enumerate(['positive', 'neutral', 'negative', 'unclear']))
label_map.update({
    group + '_' + suffix: sentiment
    for suffix, sentiment in (
        ('neg', 'negative'),
        ('pos', 'positive'),
        ('neu', 'neutral'),
        ('unc', 'unclear'),
    )
    for group in ('b', 'a')
})

for majority_col in ('Majority_Label_B', 'Majority_Label_A'):
    df_tweet_age[majority_col] = df_tweet_age[majority_col].map(label_map)



In [None]:
# Cohen's kappa doesn't work with NaN, so only tweets that were labelled by
# both groups can be taken into consideration.
has_below_votes = df_tweet_age['b'] != 0
has_above_votes = df_tweet_age['a'] != 0
df_tweet_age = df_tweet_age[has_below_votes & has_above_votes]

In [None]:
# Display the per-tweet age table after mapping and filtering.
df_tweet_age

Unnamed: 0,Tweet_id,text_clean,GR,iaa,b_pos,b_neu,b_neg,b_unc,a_pos,a_neu,a_neg,a_unc,b,a,pos,neu,neg,unc,Majority_Label_B,Majority_Label_A
0,1,ik heb 13u aan een stuk gewerkt come at me,neutral,0.579270,0,2,0,0,0,2,1,0,2,3,0,4,1,0,neutral,neutral
1,2,Olive Garden - SNL,neutral,0.556808,0,2,0,0,0,2,0,1,2,3,0,4,0,1,neutral,neutral
2,3,"Jozef De Kesel wordt zondag kardinaal: ""Ik kij...",positive,0.613758,1,1,0,0,3,0,0,0,2,3,4,1,0,0,positive,positive
3,4,mijn wifi SUCKT echt en ik kant niemeer aan,negative,1.000000,0,0,2,0,0,0,3,0,2,3,0,0,5,0,negative,negative
4,5,Gedraag je maar als een hoe,negative,0.464437,0,1,1,0,0,1,1,1,2,3,0,2,2,1,neutral,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Kheb mij gisterenavond best wel goe gehad 😌,positive,1.000000,3,0,0,0,2,0,0,0,3,2,5,0,0,0,positive,positive
2996,2997,Blij dat ze zich aan de charter houden! #homo...,positive,0.518596,3,0,0,0,1,0,0,1,3,2,4,0,0,1,positive,unclear
2997,2998,Ik mis mijn fiets,negative,0.575348,0,1,2,0,0,0,2,0,3,2,0,1,4,0,negative,negative
2998,2999,Weer veel te laat naar bed. Dat ga ik morgenvr...,negative,1.000000,0,0,3,0,0,0,2,0,3,2,0,0,5,0,negative,negative


### Cohen's Kappa

In [None]:
# Agreement between the 'below 23' and the 'above 23' majority label per tweet.
below_majority = df_tweet_age['Majority_Label_B']
above_majority = df_tweet_age['Majority_Label_A']
cohen_kappa_score(below_majority, above_majority, labels=classes)

0.4282595682201964

## Education

### Data


In [None]:
# Build annotation-level boolean flags for the education comparison, where
# 'b' = annotator with 'no university' and 'a' = annotator with 'university':
#   <group>_<sentiment>  annotation has that sentiment AND that education group
#   <sentiment>          annotation has that sentiment
#   <group>              annotation was made by an annotator of that group
#
# Work on a copy: assigning `df_annotations` directly would only create an
# alias, and the generic 'b_*' / 'a_*' column names are also used by the age
# section, so the shared frame would be overwritten depending on the order in
# which cells are run.
df_tweet_edu = df_annotations.copy()

sentiment_by_suffix = {
    'pos': 'positive',
    'neu': 'neutral',
    'neg': 'negative',
    'unc': 'unclear',
}
education_by_group = {'b': 'no university', 'a': 'university'}

sentiment_masks = {
    suffix: df_tweet_edu['Sentiment'] == sentiment
    for suffix, sentiment in sentiment_by_suffix.items()
}
education_masks = {
    group: df_tweet_edu['Education_cat'] == category
    for group, category in education_by_group.items()
}

# Combined flags, e.g. 'b_pos', 'a_unc'.
for group, education_mask in education_masks.items():
    for suffix, sentiment_mask in sentiment_masks.items():
        df_tweet_edu[group + '_' + suffix] = sentiment_mask & education_mask

# Marginal flags per sentiment and per education group.
for suffix, sentiment_mask in sentiment_masks.items():
    df_tweet_edu[suffix] = sentiment_mask
for group, education_mask in education_masks.items():
    df_tweet_edu[group] = education_mask

In [None]:
# Collapse the annotation-level flags to one row per tweet: keep the first
# value of the tweet-level columns and sum the boolean flags, which yields the
# number of annotations per education group / sentiment combination.
tweet_level_cols = ['text_clean', 'GR', 'iaa']
edu_count_cols = [
    'b_pos', 'b_neu', 'b_neg', 'b_unc',
    'a_pos', 'a_neu', 'a_neg', 'a_unc',
    'b', 'a',
    'pos', 'neu', 'neg', 'unc',
]
edu_aggs = {col: (col, 'first') for col in tweet_level_cols}
edu_aggs.update({col: (col, 'sum') for col in edu_count_cols})

df_tweet_edu = (
    df_tweet_edu
    .groupby('Tweet_id')
    .agg(**edu_aggs)
    .reset_index()
)

df_tweet_edu

Unnamed: 0,Tweet_id,text_clean,GR,iaa,b_pos,b_neu,b_neg,b_unc,a_pos,a_neu,a_neg,a_unc,b,a,pos,neu,neg,unc
0,1,ik heb 13u aan een stuk gewerkt come at me,neutral,0.579270,0,2,1,0,0,2,0,0,3,2,0,4,1,0
1,2,Olive Garden - SNL,neutral,0.556808,0,2,0,1,0,2,0,0,3,2,0,4,0,1
2,3,"Jozef De Kesel wordt zondag kardinaal: ""Ik kij...",positive,0.613758,3,0,0,0,1,1,0,0,3,2,4,1,0,0
3,4,mijn wifi SUCKT echt en ik kant niemeer aan,negative,1.000000,0,0,3,0,0,0,2,0,3,2,0,0,5,0
4,5,Gedraag je maar als een hoe,negative,0.464437,0,2,0,1,0,0,2,0,3,2,0,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Kheb mij gisterenavond best wel goe gehad 😌,positive,1.000000,2,0,0,0,3,0,0,0,2,3,5,0,0,0
2996,2997,Blij dat ze zich aan de charter houden! #homo...,positive,0.518596,1,0,0,1,3,0,0,0,2,3,4,0,0,1
2997,2998,Ik mis mijn fiets,negative,0.575348,0,0,2,0,0,1,2,0,2,3,0,1,4,0
2998,2999,Weer veel te laat naar bed. Dat ga ik morgenvr...,negative,1.000000,0,0,2,0,0,0,3,0,2,3,0,0,5,0


In [None]:
# Overall majority label over all annotators of a tweet, stored as the name
# of the count column ('pos', 'neu', 'neg' or 'unc') with the most votes.
overall_vote_cols = ['pos', 'neu', 'neg', 'unc']
df_tweet_edu['Majority_Label'] = (
    df_tweet_edu.loc[:, overall_vote_cols].idxmax(axis='columns')
)

In [None]:
# For each education group, store the name of the count column with the most
# votes (idxmax returns the first such column when several are tied).
no_univ_vote_cols = ['b_pos', 'b_neu', 'b_neg', 'b_unc']
univ_vote_cols = ['a_pos', 'a_neu', 'a_neg', 'a_unc']

df_tweet_edu['Majority_Label_B'] = (
    df_tweet_edu.loc[:, no_univ_vote_cols].idxmax(axis='columns')
)
df_tweet_edu['Majority_Label_A'] = (
    df_tweet_edu.loc[:, univ_vote_cols].idxmax(axis='columns')
)

In [None]:
# Resolve ties in each education group's majority vote.
#
# When a group has no single majority label (two or more labels share the
# highest count), one of the *tied* labels is drawn at random. Labels with a
# lower count are never candidates: drawing from every non-zero label would
# let a minority label win (e.g. votes 2/2/1). The generator is seeded so the
# kappa reported below is reproducible across runs.
TIE_BREAK_SEED = 42


def _break_majority_ties(frame, count_columns, label_column, rng):
    """Replace tied majority labels in `frame[label_column]` by a random pick.

    Args:
        frame: per-tweet table containing `count_columns` and `label_column`.
        count_columns: names of the vote-count columns of one group.
        label_column: column holding the group's majority label (a name out
            of `count_columns`); modified in place.
        rng: `numpy.random.Generator` used for the random draw.
    """
    counts = frame[count_columns].to_numpy()
    row_max = counts.max(axis=1, keepdims=True)
    # Candidate = reaches the row maximum and received at least one vote
    # (rows without any vote of this group are left untouched).
    is_top = (counts == row_max) & (counts > 0)
    column_names = np.asarray(count_columns, dtype=object)
    labels = frame[label_column].to_numpy(dtype=object).copy()
    for row in np.flatnonzero(is_top.sum(axis=1) > 1):
        # Store the column name (not a positional code) so the column keeps
        # a single type; the label map in the next cell handles the names.
        labels[row] = rng.choice(column_names[is_top[row]])
    frame[label_column] = labels


tie_rng = np.random.default_rng(TIE_BREAK_SEED)
_break_majority_ties(
    df_tweet_edu,
    ['b_pos', 'b_neu', 'b_neg', 'b_unc'],
    'Majority_Label_B',
    tie_rng,
)
_break_majority_ties(
    df_tweet_edu,
    ['a_pos', 'a_neu', 'a_neg', 'a_unc'],
    'Majority_Label_A',
    tie_rng,
)




In [None]:
# Translate the stored majority labels into sentiment names. Two kinds of
# keys occur: positional codes (0-3) written by the tie-breaking step and the
# count-column names produced by idxmax.
label_map = dict(enumerate(['positive', 'neutral', 'negative', 'unclear']))
label_map.update({
    group + '_' + suffix: sentiment
    for suffix, sentiment in (
        ('neg', 'negative'),
        ('pos', 'positive'),
        ('neu', 'neutral'),
        ('unc', 'unclear'),
    )
    for group in ('b', 'a')
})

for majority_col in ('Majority_Label_B', 'Majority_Label_A'):
    df_tweet_edu[majority_col] = df_tweet_edu[majority_col].map(label_map)



In [None]:
# Cohen's kappa doesn't work with NaN, so only tweets that were labelled by
# both groups can be taken into consideration.
has_no_univ_votes = df_tweet_edu['b'] != 0
has_univ_votes = df_tweet_edu['a'] != 0
df_tweet_edu = df_tweet_edu[has_no_univ_votes & has_univ_votes]

In [None]:
# Display the per-tweet education table after mapping and filtering.
df_tweet_edu

Unnamed: 0,Tweet_id,text_clean,GR,iaa,b_pos,b_neu,b_neg,b_unc,a_pos,a_neu,...,a_unc,b,a,pos,neu,neg,unc,Majority_Label,Majority_Label_B,Majority_Label_A
0,1,ik heb 13u aan een stuk gewerkt come at me,neutral,0.579270,0,2,1,0,0,2,...,0,3,2,0,4,1,0,neu,neutral,neutral
1,2,Olive Garden - SNL,neutral,0.556808,0,2,0,1,0,2,...,0,3,2,0,4,0,1,neu,neutral,neutral
2,3,"Jozef De Kesel wordt zondag kardinaal: ""Ik kij...",positive,0.613758,3,0,0,0,1,1,...,0,3,2,4,1,0,0,pos,positive,neutral
3,4,mijn wifi SUCKT echt en ik kant niemeer aan,negative,1.000000,0,0,3,0,0,0,...,0,3,2,0,0,5,0,neg,negative,negative
4,5,Gedraag je maar als een hoe,negative,0.464437,0,2,0,1,0,0,...,0,3,2,0,2,2,1,neu,neutral,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Kheb mij gisterenavond best wel goe gehad 😌,positive,1.000000,2,0,0,0,3,0,...,0,2,3,5,0,0,0,pos,positive,positive
2996,2997,Blij dat ze zich aan de charter houden! #homo...,positive,0.518596,1,0,0,1,3,0,...,0,2,3,4,0,0,1,pos,positive,positive
2997,2998,Ik mis mijn fiets,negative,0.575348,0,0,2,0,0,1,...,0,2,3,0,1,4,0,neg,negative,negative
2998,2999,Weer veel te laat naar bed. Dat ga ik morgenvr...,negative,1.000000,0,0,2,0,0,0,...,0,2,3,0,0,5,0,neg,negative,negative


### Cohen's Kappa

In [None]:
# Agreement between the 'no university' and the 'university' majority label
# per tweet.
no_univ_majority = df_tweet_edu['Majority_Label_B']
univ_majority = df_tweet_edu['Majority_Label_A']
cohen_kappa_score(no_univ_majority, univ_majority, labels=classes)

0.4234407361954561