# The purpose of this NB is to assess the interannotator agreement for results provided by annotators. Since we expect to have multiple annotators we'll use Fleiss' kappa.

> ### Fleiss' kappa = "It can be interpreted as expressing the extent to which the observed amount of agreement among raters exceeds what would be expected if all raters made their ratings completely randomly."

> https://www.wikiwand.com/en/Fleiss%27_kappa

In [43]:
import pandas as pd
from nltk.metrics.agreement import AnnotationTask

## <span style="color:magenta"> Enter the name of the results file below </span>

In [44]:
RESULTS_FILE = "test.csv" #enter the name of a given results file

infile = pd.read_csv('./data/annotation_results/'+RESULTS_FILE, index_col=0)

In [45]:
infile.shape

(50, 7)

In [46]:
infile.isna().sum() #check if there are any missing values

Annotator_Aaron       0
Annotator_Andrew      0
Annotator_Bryan       0
Annotator_Aaron.1     0
Annotator_Andrew.1    0
Annotator_Bryan.1     0
Text                  0
dtype: int64

In [47]:
infile_sub = infile.loc[:, infile.columns.str.startswith('Annotat')]
# infile_sub = infile.loc[:,['Annotator_Bryan', 'Annotator_Aaron', 'Annotator_Andrew']]

In [48]:
infile_sub.clip(lower=0, upper=1, inplace=True) #for our current purposes, don't need counts over 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._clip_with_scalar(lower, upper, inplace=inplace)


In [49]:
#convert data to 3-tuples of (<coder>, <item>, <label>)
infile_tuples = [(col, index, infile_sub[col].iloc[index]) for col in infile_sub.columns for index in infile_sub.index]

## Calculate Fleiss kappa

### Fleiss kappa for all annotators

In [198]:
assess_infile = AnnotationTask(data = infile_tuples)
print(f"Overall Fleiss kappa: {round(assess_infile.multi_kappa(),3)}")

Overall Fleiss kappa: 0.714


### Fleiss kappa for groups of annotators

In [199]:
# us = infile_sub[['Annotator_Bryan', 'Annotator_Aaron', 'Annotator_Andrew']]
# them = infile_sub[['Annotator_Mom', 'Annotator_Kelly', 'Annotator_Abigail']]

# them_tuples = [(col, index, them[col].iloc[index]) for col in them.columns for index in them.index]
# us_tuples = [(col, index, us[col].iloc[index]) for col in us.columns for index in us.index]

# assess_them = AnnotationTask(data = them_tuples)
# print(f"Their kappa: {round(assess_them.multi_kappa(),3)}")

# assess_us = AnnotationTask(data = us_tuples)
# print(f"Our kappa: {round(assess_us.multi_kappa(),3)}")

## Get counts of all annotators agreeing, half or more annotators agreeing

In [200]:
NUM_ANNOTATORS = len(infile_sub.columns)

HALF_POSITIVE = round(NUM_ANNOTATORS/2)

print(f"NUM_ANNOTATORS: {NUM_ANNOTATORS}\nHALF_POSITIVE: {HALF_POSITIVE}")

NUM_ANNOTATORS: 6
HALF_POSITIVE: 3


In [201]:
all_agree = infile[(infile.sum(axis=1) == NUM_ANNOTATORS)|(infile.sum(axis=1) == 0)]

len(all_agree)

41

In [202]:
half_plus_agree = infile[(infile.sum(axis=1) >= HALF_POSITIVE)&(infile.sum(axis=1) < NUM_ANNOTATORS)]

len(half_plus_agree)

2

## Get counts where a label = one

In [203]:
pd.set_option('display.max_colwidth', None)

In [204]:
annotators = list(infile.columns[infile.columns.str.startswith("Annotat")])
for annotator in annotators:
    print(f"{annotator}| Positive label count: {infile[annotator].sum()}")

Annotator_Aaron| Positive label count: 12
Annotator_Andrew| Positive label count: 9
Annotator_Bryan| Positive label count: 11
Annotator_Aaron.1| Positive label count: 12
Annotator_Andrew.1| Positive label count: 9
Annotator_Bryan.1| Positive label count: 11


In [205]:
loaners = infile[(infile.sum(axis=1) == 1)|(infile.sum(axis=1) == (NUM_ANNOTATORS-1))] #count of where only one annotator assigned a one

print(f"Number of loaners: {len(loaners)}")
loaners.Text if "Text" in loaners.columns else print("No text provided")

Number of loaners: 0


Series([], Name: Text, dtype: object)

In [206]:
half_fewer_ones = infile[(infile.sum(axis = 1) <= HALF_POSITIVE)&(infile.sum(axis = 1) > 0)] #count of where half of annotators assigned a one

print(f"Number where half or fewer annotators assigned positive labels: {len(half_fewer_ones.index)}")
# half_ones.Text if "Text" in half_ones.columns else print("No text provided")

Number where half or fewer annotators assigned positive labels: 7


In [207]:
half_plus_ones = infile[(infile.sum(axis = 1) > HALF_POSITIVE)&(infile.sum(axis = 1) < NUM_ANNOTATORS)] #count of where more than half of annotators assigned a one

print(f"Number where more than half of all annotators assigned positive labels: {len(half_plus_ones.index)}")
# half_plus_ones.Text if "Text" in half_plus_ones.columns else print("No text provided")

Number where more than half of all annotators assigned positive labels: 2


## Count of any annotator awarding a one

In [208]:
len(infile[infile.sum(axis=1) >= 1])

16

## Analyzing results for "us vs. them"

In [209]:
# us["sum"] = us.sum(axis = 1)
# them["sum"] = them.sum(axis = 1)

In [210]:
# us_them = pd.merge(us, them, left_index = True, right_index = True, suffixes = ("_us", "_them"))
# us_them["abs_diff"] = abs(us_them["sum_us"]-us_them["sum_them"])

# signif_indices = list(us_them[us_them["abs_diff"]!=0].index) #list of indices with signif disagreement between us & them

# infile.iloc[signif_indices]

## Manual calculation of Fleiss kappa
> ### Intended as a verification of the calculation of multi_kappa() above. Based off: https://www.wikiwand.com/en/Fleiss%27_kappa#/Worked_example

In [211]:
# all_labels = [infile[col].iloc[index] for col in infile.columns for index in infile.index]

In [212]:
# n_var = len(infile.columns) #number of raters
# N_var = infile.shape[0] #number of observations/records
# k_var = len(set(all_labels)) #number of categories/labels

In [213]:
# #calculate P_j
# P_j1 = (1/(N_var*n_var))*sum(infile.sum(axis=1))
# P_j0 = (1/(N_var*n_var))*sum(n_var - infile.sum(axis=1))

# # P_j0, P_j1

In [214]:
# #calculate P_i

# P_i = 1/(n_var*(n_var-1))*(((infile.sum(axis=1)**2)+((n_var - infile.sum(axis=1))**2))-n_var)

# # P_i

In [215]:
# #calculate P_bar
# P_bar = (1/N_var)*sum(P_i)

# # P_bar

In [216]:
# #caculate P_bar_e

# P_bar_e = (P_j0**2)+(P_j1**2)

# # P_bar_e

In [217]:
# fleiss_kappa = (P_bar - P_bar_e)/(1-P_bar_e)

# fleiss_kappa #close-ish to the multi_kappa() calculation above

# Calculate agreement across multiple files

In [36]:
import os
import pandas as pd

In [108]:
PATH = "./data/mturk_results/"

In [109]:
filenames = []
for root, dirs, files in os.walk(PATH):
    filenames.append(files)

In [110]:
filtered_files = filenames[0]

In [227]:
these_results = []
for i in filtered_files:
    infile = pd.read_csv(PATH+i, index_col=0)
    infile["checker"] = infile.apply(lambda x: 1 if x["Answer.yes.1"]==x["Answer.no.0"] else 0, axis= 1)
    subset_infile = infile[infile["checker"]==0]
    
    find = subset_infile.groupby("HITId").mean()[["Answer.yes.1","Answer.no.0"]]
    find["agree"] = find.apply(lambda x: max(x), axis=1)
    agreement = find["agree"].mean()
    
    these_results.append((i,round(agreement,3)))
    
#     print(i, round(agreement,3))

In [234]:
agree_df = pd.DataFrame(these_results, columns = ["File","Proportion of agreement"])
agree_df #average agreement is 0.85

Unnamed: 0,File,Proportion of agreement
0,will_mturk_results_3_filtered.csv,0.857
1,get_mturk_results_1_filtered.csv,0.844
2,get_mturk_results_3_filtered.csv,0.783
3,will_mturk_results_5_filtered.csv,0.843
4,go_mturk_results_1_filtered.csv,0.899
5,get_mturk_results_2_filtered.csv,0.955
6,pos_mturk_results_2_filtered.csv,0.933
7,will_mturk_results_1_filtered.csv,0.847
8,go_mturk_results_2_filtered.csv,0.795
9,go_mturk_results_4_filtered.csv,0.827


In [111]:
this = pd.read_csv(PATH+filtered_files[0], index_col=0)

In [185]:
this["checker"] = this.apply(lambda x: 1 if x["Answer.yes.1"]==x["Answer.no.0"] else 0, axis= 1)

In [205]:
this_subset = this[this["checker"]==0]

In [206]:
chair = this_subset.groupby("HITId").mean()[["Answer.yes.1","Answer.no.0"]]

In [211]:
chair["agree"] = chair.apply(lambda x: max(x), axis=1)

In [213]:
chair["agree"].mean()

0.8566666666666666

In [169]:
# check = this[["HITId","WorkerId"]].groupby('HITId').count()

In [168]:
# for i in check["WorkerId"].unique():
#     if i > 1:
#         these_HITs = list(check[check["WorkerId"]==i].index)
#         print("*"*8) 
        
#         #subset larger df based on these_HITs
#         this_subset = this[this["HITId"].isin(these_HITs)]
        
#         these_tuples = [x for x in this_subset[["WorkerId","HITId","Answer.yes.1"]].to_records(index=False)] # (annotator_ID, item_ID, label)
#         assess_infile = AnnotationTask(data = these_tuples)
#         print(f"Overall Fleiss kappa: {round(assess_infile.multi_kappa(),3)}")
        
        

In [166]:
# for i in filtered_files:
#     infile = pd.read_csv(PATH+i, index_col=0)
#     infile["Answer.yes.1"] = [1 if x else 0 for x in infile["Answer.yes.1"]]
    
#     # get subsets
    
#     ## get tuples & calculate kappa
    
    
#     these_tuples = [x for x in infile[["WorkerId","HITId","Answer.yes.1"]].to_records(index=False)] # (annotator_ID, item_ID, label)
#     assess_infile = AnnotationTask(data = these_tuples)
#     print(f"{i} - Overall Fleiss kappa: {round(assess_infile.multi_kappa(),3)}")

In [202]:
# for i in filtered_files:
#     infile = pd.read_csv(PATH+i, index_col=0)
#     infile["checker"] = infile.apply(lambda x: 1 if x["Answer.yes.1"]==x["Answer.no.0"] else 0, axis= 1)
    
#     print(i,"-", infile[infile["checker"]==1].shape[0])