In [1]:
import numpy as np
import pandas as pd
from statsmodels.stats.inter_rater import fleiss_kappa
from statsmodels.stats import inter_rater as irr

In [9]:
def _count_and_share(values, categories):
    """Format the absolute and relative frequency of each category.

    Args:
        values: pandas Series whose values are tallied with ``value_counts``.
        categories: the category labels to report, in the desired order.

    Returns:
        dict mapping every label in ``categories`` to a string of the form
        ``'<count> (<share>%)'`` with two decimals each. Labels that do not
        occur in ``values`` are reported as ``'0.00 (0.00%)'``.
    """
    counts = {category: 0 for category in categories}
    counts.update(values.value_counts(normalize=False).to_dict())
    shares = {category: 0.0 for category in categories}
    shares.update(values.value_counts(normalize=True).to_dict())
    return {
        category: f'{counts[category]:.2f} ({shares[category] * 100:.2f}%)'
        for category in categories
    }


def print_question_statistics(annotations):
    """Print agreement statistics for every annotation question.

    For each question in ``cat1``, ``cat2``, ``cat3``, ``compoundclaim`` and
    ``ironic`` the function reads three columns of ``annotations``:

    * ``<question>_mean_answer``: must only contain 0, 0.25, 0.5, 0.75 or 1;
      any other value raises ``KeyError``.
    * ``<question>_final_answer``: rows equal to 0.5 are left out of the
      consolidated-answer table (0.5 appears to mark a high-disagreement
      label, judging by the printed headings).
    * ``<question>_answers``: whitespace-separated integer ratings, one per
      rater, used for Fleiss' kappa.

    Args:
        annotations: pandas DataFrame holding the columns listed above.

    Returns:
        None. All results are written to stdout.
    """
    questions = ['cat1', 'cat2', 'cat3', 'compoundclaim', 'ironic']
    level_names = ['no', 'little', 'high']
    mean_values = [0, 0.25, 0.5, 0.75, 1]
    final_values = [0, 1]
    # Distance of the mean answer from a unanimous vote (0 or 1) -> level.
    level_by_mean = {0: 'no', 1: 'no', 0.25: 'little', 0.75: 'little', 0.5: 'high'}

    disagreement_df = pd.DataFrame(columns=questions, index=['no', 'yes'])
    disagreement_level_df = pd.DataFrame(columns=questions, index=level_names)
    mean_answers_df = pd.DataFrame(columns=questions, index=mean_values)
    consolidated_answers_df = pd.DataFrame(columns=questions, index=final_values)
    fleiss_df = pd.DataFrame(columns=questions, index=['fleiss_kappa'])

    category_finals = annotations[['cat1_final_answer', 'cat2_final_answer', 'cat3_final_answer']]
    all_no = int((category_finals == 0.0).all(axis=1).sum())
    one_yes = int((category_finals == 1.0).any(axis=1).sum())
    # Whatever is neither "all no" nor "at least one yes".
    all_no_or_hd = len(annotations) - all_no - one_yes

    for question in questions:
        mean_answers = annotations[f'{question}_mean_answer']
        disagreement_level = mean_answers.apply(lambda x: level_by_mean[x])
        final_answers = annotations[f'{question}_final_answer']
        consolidated_answer = final_answers[final_answers != 0.5]

        # NOTE: cells are written with a single ``.loc[row, column]`` call.
        # The chained form ``df.loc[row][column] = value`` assigns into a
        # temporary object and is a no-op under pandas copy-on-write.
        disagreement_share = (disagreement_level != 'no').mean() * 100
        disagreement_df.loc['yes', question] = f'{disagreement_share:.2f}'
        disagreement_df.loc['no', question] = f'{100 - disagreement_share:.2f}'

        for level, cell in _count_and_share(disagreement_level, level_names).items():
            disagreement_level_df.loc[level, question] = cell

        for value, cell in _count_and_share(mean_answers, mean_values).items():
            mean_answers_df.loc[value, question] = cell

        for value, cell in _count_and_share(consolidated_answer, final_values).items():
            consolidated_answers_df.loc[value, question] = cell

        # One row per tweet, one column per rater.
        ratings = np.array([
            [int(rating) for rating in answer.split()]
            for answer in annotations[f'{question}_answers']
        ])
        rating_table, _ = irr.aggregate_raters(ratings)  # (data, categories)
        fleiss_df.loc['fleiss_kappa', question] = f'{fleiss_kappa(rating_table, method="fleiss"):.2f}'

    report = [
        ('number of tweets:', len(annotations)),
        ('number of negative tweets (not related to science):', all_no),
        ('number of tweets without any positive label (only negative or high disagreement labels):', all_no_or_hd),
        ('number of positive tweets (related to science):', one_yes),
        ('disagreement (no/ yes) statistics', disagreement_df),
        ('disagreement (no/ little/ high) statistics', disagreement_level_df),
        ('mean answers statistics', mean_answers_df),
        ('consolidated answers statistics', consolidated_answers_df),
        ('fleiss kappa statistics', fleiss_df),
    ]
    for heading, value in report:
        print(heading)
        print(value)
        print()

In [10]:
# Load the annotation table; the file is tab-separated.
annotations = pd.read_table('annotations/annotations.tsv')

In [11]:
# Statistics for the rows whose `round` column equals 1.
round1_annotations = annotations.loc[annotations['round'].eq(1)]
print_question_statistics(annotations=round1_annotations)

number of tweets:
1046

number of negative tweets (not related to science):
751

number of tweets without any positive label (only negative or high disagreement labels):
84

number of positive tweets (related to science):
211

disagreement (no/ yes) statistics
      cat1   cat2   cat3 compoundclaim ironic
no   72.37  86.23  82.03         86.33  95.22
yes  27.63  13.77  17.97         13.67   4.78

disagreement (no/ little/ high) statistics
                   cat1             cat2             cat3    compoundclaim  \
no      757.00 (72.37%)  902.00 (86.23%)  858.00 (82.03%)  903.00 (86.33%)   
little  216.00 (20.65%)    97.00 (9.27%)  142.00 (13.58%)  115.00 (10.99%)   
high      73.00 (6.98%)    47.00 (4.49%)    46.00 (4.40%)    28.00 (2.68%)   

                 ironic  
no      996.00 (95.22%)  
little    40.00 (3.82%)  
high      10.00 (0.96%)  

mean answers statistics
                 cat1             cat2             cat3    compoundclaim  \
0.00  653.00 (62.43%)  871.00 (83.27%) 

In [12]:
# Statistics for the rows whose `round` column equals 2.
round2_annotations = annotations.loc[annotations['round'].eq(2)]
print_question_statistics(annotations=round2_annotations)

number of tweets:
215

number of negative tweets (not related to science):
24

number of tweets without any positive label (only negative or high disagreement labels):
0

number of positive tweets (related to science):
191

disagreement (no/ yes) statistics
      cat1   cat2   cat3 compoundclaim ironic
no   63.72  60.47  62.33         72.09  96.74
yes  36.28  39.53  37.67         27.91   3.26

disagreement (no/ little/ high) statistics
                   cat1             cat2             cat3    compoundclaim  \
no      137.00 (63.72%)  130.00 (60.47%)  134.00 (62.33%)  155.00 (72.09%)   
little   58.00 (26.98%)   55.00 (25.58%)   56.00 (26.05%)   42.00 (19.53%)   
high      20.00 (9.30%)   30.00 (13.95%)   25.00 (11.63%)    18.00 (8.37%)   

                 ironic  
no      208.00 (96.74%)  
little     5.00 (2.33%)  
high       2.00 (0.93%)  

mean answers statistics
                cat1            cat2            cat3    compoundclaim  \
0.00  46.00 (21.40%)  70.00 (32.56%)  55.00 (

In [13]:
# Statistics over the complete annotation table (all rounds together).
print_question_statistics(annotations=annotations)

number of tweets:
1261

number of negative tweets (not related to science):
775

number of tweets without any positive label (only negative or high disagreement labels):
84

number of positive tweets (related to science):
402

disagreement (no/ yes) statistics
      cat1   cat2   cat3 compoundclaim ironic
no   70.90  81.84  78.67         83.90  95.48
yes  29.10  18.16  21.33         16.10   4.52

disagreement (no/ little/ high) statistics
                   cat1              cat2             cat3     compoundclaim  \
no      894.00 (70.90%)  1032.00 (81.84%)  992.00 (78.67%)  1058.00 (83.90%)   
little  274.00 (21.73%)   152.00 (12.05%)  198.00 (15.70%)   157.00 (12.45%)   
high      93.00 (7.38%)     77.00 (6.11%)    71.00 (5.63%)     46.00 (3.65%)   

                  ironic  
no      1204.00 (95.48%)  
little     45.00 (3.57%)  
high       12.00 (0.95%)  

mean answers statistics
                 cat1             cat2             cat3     compoundclaim  \
0.00  699.00 (55.43%)  941