In [1]:
import pandas as pd
import numpy as np
import krippendorff
from scipy.stats import spearmanr
from itertools import combinations
from collections import defaultdict

Load uses, instances, and judgments of a specific round

In [17]:
def load_uses(filename='data/uses.tsv', sep='\t'):
    """Read a delimited uses file into a DataFrame of strings.

    The first line provides the column names and every following line
    becomes one row. Trailing whitespace is stripped from each line before
    it is split on ``sep``; fields beyond the header length are ignored.

    Args:
        filename: path of the file to read.
        sep: field delimiter.

    Returns:
        pandas.DataFrame with one row per data line.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        header = handle.readline().rstrip().split(sep)
        records = [dict(zip(header, raw.rstrip().split(sep))) for raw in handle]

    return pd.DataFrame(records)

def load_instances(filename, dirname='rounds', sep='\t'):
    """Read an instances file and split its ``dataIDs`` field into two columns.

    The first line provides the column names. Each data line must contain a
    ``dataIDs`` field holding exactly two comma-separated identifiers, which
    are exposed as the extra columns ``dataID1`` and ``dataID2``.

    Args:
        filename: name of the file inside ``dirname``.
        dirname: directory that contains the file.
        sep: field delimiter, used for the header and for the data lines.

    Returns:
        pandas.DataFrame with one row per instance.

    Raises:
        KeyError: if a data line has no ``dataIDs`` field.
        ValueError: if ``dataIDs`` does not hold exactly two identifiers.
    """
    records = list()
    with open(f'{dirname}/{filename}', mode='r', encoding='utf-8') as f:
        columns = f.readline().rstrip().split(sep)
        for line in f:
            # Remove only the line terminator: slicing off the last character
            # would damage a final line that has no trailing newline.
            line = line.rstrip('\n')
            if not line.strip():
                # blank lines carry no instance
                continue
            record = dict(zip(columns, line.split(sep)))
            record['dataID1'], record['dataID2'] = record['dataIDs'].split(',')
            records.append(record)

    return pd.DataFrame(records)

def load_judgments(filename, dirname='judgments', sep='\t', annotators=('shur', 'Nisha', 'AndreaMariaC')):
    """Read a judgments file whose records may contain embedded newlines.

    A record is considered finished only where an annotator name is directly
    followed by a newline; every other newline is treated as part of a field
    and replaced by ``--``. This is a workaround kept until the exported
    files stop containing raw newlines inside fields.

    Args:
        filename: name of the file inside ``dirname``.
        dirname: directory that contains the file.
        sep: field delimiter.
        annotators: annotator names that terminate a record.

    Returns:
        pandas.DataFrame with one row per record. Missing fields are filled
        with the string ``'-1'`` and ``label`` is converted to int, where
        ``-1`` stands for "cannot decide" (label ``-`` or missing label).
    """
    record_marker = '@@@'

    with open(f'{dirname}/{filename}', mode='r', encoding='utf-8') as f:
        columns = f.readline().rstrip().split(sep)
        content = f.read()

    # Mark the real record boundaries first, then neutralise the newlines
    # that are left over inside fields.
    for annotator in annotators:
        content = content.replace(f'{annotator}\n', f'{annotator}{record_marker}')
    content = content.replace('\n', '--')

    # The text after the last marker is empty when the file ends with a
    # complete record; skip it instead of creating an all-missing row.
    records = [
        dict(zip(columns, chunk.rstrip().split(sep)))
        for chunk in content.split(record_marker)
        if chunk.strip()
    ]

    # -1: can not decide
    df = pd.DataFrame(records).fillna('-1')
    # Map only the bare '-' label; a substring replace would turn an already
    # filled '-1' into '-11'.
    df['label'] = df['label'].apply(lambda x: '-1' if x == '-' else x).astype(int)

    return df

def merge_data(df_uses, df_instances, df_judgments):
    """Join judgments with their instance and with both referenced uses.

    Judgments are merged with the instances on their shared columns, then
    the uses table is joined twice: once through ``dataID1`` and once through
    ``dataID2``. The use-specific columns receive the suffix ``1`` or ``2``;
    ``lemma`` is kept from the second join only.

    Returns:
        pandas.DataFrame restricted to a fixed column order.
    """
    use_columns = ['context', 'indices_target_token', 'indices_target_sentence']

    # first use of the pair
    merged = df_judgments.merge(df_instances)
    merged = merged.merge(df_uses, left_on='dataID1', right_on='dataID')
    merged = merged.drop(columns=['dataID', 'lemma'])
    merged = merged.rename(columns={name: f'{name}1' for name in use_columns})

    # second use of the pair
    merged = merged.merge(df_uses, left_on='dataID2', right_on='dataID')
    merged = merged.drop(columns=['dataID'])
    merged = merged.rename(columns={name: f'{name}2' for name in use_columns})

    return merged[[
        'instanceID', 'dataID1', 'dataID2', 'label', 'annotator', 'lemma',
        'context1', 'context2',
        'indices_target_token1', 'indices_target_sentence1',
        'indices_target_sentence2', 'indices_target_token2',
        'comment', 'label_set', 'non_label', 'dataIDs',
    ]]

# File name of the round; load_instances and load_judgments look it up
# inside their own default directories ('rounds' and 'judgments').
round_ = 'TRoTR.tsv'
df_uses = load_uses()
df_instances = load_instances(round_)
df_judgments = load_judgments(round_)
# One row per judgment, joined with its instance and both usage contexts.
df = merge_data(df_uses, df_instances, df_judgments)

Group judgments

In [18]:
# Number of judgments per (lemma, label) pair.
df.groupby(['lemma', 'label'])[['instanceID']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,instanceID
lemma,label,Unnamed: 2_level_1
"But I suffer not a woman to teach, nor to usurp authority",-1,2
"But I suffer not a woman to teach, nor to usurp authority",1,23
"But I suffer not a woman to teach, nor to usurp authority",2,77
"But I suffer not a woman to teach, nor to usurp authority",3,186
"But I suffer not a woman to teach, nor to usurp authority",4,161
...,...,...
You shall have no other gods before me,4,11
the truth will set you free,1,135
the truth will set you free,2,192
the truth will set you free,3,61


In [19]:
# Annotators to compare; the names are matched against the 'annotator' column of df.
annotators = ['AndreaMariaC', 'Nisha', 'shur']

In [27]:
def judgment4annotator(df, annotators, verbose=False):
    """Collect raw judgments and rounded mean judgments per instance.

    Instances are visited in order of first appearance in ``df``. An instance
    is used only if its ID has the form ``<a>_<b>_<lemma>``, every annotator
    in ``annotators`` judged it, and for every annotator at least one *other*
    annotator gave a label greater than -1. Instances failing any of these
    checks are skipped as a whole, so all returned lists stay aligned
    position by position.

    Args:
        df: judgments frame with columns ``instanceID``, ``annotator`` and
            ``label`` (plus ``context1``/``context2`` when ``verbose``).
        annotators: annotator names, in the order used for the output.
        verbose: print both contexts and the labels of every instance on
            which all annotators gave different labels.

    Returns:
        Tuple ``(judgments, aggregate_agreement, disagreement_judgments,
        aggregate_other_agreement)``:
        - ``judgments[ann][key]``: labels of ``ann``;
        - ``aggregate_agreement[key]``: rounded mean of the labels > -1;
        - ``disagreement_judgments[ann][key]``: labels of ``ann`` on the
          instances where all labels differ;
        - ``aggregate_other_agreement[ann][key]``: rounded mean of the
          labels > -1 given by the annotators other than ``ann``;
        where ``key`` is ``'all'`` or a lemma.
    """
    judgments = defaultdict(lambda: defaultdict(list))
    aggregate_agreement = defaultdict(list)
    aggregate_other_agreement = defaultdict(lambda: defaultdict(list))
    disagreement_judgments = defaultdict(lambda: defaultdict(list))

    # First label of every (instance, annotator) pair, collected in a single
    # pass instead of re-filtering the frame inside the loops below.
    first_label = dict()
    for instance_id, ann, label in zip(df['instanceID'].values, df['annotator'].values, df['label'].values):
        first_label.setdefault((instance_id, ann), label)

    for instance_id in dict.fromkeys(df['instanceID'].values):
        parts = instance_id.split("_") if isinstance(instance_id, str) else []
        if len(parts) < 3:
            continue
        lemma = parts[2]

        # keep only instances judged by every requested annotator
        if any((instance_id, ann) not in first_label for ann in annotators):
            continue
        instance_judgments = [first_label[(instance_id, ann)] for ann in annotators]

        # avoid 'cannot decide' (-1) in the aggregates
        valid = [j for j in instance_judgments if j > -1]
        others = [
            [j for a, j in zip(annotators, instance_judgments) if j > -1 and a != ann]
            for ann in annotators
        ]
        # Validate before touching any output so that a missing aggregate can
        # never leave the per-annotator lists with different lengths.
        if not valid or not all(others):
            continue

        if len(set(instance_judgments)) == len(instance_judgments):
            if verbose:
                shown = df[(df['instanceID'] == instance_id) & (df['annotator'] == annotators[0])]
                print(shown.context1.values[0])
                print(shown.context2.values[0])
                print(instance_id, " ".join(map(str, instance_judgments)), '\n')

            for ann, jud in zip(annotators, instance_judgments):
                disagreement_judgments[ann]['all'].append(jud)
                disagreement_judgments[ann][lemma].append(jud)

        for ann, jud, others_of_ann in zip(annotators, instance_judgments, others):
            judgments[ann]['all'].append(jud)
            judgments[ann][lemma].append(jud)
            others_mean = round(sum(others_of_ann)/len(others_of_ann))
            aggregate_other_agreement[ann]['all'].append(others_mean)
            aggregate_other_agreement[ann][lemma].append(others_mean)

        instance_mean = round(sum(valid)/len(valid))
        aggregate_agreement['all'].append(instance_mean)
        aggregate_agreement[lemma].append(instance_mean)

    return judgments, aggregate_agreement, disagreement_judgments, aggregate_other_agreement

# Four-value variant: also yields aggregate_other_agreement, which the
# "Spearman corr. (annotator, agg)" cell reads.
judgments, aggregate_agreement, disagreement_judgments, aggregate_other_agreement = judgment4annotator(df, annotators)

In [29]:
def judgment4annotator(df, annotators, verbose=False):
    """Collect z-scored judgments per annotator and rounded means per instance.

    Instances are visited in order of first appearance in ``df``. An instance
    is used only if its ID has the form ``<a>_<b>_<lemma>`` and every
    annotator in ``annotators`` judged it. The labels of each annotator are
    standardised over all used instances (mean 0, standard deviation 1).

    NOTE(review): the standardisation includes the -1 "cannot decide" code as
    if it were a regular label value — confirm this is intended.

    Args:
        df: judgments frame with columns ``instanceID``, ``annotator`` and
            ``label`` (plus ``context1``/``context2`` when ``verbose``).
        annotators: annotator names, in the order used for the output.
        verbose: print both contexts and the labels of every instance on
            which all annotators gave different labels.

    Returns:
        Tuple ``(judgments, aggregate_agreement, disagreement_judgments)``:
        - ``judgments[ann]['all']``: standardised labels of ``ann`` as an
          array; ``judgments[ann][lemma]``: the slice for that lemma;
        - ``aggregate_agreement['all']``: list with the rounded mean of the
          labels > -1 of each instance that has at least one such label;
          ``aggregate_agreement[lemma]``: array with the lemma's values;
        - ``disagreement_judgments[ann]['all']``: list of raw labels of
          ``ann`` on the instances where all labels differ;
          ``disagreement_judgments[ann][lemma]``: array for that lemma.
    """
    judgments = defaultdict(lambda: defaultdict(list))
    aggregate_agreement = defaultdict(list)
    disagreement_judgments = defaultdict(lambda: defaultdict(list))

    # First label of every (instance, annotator) pair, collected in a single
    # pass instead of re-filtering the frame inside the loops below.
    first_label = dict()
    for instance_id, ann, label in zip(df['instanceID'].values, df['annotator'].values, df['label'].values):
        first_label.setdefault((instance_id, ann), label)

    for instance_id in dict.fromkeys(df['instanceID'].values):
        parts = instance_id.split("_") if isinstance(instance_id, str) else []
        if len(parts) < 3:
            continue
        lemma = parts[2]

        # keep only instances judged by every requested annotator
        if any((instance_id, ann) not in first_label for ann in annotators):
            continue
        instance_judgments = [first_label[(instance_id, ann)] for ann in annotators]

        if len(set(instance_judgments)) == len(instance_judgments):
            if verbose:
                shown = df[(df['instanceID'] == instance_id) & (df['annotator'] == annotators[0])]
                print(shown.context1.values[0])
                print(shown.context2.values[0])
                print(instance_id, " ".join(map(str, instance_judgments)), '\n')

            # per-lemma entries store positions into the 'all' list
            for ann, jud in zip(annotators, instance_judgments):
                disagreement_judgments[ann]['all'].append(jud)
                disagreement_judgments[ann][lemma].append(len(disagreement_judgments[ann]['all'])-1)

        for ann, jud in zip(annotators, instance_judgments):
            judgments[ann]['all'].append(jud)
            judgments[ann][lemma].append(len(judgments[ann]['all'])-1)

        # avoid 'cannot decide'; an instance without any usable label gets
        # no aggregate entry
        valid = [j for j in instance_judgments if j > -1]
        if valid:
            aggregate_agreement['all'].append(round(sum(valid)/len(valid)))
            aggregate_agreement[lemma].append(len(aggregate_agreement['all'])-1)

    # standardise each annotator's labels over all instances
    for ann in judgments:
        raw = np.array(judgments[ann]['all'])
        judgments[ann]['all'] = (raw - raw.mean())/(raw.std())

    # turn the stored positions into the values they point to
    for ann in judgments:
        for lemma in judgments[ann]:
            if lemma == 'all': continue
            disagreement_judgments[ann][lemma] = np.array([disagreement_judgments[ann]['all'][i] for i in disagreement_judgments[ann][lemma]])
            judgments[ann][lemma] = np.array([judgments[ann]['all'][i] for i in judgments[ann][lemma]])

    for lemma in aggregate_agreement:
        if lemma == 'all': continue
        aggregate_agreement[lemma] = np.array([aggregate_agreement['all'][i] for i in aggregate_agreement[lemma]])

    return judgments, aggregate_agreement, disagreement_judgments

# Three-value variant: rebinds judgments to the z-scored arrays;
# aggregate_other_agreement is not returned here and keeps its earlier value.
judgments, aggregate_agreement, disagreement_judgments = judgment4annotator(df, annotators)

In [21]:
# Keys collected for one annotator: 'all' followed by every lemma seen.
lemmas = list(judgments[annotators[1]])

In [22]:
# Number of judged pairs per key, read from one annotator's judgments.
for lemma in lemmas:
    n_pairs = len(judgments[annotators[1]][lemma])
    print(f'Number of pair ({lemma}):', n_pairs)

Number of pair (all): 1560
Number of pair ((Hosea 8:7)): 150
Number of pair ((1 Timothy 2:12)): 149
Number of pair ((Matthew 7:1)): 150
Number of pair ((1 Corinthians 13:4)): 150
Number of pair ((Ephesians 5:25)): 89
Number of pair ((Jeremiah 17:9)): 99
Number of pair ((Exodus 20:3)): 88
Number of pair ((2 Corinthians 5:17)): 102
Number of pair ((Hebrews 11:1)): 93
Number of pair ((1 Samuel 16:7)): 109
Number of pair ((Genesis 1:1)): 100
Number of pair ((John 8:32)): 98
Number of pair ((1 John 4:8)): 92
Number of pair ((2 Corinthians 5:7)): 91


In [28]:
# Krippendorff's alpha (ordinal) per key; one row of the matrix per annotator.
for lemma in lemmas:
    reliability_data = np.array([judgments[ann][lemma] for ann in judgments])
    alpha = krippendorff.alpha(reliability_data, level_of_measurement='ordinal')
    print(f'Krippendorff Ordinal ({lemma}):', round(alpha, 3))

Krippendorff Ordinal (all): 0.461
Krippendorff Ordinal ((Hosea 8:7)): 0.188
Krippendorff Ordinal ((1 Timothy 2:12)): 0.329
Krippendorff Ordinal ((Matthew 7:1)): 0.472
Krippendorff Ordinal ((1 Corinthians 13:4)): 0.282
Krippendorff Ordinal ((Ephesians 5:25)): 0.494
Krippendorff Ordinal ((Jeremiah 17:9)): 0.26
Krippendorff Ordinal ((Exodus 20:3)): 0.307
Krippendorff Ordinal ((2 Corinthians 5:17)): 0.394
Krippendorff Ordinal ((Hebrews 11:1)): 0.324
Krippendorff Ordinal ((1 Samuel 16:7)): 0.432
Krippendorff Ordinal ((Genesis 1:1)): 0.199
Krippendorff Ordinal ((John 8:32)): 0.328
Krippendorff Ordinal ((1 John 4:8)): 0.266
Krippendorff Ordinal ((2 Corinthians 5:7)): 0.306


In [30]:
# Krippendorff's alpha (interval) per key; one row of the matrix per annotator.
for lemma in lemmas:
    reliability_data = np.array([judgments[ann][lemma] for ann in judgments])
    alpha = krippendorff.alpha(reliability_data, level_of_measurement='interval')
    print(f'Krippendorff Interval ({lemma}):', round(alpha, 3))

Krippendorff Interval (all): 0.5
Krippendorff Interval ((Hosea 8:7)): 0.275
Krippendorff Interval ((1 Timothy 2:12)): 0.339
Krippendorff Interval ((Matthew 7:1)): 0.537
Krippendorff Interval ((1 Corinthians 13:4)): 0.349
Krippendorff Interval ((Ephesians 5:25)): 0.446
Krippendorff Interval ((Jeremiah 17:9)): 0.403
Krippendorff Interval ((Exodus 20:3)): 0.362
Krippendorff Interval ((2 Corinthians 5:17)): 0.458
Krippendorff Interval ((Hebrews 11:1)): 0.266
Krippendorff Interval ((1 Samuel 16:7)): 0.49
Krippendorff Interval ((Genesis 1:1)): 0.247
Krippendorff Interval ((John 8:32)): 0.445
Krippendorff Interval ((1 John 4:8)): 0.322
Krippendorff Interval ((2 Corinthians 5:7)): 0.39


In [13]:
# Spearman correlation between each annotator and the rounded mean of the
# other annotators. Needs aggregate_other_agreement, which only the
# four-value variant of judgment4annotator returns.
for lemma in lemmas:
    print(f'-- {lemma} --')
    for k, ann in enumerate(judgments):
        others_mean = aggregate_other_agreement[ann][lemma]
        own_labels = judgments[ann][lemma]
        corr, pvalue = spearmanr(others_mean, own_labels)
        print(f'Spearman corr. ({annotators[k]}, agg):', round(corr, 3), f'(pvalue={round(pvalue, 3)})')

-- all --
Spearman corr. (AndreaMariaC, agg): 0.57 (pvalue=0.0)
Spearman corr. (Nisha, agg): 0.526 (pvalue=0.0)
Spearman corr. (shur, agg): 0.471 (pvalue=0.0)
-- (Ephesians 5:25) --
Spearman corr. (AndreaMariaC, agg): 0.54 (pvalue=0.0)
Spearman corr. (Nisha, agg): 0.594 (pvalue=0.0)
Spearman corr. (shur, agg): 0.559 (pvalue=0.0)
-- (Jeremiah 17:9) --
Spearman corr. (AndreaMariaC, agg): 0.51 (pvalue=0.0)
Spearman corr. (Nisha, agg): 0.407 (pvalue=0.0)
Spearman corr. (shur, agg): 0.378 (pvalue=0.0)
-- (Exodus 20:3) --
Spearman corr. (AndreaMariaC, agg): 0.471 (pvalue=0.0)
Spearman corr. (Nisha, agg): 0.383 (pvalue=0.0)
Spearman corr. (shur, agg): 0.371 (pvalue=0.0)
-- (2 Corinthians 5:17) --
Spearman corr. (AndreaMariaC, agg): 0.509 (pvalue=0.0)
Spearman corr. (Nisha, agg): 0.481 (pvalue=0.0)
Spearman corr. (shur, agg): 0.497 (pvalue=0.0)
-- (Hebrews 11:1) --
Spearman corr. (AndreaMariaC, agg): 0.453 (pvalue=0.0)
Spearman corr. (Nisha, agg): 0.493 (pvalue=0.0)
Spearman corr. (shur, agg):

In [31]:
# Pairwise Spearman correlation between annotators, per key.
for lemma in lemmas:
    print(f'-- {lemma} --')
    for ann1, ann2 in combinations(annotators, 2):
        labels1 = judgments[ann1][lemma]
        labels2 = judgments[ann2][lemma]
        corr, pvalue = spearmanr(labels1, labels2)
        print(f'Spearman corr. ({ann1}, {ann2}):', round(corr, 3), f'(pvalue={round(pvalue, 3)})')

-- all --
Spearman corr. (AndreaMariaC, Nisha): 0.596 (pvalue=0.0)
Spearman corr. (AndreaMariaC, shur): 0.491 (pvalue=0.0)
Spearman corr. (Nisha, shur): 0.461 (pvalue=0.0)
-- (Hosea 8:7) --
Spearman corr. (AndreaMariaC, Nisha): 0.474 (pvalue=0.0)
Spearman corr. (AndreaMariaC, shur): 0.435 (pvalue=0.0)
Spearman corr. (Nisha, shur): 0.378 (pvalue=0.0)
-- (1 Timothy 2:12) --
Spearman corr. (AndreaMariaC, Nisha): 0.47 (pvalue=0.0)
Spearman corr. (AndreaMariaC, shur): 0.293 (pvalue=0.0)
Spearman corr. (Nisha, shur): 0.27 (pvalue=0.001)
-- (Matthew 7:1) --
Spearman corr. (AndreaMariaC, Nisha): 0.532 (pvalue=0.0)
Spearman corr. (AndreaMariaC, shur): 0.679 (pvalue=0.0)
Spearman corr. (Nisha, shur): 0.455 (pvalue=0.0)
-- (1 Corinthians 13:4) --
Spearman corr. (AndreaMariaC, Nisha): 0.466 (pvalue=0.0)
Spearman corr. (AndreaMariaC, shur): 0.39 (pvalue=0.0)
Spearman corr. (Nisha, shur): 0.29 (pvalue=0.0)
-- (Ephesians 5:25) --
Spearman corr. (AndreaMariaC, Nisha): 0.595 (pvalue=0.0)
Spearman corr.

In [61]:
# Distinct label values and their counts for every annotator, per key.
for lemma in lemmas:
    print(f'-- {lemma} --')
    for ann in annotators:
        values_and_counts = np.unique(np.array(judgments[ann][lemma]), return_counts=True)
        print(f'Class distribution ({ann}):', values_and_counts)

-- all --
Class distribution (AndreaMariaC): (array([1, 2, 3, 4]), array([ 57, 350, 297,  55], dtype=int64))
Class distribution (Nisha): (array([1, 2, 3, 4]), array([259, 260, 207,  33], dtype=int64))
Class distribution (shur): (array([-1,  1,  2,  3,  4]), array([ 22, 145, 203, 263, 126], dtype=int64))
-- (Ephesians 5:25) --
Class distribution (AndreaMariaC): (array([1, 2, 3, 4]), array([ 1, 17, 44,  8], dtype=int64))
Class distribution (Nisha): (array([1, 2, 3, 4]), array([12, 20, 34,  4], dtype=int64))
Class distribution (shur): (array([-1,  1,  2,  3,  4]), array([ 4,  4, 24, 24, 14], dtype=int64))
-- (Jeremiah 17:9) --
Class distribution (AndreaMariaC): (array([1, 2, 3, 4]), array([ 9, 39, 23,  6], dtype=int64))
Class distribution (Nisha): (array([1, 2, 3, 4]), array([45, 19,  8,  5], dtype=int64))
Class distribution (shur): (array([-1,  1,  2,  3,  4]), array([ 3, 17, 15, 28, 14], dtype=int64))
-- (Exodus 20:3) --
Class distribution (AndreaMariaC): (array([1, 2, 3, 4]), array([ 3

# Quality Check

In [173]:
# Round used for the quality check (presumably a re-annotation of
# first-round instances — confirm). This rebinds the round_ name that the
# first loading cell set.
round_ = 'post-1st-round.tsv'

df_instances_post1st = load_instances(round_)
df_judgments_post1st = load_judgments(round_)
# df_uses is reused from the first loading cell.
df_post1st = merge_data(df_uses, df_instances_post1st, df_judgments_post1st)

In [174]:
# Same three annotators as in the earlier cell, listed in a different order.
annotators = ['AndreaMariaC', 'shur', 'Nisha']

In [175]:
# Judgments of the main round restricted to the instances of the quality-check round.
df_1st = df.loc[df['instanceID'].isin(df_post1st['instanceID'].values)]

In [176]:
# Pair each annotator's main-round label (label_x) with the quality-check
# label (label_y) for the same instance and store their absolute difference.
# The cell overwrites df_post1st, so it cannot be re-run without re-running
# the loading cell first.
df_post1st = df_1st[['instanceID', 'annotator', 'label']].merge(
    df_post1st[['instanceID', 'annotator', 'label']],
    on=['instanceID', 'annotator'],
)
df_post1st['diff'] = (df_post1st['label_x'] - df_post1st['label_y']).abs()

In [177]:
# Spearman correlation between the two labels each annotator gave to the same instances.
for ann in annotators:
    ann_rows = df_post1st[df_post1st['annotator'] == ann]
    corr, pvalue = spearmanr(ann_rows.label_x.values, ann_rows.label_y.values)
    print(f'Spearman corr. ({ann}):', round(corr, 3), f'(pvalue={round(pvalue, 3)})')

Spearman corr. (AndreaMariaC): 0.448 (pvalue=0.042)
Spearman corr. (shur): 0.546 (pvalue=0.01)
Spearman corr. (Nisha): 0.415 (pvalue=0.061)


In [180]:
# Number of instances per annotator, main-round label and label difference.
df_post1st.groupby(['annotator', 'label_x', 'diff'])[['instanceID']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,instanceID
annotator,label_x,diff,Unnamed: 3_level_1
AndreaMariaC,1,0,7
AndreaMariaC,1,1,2
AndreaMariaC,2,0,2
AndreaMariaC,2,1,6
AndreaMariaC,2,2,1
AndreaMariaC,3,0,2
AndreaMariaC,3,2,1
Nisha,1,0,7
Nisha,1,1,3
Nisha,1,2,2


In [10]:
# Per-annotator views of the main round; the old index is kept as an 'index' column.
df_shur = df[df['annotator'] == 'shur'].reset_index().drop(columns='annotator')
df_nisha = df[df['annotator'] == 'Nisha'].reset_index().drop(columns='annotator')

In [11]:
# Labels of Nisha (label_x) and shur (label_y) on the same instance, with their absolute difference.
df_sn = df_nisha[['instanceID', 'label']].merge(df_shur[['instanceID', 'label']], on='instanceID')
df_sn['diff'] = (df_sn['label_x'] - df_sn['label_y']).abs()

In [12]:
# Instances on which the two annotators differ by more than one point.
df_sn.loc[df_sn['diff'] > 1]

Unnamed: 0,instanceID,label_x,label_y,diff
1,pair_28_(Matthew 7:1),2,4,2
2,pair_74_(Matthew 7:1),2,4,2
3,pair_4_(Matthew 7:1),2,4,2
6,pair_125_(Matthew 7:1),1,3,2
12,pair_85_(Matthew 7:1),1,3,2
13,pair_121_(Matthew 7:1),1,3,2
20,pair_66_(Matthew 7:1),1,3,2
23,pair_127_(Matthew 7:1),1,3,2
37,pair_53_(Matthew 7:1),1,4,3
41,pair_99_(Matthew 7:1),2,4,2


In [35]:
# First context of one specific instance.
df.loc[df['instanceID'] == 'pair_105_(Matthew 7:1)', 'context1'].iloc[0]

"Exactly. Judge not, that ye be not judged. Is NOT in the Bible. It's a cute coined phrase that sinners use to justify their sin while the meaning of the CONTEXT means to judge those who you don't have a sin in."

In [None]:
Love is eternal, indestructible, and always worth fighting for. Love is patient, love is kind, and love endures forever. Father's rights are God-given.

When the love is patient and kind, it is worthy to 'fight' for.