In [59]:
import math
import csv
import krippendorff
from scipy.stats import spearmanr
import re
import pandas as pd
import numpy as np
from pathlib import Path
from itertools import combinations
from collections import defaultdict

# Load data

In [60]:
def load_uses(filename='TRoTR/data/uses.tsv', sep='\t'):
    """Read a delimited uses file into a DataFrame of strings.

    The first line is treated as the header; each following non-empty line
    becomes one row whose fields are paired with the header names in order.

    Args:
        filename: path of the file to read (UTF-8).
        sep: field separator.

    Returns:
        pd.DataFrame with one row per data line; every value is a string.
    """
    records = []
    with open(filename, mode='r', encoding='utf-8') as f:
        # Strip only the line terminator: a bare rstrip() would also remove
        # trailing separators and silently drop empty fields at the end of a line.
        columns = f.readline().rstrip('\n').split(sep)
        for line in f:
            line = line.rstrip('\n')
            if line == '':
                # an empty line (e.g. at the end of the file) is not a record
                continue
            records.append(dict(zip(columns, line.split(sep))))

    return pd.DataFrame(records)

def load_instances(filename, dirname='TRoTR/rounds', sep='\t'):
    """Read a delimited instances file and split its use-pair column.

    The first line is the header. Every non-empty data line becomes one row;
    its 'dataIDs' field ("id1,id2") is additionally split into the two
    columns 'dataID1' and 'dataID2'.

    Args:
        filename: file name inside `dirname`.
        dirname: directory containing the file.
        sep: field separator, used for both the header and the data rows.

    Returns:
        pd.DataFrame of strings with the header columns plus 'dataID1' and 'dataID2'.

    Raises:
        KeyError: if a row has no 'dataIDs' field.
        ValueError: if 'dataIDs' does not contain exactly two comma-separated ids.
    """
    records = []
    with open(f'{dirname}/{filename}', mode='r', encoding='utf-8') as f:
        columns = f.readline().rstrip('\n').split(sep)
        for line in f:
            # Remove the terminator explicitly instead of slicing off the last
            # character, which would truncate a final line lacking a newline.
            line = line.rstrip('\n')
            if line == '':
                continue
            # Split on `sep` (not a hard-coded tab) so the parameter is honoured.
            record = dict(zip(columns, line.split(sep)))
            record['dataID1'], record['dataID2'] = record['dataIDs'].split(',')
            records.append(record)

    return pd.DataFrame(records)

def load_judgments(filename, dirname='TRoTR/judgments', sep='\t'):
    """Read a delimited judgments file and make the 'label' column numeric.

    The first line is the header. Every non-empty data line becomes one row.
    A label of '-' is stored as NaN; all labels are then cast to float.

    Args:
        filename: file name inside `dirname`.
        dirname: directory containing the file.
        sep: field separator.

    Returns:
        pd.DataFrame with a float 'label' column; all other values are strings.

    Raises:
        KeyError: if a row (or the whole file) has no 'label' field.
        ValueError: if a label is neither '-' nor convertible to float.
    """
    records = []
    with open(f'{dirname}/{filename}', mode='r', encoding='utf-8') as f:
        # Strip only the line terminator: a bare rstrip() would also remove
        # trailing separators and drop empty fields at the end of a line.
        columns = f.readline().rstrip('\n').split(sep)
        for line in f:
            line = line.rstrip('\n')
            if line == '':
                continue
            record = dict(zip(columns, line.split(sep)))
            if record['label'] == '-':
                record['label'] = math.nan
            records.append(record)

    df = pd.DataFrame(records)
    df['label'] = df['label'].astype(float)

    return df

def merge_data(df_uses, df_instances, df_judgments):
    """Join judgments, instances and uses into one judgment-level table.

    Judgments are merged with instances on their shared columns, then the uses
    table is joined twice: once for 'dataID1' (its use columns get the suffix
    '1') and once for 'dataID2' (suffix '2'). The 'lemma' column of the result
    comes from the second join.

    Returns:
        pd.DataFrame restricted to, and ordered by, a fixed list of 16 columns.
    """
    use_fields = ['context', 'indices_target_token', 'indices_target_sentence']

    # first use of the pair
    merged = df_judgments.merge(df_instances)
    merged = merged.merge(df_uses, left_on='dataID1', right_on='dataID')
    merged = merged.drop(columns=['dataID', 'lemma'])
    merged = merged.rename(columns={name: f'{name}1' for name in use_fields})

    # second use of the pair
    merged = merged.merge(df_uses, left_on='dataID2', right_on='dataID')
    merged = merged.drop(columns=['dataID'])
    merged = merged.rename(columns={name: f'{name}2' for name in use_fields})

    return merged[[
        'instanceID', 'dataID1', 'dataID2', 'label', 'annotator', 'lemma',
        'context1', 'context2',
        'indices_target_token1', 'indices_target_sentence1',
        'indices_target_sentence2', 'indices_target_token2',
        'comment', 'label_set', 'non_label', 'dataIDs',
    ]]

In [61]:
# Load the three tables of one annotation round and join them into a single
# judgment-level frame (`df`) used by every cell below.
round_ = 'TRoTR.tsv'
df_uses = load_uses()
df_instances = load_instances(round_)
df_judgments = load_judgments(round_)
df = merge_data(df_uses=df_uses, df_instances=df_instances, df_judgments=df_judgments)

# Statistics

In [62]:
def inter_annotator_agreement(df, targets=None, instances=None):
    """Weighted average of pairwise Spearman correlations between annotators.

    For every unordered pair of annotators the labels are aligned on the
    instances both of them judged, and the Spearman correlation is computed
    with NaN labels omitted. Pairs are weighted by their number of shared
    instances.

    Args:
        df: judgment-level frame with columns 'lemma', 'instanceID',
            'annotator' and 'label'.
        targets: lemmas to keep; all lemmas in `df` when None.
        instances: instance IDs to keep; all instances in `df` when None.

    Returns:
        The weighted average rounded to 3 decimals, or NaN when no annotator
        pair yields a defined correlation.
    """
    targets = df.lemma.unique() if targets is None else targets
    instances = df.instanceID.unique() if instances is None else instances
    df = df[df['lemma'].isin(targets) & df['instanceID'].isin(instances)]

    annotators = df.annotator.unique()
    labels_by_annotator = {
        annotator: df.loc[df['annotator'] == annotator, ['instanceID', 'label']]
        for annotator in annotators
    }

    pairwise_spearman = list()
    weights = list()

    # Spearman correlation is symmetric, so each unordered pair is evaluated once.
    for annotator1, annotator2 in combinations(annotators, 2):
        # Align on instanceID: positional alignment after filtering only one side
        # breaks when neither annotator's instances are a subset of the other's.
        shared = labels_by_annotator[annotator1].merge(
            labels_by_annotator[annotator2], on='instanceID', suffixes=('_1', '_2'))
        n_shared = shared.shape[0]
        if n_shared < 2:
            # a rank correlation needs at least two shared instances
            continue

        corr, pvalue = spearmanr(shared['label_1'].values, shared['label_2'].values, nan_policy='omit')
        if corr != corr:
            # undefined correlation (e.g. constant labels): skip it, as the
            # per-target variant does, instead of turning the average into NaN
            continue
        pairwise_spearman.append(corr)
        weights.append(n_shared)

    if not pairwise_spearman:
        return np.nan

    return np.average(pairwise_spearman, weights=weights).round(3)

def krippendorff_agreement(df, targets=None, instances=None):
    """Krippendorff's alpha (ordinal) over the selected judgments.

    Builds one label vector per annotator, ordered by instanceID, inserting
    NaN for instances the annotator has no row for, and passes the vectors to
    `krippendorff.alpha`.

    Args:
        df: judgment-level frame with columns 'lemma', 'instanceID',
            'annotator' and 'label'.
        targets: lemmas to keep; all lemmas in `df` when None.
        instances: instance IDs to keep; all instances in `df` when None.

    Returns:
        Alpha rounded to 3 decimals.
    """
    if targets is None:
        targets = df.lemma.unique()
    if instances is None:
        instances = df.instanceID.unique()
    selected = df[df['lemma'].isin(targets) & df['instanceID'].isin(instances)]

    all_instances = selected.instanceID.unique()
    n_total = all_instances.shape[0]

    reliability_data = []
    for annotator in selected.annotator.unique():
        rated = selected[selected['annotator'] == annotator].sort_values('instanceID')

        if rated.shape[0] < n_total:
            # pad the instances this annotator did not judge with NaN labels
            missing = np.setdiff1d(all_instances, rated.instanceID.values)
            padding = pd.DataFrame({'instanceID': missing, 'label': math.nan})
            rated = pd.concat([rated, padding]).sort_values('instanceID')

        reliability_data.append(rated.label.values)

    alpha = krippendorff.alpha(reliability_data, level_of_measurement='ordinal')
    return round(alpha, 3)

def inter_annotator_agreement_per_target(df, targets=None, instances=None):
    """Per-lemma weighted average of pairwise Spearman correlations.

    For each lemma and each unordered pair of annotators, labels are aligned
    on the instances both annotators judged for that lemma and correlated with
    NaN labels omitted. Undefined correlations are skipped; the remaining ones
    are averaged with the number of shared instances as weight.

    Args:
        df: judgment-level frame with columns 'lemma', 'instanceID',
            'annotator' and 'label'.
        targets: lemmas to keep; all lemmas in `df` when None.
        instances: instance IDs to keep; all instances in `df` when None.

    Returns:
        pd.DataFrame with columns 'lemma', 'avg_pairwise_spearman_agreement'
        (rounded to 3 decimals, NaN when no pair has a defined correlation)
        and 'n_instances' (largest number of instances shared by any pair).
    """
    targets = df.lemma.unique() if targets is None else targets
    instances = df.instanceID.unique() if instances is None else instances
    df = df[df['lemma'].isin(targets) & df['instanceID'].isin(instances)]

    annotators = df.annotator.unique()
    targets = df.lemma.unique()

    averages = list()
    n_instances = list()
    for target in targets:
        # filter once per target instead of once per annotator pair
        df_target = df[df['lemma'] == target]
        labels_by_annotator = {
            annotator: df_target.loc[df_target['annotator'] == annotator, ['instanceID', 'label']]
            for annotator in annotators
        }

        pairwise_spearman = list()
        weights = list()
        max_shared = 0
        # Spearman correlation is symmetric, so each unordered pair is evaluated once.
        for annotator1, annotator2 in combinations(annotators, 2):
            # Align on instanceID rather than by position, so annotators whose
            # instance sets only partially overlap are compared correctly.
            shared = labels_by_annotator[annotator1].merge(
                labels_by_annotator[annotator2], on='instanceID', suffixes=('_1', '_2'))
            n_shared = shared.shape[0]
            max_shared = max(max_shared, n_shared)
            if n_shared < 2:
                continue

            corr, pvalue = spearmanr(shared['label_1'].values, shared['label_2'].values, nan_policy='omit')
            if corr == corr:  # skip undefined (NaN) correlations
                pairwise_spearman.append(corr)
                weights.append(n_shared)

        # np.average raises on an empty list, so report NaN for such targets
        if pairwise_spearman:
            averages.append(np.average(pairwise_spearman, weights=weights).round(3))
        else:
            averages.append(np.nan)
        n_instances.append(max_shared)

    df_res = pd.DataFrame()
    df_res['lemma'] = targets
    df_res['avg_pairwise_spearman_agreement'] = averages
    df_res['n_instances'] = n_instances

    return df_res

def krippendorff_agreement_per_target(df, targets=None, instances=None):
    """Krippendorff's alpha (ordinal) computed separately for each lemma.

    For every lemma, one label vector per annotator is built over that lemma's
    instances (ordered by instanceID, NaN where the annotator has no row) and
    passed to `krippendorff.alpha`.

    Args:
        df: judgment-level frame with columns 'lemma', 'instanceID',
            'annotator' and 'label'.
        targets: lemmas to keep; all lemmas in `df` when None.
        instances: instance IDs to keep; all instances in `df` when None.

    Returns:
        pd.DataFrame with columns 'lemma', 'krippendorff_agreement' (rounded
        to 3 decimals) and 'n_instances'.
    """
    if targets is None:
        targets = df.lemma.unique()
    if instances is None:
        instances = df.instanceID.unique()
    selected = df[df['lemma'].isin(targets) & df['instanceID'].isin(instances)]

    annotators = selected.annotator.unique()

    results = []
    for target in selected.lemma.unique():
        target_instances = selected[selected['lemma'] == target].instanceID.unique()
        n_total = target_instances.shape[0]

        reliability_data = []
        for annotator in annotators:
            is_relevant = (selected['annotator'] == annotator) & (selected['instanceID'].isin(target_instances))
            rated = selected[is_relevant].sort_values('instanceID')

            if rated.shape[0] < n_total:
                # pad the instances this annotator did not judge with NaN labels
                missing = np.setdiff1d(target_instances, rated.instanceID.values)
                padding = pd.DataFrame({'instanceID': missing, 'label': math.nan})
                rated = pd.concat([rated, padding]).sort_values('instanceID')

            reliability_data.append(rated.label.values)

        alpha = krippendorff.alpha(reliability_data, level_of_measurement='ordinal')
        results.append(dict(lemma=target, krippendorff_agreement=round(alpha, 3), n_instances=n_total))

    return pd.DataFrame(results)

In [63]:
# Agreement over all judgments: average pairwise Spearman correlation (aps)
# and Krippendorff's alpha (ka).
aps, ka = inter_annotator_agreement(df), krippendorff_agreement(df)
aps, ka

(0.506, 0.42)

In [64]:
# Per-target agreement table, shortest quotation first.
ka_targets = krippendorff_agreement_per_target(df)
aps_targets = inter_annotator_agreement_per_target(df)  # average pairwise spearman correlation
aps_ka_targets = ka_targets.merge(aps_targets, on='lemma')
aps_ka_targets['length'] = aps_ka_targets['lemma'].map(len)
aps_ka_targets.sort_values(by='length')

Unnamed: 0,lemma,krippendorff_agreement,n_instances_x,avg_pairwise_spearman_agreement,n_instances_y,length
34,that all may be one,0.118,150,0.183,150,19
37,Seventy times seven,0.494,150,0.619,150,19
19,turn the other cheek,-0.036,150,0.51,150,20
21,Seek and you will find,0.21,150,0.558,150,22
14,The Lord is my shepherd,0.213,150,0.476,150,23
11,the truth will set you free,0.25,150,0.347,150,27
3,"Love is patient, love is kind",0.282,150,0.382,150,29
35,For everything there is a season,0.263,150,0.443,150,32
2,"Judge not, that ye be not judged",0.472,150,0.555,150,32
28,all things work together for good,0.11,150,0.543,150,33


In [56]:
# Recompute overall agreement on the targets whose average pairwise
# Spearman agreement exceeds 0.15.
has_min_agreement = aps_ka_targets['avg_pairwise_spearman_agreement'] > 0.15
targets = aps_ka_targets[has_min_agreement].lemma.values
new_aps = inter_annotator_agreement(df, targets)  # average pairwise spearman correlation
new_ka = krippendorff_agreement(df, targets)
n_targets_total = df.lemma.unique().shape[0]
print((new_aps, new_ka, f'{len(targets)} out of {n_targets_total} targets'))

(0.511, 0.423, '40 out of 42 targets')


In [57]:
# Keep only instances with judgment difference smaller than 2.
# pandas min/max skip NaN ("cannot decide") labels, so the result does not
# depend on the order in which the judgments appear; instances whose labels
# are all NaN have an undefined spread and are dropped.
label_range = df.groupby('instanceID')['label'].agg(['min', 'max'])
filtered_instances = label_range[(label_range['max'] - label_range['min']) < 2].index.values

# Remove instances with avg judgment strictly between 2 and 3
# (the mean also ignores NaN labels).
mean_label = df.groupby('instanceID')['label'].mean()
is_clear_cut = (mean_label <= 2) | (mean_label >= 3)
instances = mean_label[is_clear_cut & mean_label.index.isin(filtered_instances)].index.values

new_aps = inter_annotator_agreement(df, instances=instances) # average pairwise spearman correlation
new_ka = krippendorff_agreement(df, instances=instances)
print((new_aps, new_ka, f'{len(instances)} out of {df.instanceID.unique().shape[0]} instances'))

(0.811, 0.709, '3821 out of 6300 instances')


In [58]:
# Per-target agreement table restricted to the filtered instances,
# shortest quotation first.
ka_targets = krippendorff_agreement_per_target(df, instances=instances)
aps_targets = inter_annotator_agreement_per_target(df, instances=instances)  # average pairwise spearman correlation
aps_ka_targets = ka_targets.merge(aps_targets, on='lemma')
aps_ka_targets['length'] = aps_ka_targets['lemma'].map(len)
aps_ka_targets.sort_values(by='length')

Unnamed: 0,lemma,krippendorff_agreement,n_instances_x,avg_pairwise_spearman_agreement,n_instances_y,length
34,that all may be one,0.677,79,0.782,79,19
37,Seventy times seven,0.764,95,0.857,95,19
19,turn the other cheek,-0.193,120,0.405,120,20
21,Seek and you will find,0.097,125,0.475,125,22
14,The Lord is my shepherd,0.138,117,0.431,117,23
11,the truth will set you free,0.217,101,0.368,101,27
3,"Love is patient, love is kind",0.798,61,0.834,61,29
35,For everything there is a season,0.369,104,0.557,104,32
2,"Judge not, that ye be not judged",0.45,96,0.469,96,32
28,all things work together for good,-0.03,124,0.43,124,33


# DURel format

In [11]:
# Export uses and judgments as tab-separated files, once per target in a
# target-specific directory and once in shared 'uses' / 'judgments' directories.
durel_uses = df_uses.copy()
durel_judgments = df.copy()

# DURel preprocessing
durel_judgments['lemma'] = durel_judgments['lemma'].apply(lambda x: x.replace(' ', '_').replace(',', ''))
durel_judgments['timestamp'] = ''
durel_judgments = durel_judgments.rename(columns={'dataID1': 'identifier1', 
                                                  'dataID2': 'identifier2', 
                                                  'label':'judgment'})
columns = ['identifier1', 'identifier2', 'annotator', 'judgment', 'comment', 'lemma', 'timestamp']
# .copy() so the assignment below writes to an independent frame, not a slice
durel_judgments = durel_judgments[columns].copy()
# NaN ("cannot decide") judgments are exported as 0.0
durel_judgments['judgment'] = durel_judgments['judgment'].fillna(0.0)

durel_uses['lemma'] = durel_uses['lemma'].apply(lambda x: x.replace(' ', '_').replace(',', ''))
durel_uses['description'] = ""
durel_uses['pos'] = ""
durel_uses['date'] = ""
durel_uses['grouping'] = ""
durel_uses = durel_uses.rename(columns={'dataID': 'identifier', 
                                        'indices_target_sentence': 'indexes_target_sentence', 
                                        'indices_target_token': 'indexes_target_token'})
columns = ['lemma', 'pos', 'date', 'grouping', 'identifier', 'description', 'context', 'indexes_target_token', 'indexes_target_sentence']
durel_uses = durel_uses[columns]

# the shared output directories do not depend on the target: create them once
durel_root = Path('TRoTR/DURel_data')
(durel_root / 'uses').mkdir(parents=True, exist_ok=True)
(durel_root / 'judgments').mkdir(parents=True, exist_ok=True)

targets = durel_uses.lemma.unique()
for target in targets:
    df_target_uses = durel_uses[durel_uses['lemma'] == target]
    df_target_judgments = durel_judgments[durel_judgments['lemma'] == target]

    # find id quote: the parenthesised part of the first use identifier
    # (raw string: '\(' is an invalid escape sequence in a normal string literal)
    first_identifier = df_target_uses.iloc[0].identifier
    match = re.search(r'\(.*\)', first_identifier)
    if match is None:
        raise ValueError(f'no parenthesised quote id in identifier {first_identifier!r}')
    id_quote = match.group(0).replace(':', ' ')

    # make the target directory and store both dataframes
    target_dir = durel_root / id_quote
    target_dir.mkdir(parents=True, exist_ok=True)
    df_target_uses.to_csv(target_dir / 'uses.tsv', index=False, sep='\t', quoting=csv.QUOTE_NONE)
    df_target_uses.to_csv(durel_root / 'uses' / f'{id_quote}.tsv', index=False, sep='\t', quoting=csv.QUOTE_NONE)
    df_target_judgments.to_csv(target_dir / 'judgments.tsv', index=False, sep='\t', quoting=csv.QUOTE_NONE)
    df_target_judgments.to_csv(durel_root / 'judgments' / f'{id_quote}.tsv', index=False, sep='\t', quoting=csv.QUOTE_NONE)

Number of pairs with more than one 'cannot_decide'

In [None]:
# Rows whose label is not one of 1-4, with every missing value shown as '-',
# counted per (instanceID, label); keep the groups with more than one annotator.
undecided = df[~df['label'].isin([1, 2, 3, 4])].fillna('-')
cannot_decide = undecided.groupby(['instanceID', 'label']).count().reset_index()
cannot_decide[cannot_decide['annotator'] > 1]

In [37]:
# tutorial: Spearman correlation between a gold labelling and one annotator on
# toy data. `math` and `spearmanr` come from the notebook's import cell, so
# nothing is re-imported here.

gold = [4, 1, 2, 3, 4, 3, 3, 2, 1, 2, 4, 1, 3, 4, 1, 2, 1, 3, 2, 1, 3, 4, 1, 4, 4] 
ann1 = [4, math.nan, 2, 3, 4, 4, 3, 3, 2, 2, 4, 2, 3, 4, 1, 3, 2, 4, 3, 3, 3, 4, 3, 4, 3] 
ann2 = [3, 1, 1, 3, 4, 4, 4, 3, 3, 3, 4, 2, 3, 4, 4, 4, 1, 3, 2, 1, 2, 3, 1, 4, 3]
ann3 = [4, 1, 1, 3, 4, 3, 3, 4, 1, 2, 3, 3, 4, 4, 4, 3, 1, 4, 4, 1, 4, 4, 3, 4, 3]
ann4 = [2, 2, 3, 4, 4, 3, 4, 3, 1, 2, 4, 2, 4, 3, 3, 3, 2, 2, 3, 2, 4, 4, 1, 3, 4]

print(spearmanr(gold, ann4, nan_policy='omit'))

SignificanceResult(statistic=0.6574937649306044, pvalue=0.00035505820322849984)
