In [86]:
import json
from collections import defaultdict, Counter

In [53]:
#!/usr/bin/env python

'''
Created on Aug 1, 2016
@author: skarumbaiah

Computes Fleiss' Kappa 
Joseph L. Fleiss, Measuring Nominal Scale Agreement Among Many Raters, 1971.
'''

def checkInput(rate, n):
    """ 
    Check correctness of the input matrix
    @param rate - ratings matrix containing number of ratings for each subject per category
    [size - N X k where N = #subjects and k = #categories]
    @param n - number of raters
    @return None - the function only validates its input
    @throws AssertionError if the matrix is empty, a row length differs from the first row,
    an element is not an integer, or a row does not sum to n
    """
    # An empty matrix has no first row to take the category count from.
    assert len(rate) > 0, "Empty ratings matrix"
    k = len(rate[0])
    # Every subject (row) must have exactly one count per category; iterate over
    # the rows themselves rather than over range(k), which indexes categories.
    assert all(len(row) == k for row in rate), "Row length != #categories)"
    assert all(isinstance(count, int) for row in rate for count in row), "Element not integer" 
    # Each subject must have been rated by all n raters.
    assert all(sum(row) == n for row in rate), "Sum of ratings != #raters)"

def fleissKappa(rate,n):
    """ 
    Computes the Kappa value
    @param rate - ratings matrix containing number of ratings for each subject per category 
    [size - N X k where N = #subjects and k = #categories]
    @param n - number of raters (every row of rate must sum to n)
    @return fleiss' kappa rounded to 3 decimals, or -inf when the expected
    agreement is exactly 1 (kappa is undefined in that case)
    @throws ValueError if rate has no subjects or n is smaller than 2
    @throws AssertionError if rate fails the checks in checkInput
    """
    # Reject degenerate inputs up front: an empty matrix has no rate[0], and
    # with fewer than two raters the pair count n * (n - 1) below is zero.
    if len(rate) == 0:
        raise ValueError("Ratings matrix is empty: need at least one subject")
    if n < 2:
        raise ValueError("Fleiss' Kappa needs at least 2 raters per subject")

    N = len(rate)
    k = len(rate[0])
    print("#raters = ", n, ", #subjects = ", N, ", #categories = ", k)
    checkInput(rate, n)

    #mean of the extent to which raters agree for the ith subject 
    rater_pairs = n * (n - 1)
    PA = sum((sum(count**2 for count in row) - n) / rater_pairs for row in rate) / N
    print("PA = ", PA)
    
    # mean of squares of proportion of all assignments which were to jth category
    total_ratings = N * n
    category_share = [sum(row[j] for row in rate) / total_ratings for j in range(k)]
    PE = sum(share**2 for share in category_share)
    print("PE =", PE)
    
    # Stays at -inf when every rating fell into a single category (PE == 1).
    kappa = -float("inf")
    try:
        kappa = (PA - PE) / (1 - PE)
        kappa = float("{:.3f}".format(kappa))
    except ZeroDivisionError:
        print("Expected agreement = 1")

    print("Fleiss' Kappa =", kappa)
    
    return kappa


In [None]:
c  # NOTE(review): unexecuted cell; `c` is not defined anywhere in the visible notebook, so this would raise NameError on Restart & Run All - presumably a stray keystroke, delete if so

In [144]:
# compiled = []
# labels = ['true','false','neutral']
# with open("../data-label-ground-truth/ncid_unique_generated_dataset_v0.4_2021_02_08.jsonl") as f:
#     for instance in map(json.loads, f):
#         anno_lines = {}
#         for evidence in instance['evidence']:
#             for page, annotations in evidence['annotations'].items():
#                 for line, scores in annotations.items():
# #                     if sum(len(v) for k,v in scores.items()) == 5:
#                         compiling = []
#                         compiling.append(len(scores['true']) if 'true' in scores else 0)
#                         compiling.append(len(scores['false']) if 'false' in scores else 0)
#                         sofar = sum(compiling)
#                         compiling.append(sum(len(v) for k,v in scores.items())-sofar)
#                         compiled.append(compiling)
#                         print(compiling)
#                         break
# Tally how the top-level predictions split for every (page, start_line, end_line)
# span of each instance, and keep the spans that received exactly five
# predictions as [#true, #false, #other] rows for the Fleiss' kappa cell.
compiled = []   # rating rows of the spans with exactly 5 predictions
clens = []      # number of predictions collected for every span
perf = tf = tn = fn = 0   # unanimous / true+false / true+'' / ''+false span counts

with open("generated_dataset_v0.6_2021_03_31.jsonl") as f:
    for raw_line in f:
        instance = json.loads(raw_line)

        # Group the predictions of this instance by the span they refer to.
        cnt = defaultdict(list)
        for evidence in instance['evidence']:
            # Only the page keys of the annotations are needed; the hits come
            # from the metadata entry of the same page. (The former
            # page.endswith(":0") filter was disabled, so every page is used.)
            for page in evidence['annotations']:
                for hit in evidence['metadata'][page]:
                    span_key = f"{page}_{hit['start_line']}_{hit['end_line']}"
                    cnt[span_key].append(hit['top_level_prediction'])

        for votes in cnt.values():
            ctr = Counter(votes)
            # A Counter yields 0 for labels that never occurred.
            n_true = ctr['true']
            n_false = ctr['false']
            n_blank = ctr['']
            n_other = len(votes) - (n_true + n_false)

            if len(ctr) == 1:
                perf += 1
            if n_true and n_false:
                tf += 1
            if n_true and n_blank:
                tn += 1
            if n_blank and n_false:
                fn += 1

            # The three counts always add up to the number of predictions.
            clens.append(len(votes))
            if len(votes) == 5:
                compiled.append([n_true, n_false, n_other])

print(perf,tf,tn,fn)
dsit = Counter(clens)
print(dsit)

63201 6248 20366 10778
Counter({2: 55456, 5: 23025, 7: 6272, 6: 3322, 3: 2518, 4: 17, 8: 8})


In [143]:
fleissKappa(compiled,5)

#raters =  5 , #subjects =  5370 , #categories =  3
PA =  0.7297579143389145
PE = 0.43510911089610876
Fleiss' Kappa = 0.522


0.522