In [2]:
import pandas as pd
import numpy as np
import pickle
import glob
import os
import matplotlib.pyplot as plt
import statsmodels
from statsmodels import stats
from statsmodels.stats import inter_rater
import krippendorff
from statsmodels.stats.inter_rater import fleiss_kappa

""" 
Compute inter-annotator agreement (IAA) on the entity labels only (the O label is excluded)

"""

def get_paths(input_folder):
    """
    Collect the .conll files located directly inside a folder.

    :param input_folder: folder to search (sub-folders are not searched)
    :return: list of file path strings in sorted order; empty if nothing matches
    """
    # glob gives no ordering guarantee (it depends on the file system), so the
    # matches are sorted to keep the row order of downstream tables reproducible.
    return sorted(glob.glob(input_folder + '/*.conll'))


def load_text(txt_path):
    """
    Read a UTF-8 encoded text file and return its lines.

    :param txt_path: path of the file to read
    :return: list of strings, one per line; each line keeps its trailing newline
    """
    with open(txt_path, mode='rt', encoding="utf-8") as handle:
        lines = list(handle)
    return lines


def get_counts(paths, labels=("B-MON", "I-MON", "B-QNT", "I-QNT",
                              "B-TGD", "I-TGD", "B-MSR", "I-MSR")):
    """
    Count how often each label occurs in the lines of one annotated file.

    The label of a line is its second whitespace-separated column. Lines with
    fewer than two columns (blank lines, a token without a label) carry no
    label and are skipped.

    :param paths: iterable of text lines (despite the name, not file paths)
    :param labels: labels to count; the result follows this order
    :return: tuple of ints, one count per entry of ``labels``
    """
    tags = []
    for line in paths:
        columns = line.split()
        # Two columns are required before reading index 1; a one-column line
        # would otherwise raise IndexError.
        if len(columns) > 1:
            tags.append(columns[1])

    return tuple(tags.count(label) for label in labels)


def count_total_scores(scores):
    """
    Sum the label counts of all files, position by position.

    :param scores: sequence of equally long count sequences, one per file
    :return: list with one total per position of the first entry;
             an empty list when ``scores`` is empty
    """
    # Without this guard, scores[0] raises IndexError when no file was processed.
    if len(scores) == 0:
        return []

    width = len(scores[0])
    return [sum(row[i] for row in scores) for i in range(width)]
    

input_folder = '../data/interannotator'
txt_paths = get_paths(input_folder)
if not txt_paths:
    # Fail early with a readable message instead of an IndexError further down.
    raise FileNotFoundError("No .conll files found in {!r}".format(input_folder))


# Build the table of label counts (one row per annotator file) and compute
# Krippendorff's alpha on it.

entities = ["B-MON","I-MON","B-QNT",'I-QNT','B-TGD',"I-TGD","B-MSR","I-MSR"]

# os.path.basename uses the host OS path separator; the former split on
# "interannotator\\" only produced a filename on Windows.
filenames = [os.path.basename(path) for path in txt_paths]
scores = [get_counts(load_text(path)) for path in txt_paths]

total_scores = count_total_scores(scores)
table = np.asarray(scores)
print(table)

# NOTE(review): `table` has one row per annotator file and one column per label
# *count*, so this alpha compares label totals between annotators, not their
# token-by-token label choices. Confirm this is the intended agreement measure.
# The value is Krippendorff's alpha; the variable keeps its existing name `kappa`.
kappa = krippendorff.alpha(table)
print(kappa)

# One column per label, with the filename as the first column.
tabel = pd.DataFrame(table, columns=entities)
tabel.insert(0, "Filename", filenames)

tabel

[[ 6  0 11  0  6  1  8  0]
 [ 2  0 13  0  7  0 10  0]
 [ 2  0 12  2 12  0 12  0]]
0.8872218969555035


Unnamed: 0,Filename,B-MON,I-MON,B-QNT,I-QNT,B-TGD,I-TGD,B-MSR,I-MSR
0,amber_missive_13_15_text.conll,6,0,11,0,6,1,8,0
1,jasmine_missive_13_15_text.conll,2,0,13,0,7,0,10,0
2,mel_missive_13_15_text.conll,2,0,12,2,12,0,12,0


In [3]:
""" 
Compute inter-annotator agreement (IAA) on all labels, including the O label

"""

def get_paths(input_folder):
    """
    Collect the .conll files located directly inside a folder.

    :param input_folder: folder to search (sub-folders are not searched)
    :return: list of file path strings in sorted order; empty if nothing matches
    """
    # glob gives no ordering guarantee (it depends on the file system), so the
    # matches are sorted to keep the row order of downstream tables reproducible.
    matches = glob.glob(input_folder + '/*.conll')
    return sorted(matches)


def load_text(txt_path):
    """
    Read a UTF-8 encoded text file and return its lines.

    :param txt_path: path of the file to read
    :return: list of strings, one per line; each line keeps its trailing newline
    """
    with open(txt_path, mode='rt', encoding="utf-8") as handle:
        lines = [line for line in handle]
    return lines


def get_counts(paths, labels=("B-MON", "I-MON", "B-QNT", "I-QNT",
                              "B-TGD", "I-TGD", "B-MSR", "I-MSR", "O")):
    """
    Count how often each label (including "O") occurs in the lines of one
    annotated file.

    The label of a line is its second whitespace-separated column. Lines with
    fewer than two columns (blank lines, a token without a label) carry no
    label and are skipped.

    :param paths: iterable of text lines (despite the name, not file paths)
    :param labels: labels to count; the result follows this order
    :return: tuple of ints, one count per entry of ``labels``
    """
    tags = []
    for line in paths:
        columns = line.split()
        # Two columns are required before reading index 1; a one-column line
        # would otherwise raise IndexError.
        if len(columns) > 1:
            tags.append(columns[1])

    return tuple(tags.count(label) for label in labels)


def count_total_scores(scores):
    """
    Sum the label counts of all files, position by position.

    :param scores: sequence of equally long count sequences, one per file
    :return: list with one total per position of the first entry;
             an empty list when ``scores`` is empty
    """
    # Without this guard, scores[0] raises IndexError when no file was processed.
    if len(scores) == 0:
        return []

    width = len(scores[0])
    return [sum(row[i] for row in scores) for i in range(width)]


input_folder = '../data/interannotator'
txt_paths = get_paths(input_folder)
if not txt_paths:
    # Fail early with a readable message instead of an IndexError further down.
    raise FileNotFoundError("No .conll files found in {!r}".format(input_folder))


# Build the table of label counts (one row per annotator file, O label
# included) and compute Krippendorff's alpha on it.

entities = ["B-MON","I-MON","B-QNT",'I-QNT','B-TGD',"I-TGD","B-MSR","I-MSR", "O"]

# os.path.basename uses the host OS path separator; the former split on
# "interannotator\\" only produced a filename on Windows.
filenames = [os.path.basename(path) for path in txt_paths]
scores = [get_counts(load_text(path)) for path in txt_paths]

total_scores = count_total_scores(scores)
table = np.asarray(scores)
print(table)

# NOTE(review): `table` has one row per annotator file and one column per label
# *count*, so this alpha compares label totals between annotators, not their
# token-by-token label choices. The O column is far larger than the others and
# nearly equal across rows, which presumably explains why alpha is so close
# to 1 here - confirm this is the intended agreement measure.
# The value is Krippendorff's alpha; the variable keeps its existing name `kappa`.
kappa = krippendorff.alpha(table)
print(kappa)

# One column per label, with the filename as the first column.
tabel = pd.DataFrame(table, columns=entities)
tabel.insert(0, "Filename", filenames)

tabel

[[  6   0  11   0   6   1   8   0 939]
 [  2   0  13   0   7   0  10   0 939]
 [  2   0  12   2  12   0  12   0 931]]
0.999945554048461


Unnamed: 0,Filename,B-MON,I-MON,B-QNT,I-QNT,B-TGD,I-TGD,B-MSR,I-MSR,O
0,amber_missive_13_15_text.conll,6,0,11,0,6,1,8,0,939
1,jasmine_missive_13_15_text.conll,2,0,13,0,7,0,10,0,939
2,mel_missive_13_15_text.conll,2,0,12,2,12,0,12,0,931


# End of Notebook
