In [13]:
import jsonlines
import spacy
import spacy.language
from spacy.tokens import Doc
from spacy.scorer import Scorer
from spacy.vocab import Vocab
import statsmodels
import pandas as pd

In [3]:
# path to jsonl overlap files
path_coco = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_coco.jsonl"
path_graf = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_leo.jsonl"
path_hoff = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jona.jsonl"
path_jthn = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jonathan.jsonl"

In [11]:
def jsonl_to_list(path):
    """takes path to jsonl file and returns list of dicts"""
    
    with jsonlines.open(path) as reader:
        list_of_dicts = list(reader)

    return list_of_dicts

In [133]:
def label_per_token(recipe):
    """takes annotated recipes as input and returns a dict that maps label to every token"""

    amount_tokens = len(recipe["tokens"])

    all_token_dict = { tuple(range(token["start"], token["end"]+1)) : "None" for token in recipe["tokens"]}    # each token as list and value "None"

    for span in recipe["spans"]:

        all_chars = list(range(span["start"], span["end"]+1))

        label = span["label"]

        for char in all_chars:              #compare all characters with all token characters
            for tok in all_token_dict.keys():
                if char in tok: 
                    all_token_dict[tok] = label
    
    #print(f"Amount tokens: {amount_tokens} vs. Length dict: {len(all_token_dict.keys())}")

    return all_token_dict #{token: label}

In [134]:
def token_table_per_recipe(recipe):
    """takes example annotated recipe and creates empty dict with token_start_char as indices and ent classes as column labels"""

    columns = ["Tokens", "Z", "TOOL", "V", "ATTR", "PRÄP", "ZEITP", "DAUER", "TEMP", "None"]

    token_table = pd.DataFrame(columns=columns)

    toks = [tuple(range(token["start"], token["end"]+1)) for token in recipe["tokens"]]

    token_table["Tokens"] = toks

    token_table.fillna(0, inplace=True)

    return token_table  #pd style table

In [135]:
def calculate_kappa(table):
    """takes table with tokens and labels as input and returns kappa"""

    kappa = statsmodels.stats.inter_rater.fleiss_kappa(table, method="fleiss")


    return kappa

In [136]:
# list of individual overlap dicts
ov_recipes_coco = jsonl_to_list(path_coco)
ov_recipes_graf = jsonl_to_list(path_graf)
ov_recipes_hoff = jsonl_to_list(path_hoff)
ov_recipes_jthn = jsonl_to_list(path_jthn)

In [137]:
# create dict for each annotator: key=text of recipe, value = ent_set
ov_dict_coco = {example["text"] : label_per_token(example) for example in ov_recipes_coco}
ov_dict_graf = {example["text"] : label_per_token(example) for example in ov_recipes_graf}
ov_dict_hoff = {example["text"] : label_per_token(example) for example in ov_recipes_hoff}
ov_dict_jthn = {example["text"] : label_per_token(example) for example in ov_recipes_jthn}

In [138]:
ov_all_dicts = [ov_dict_coco, ov_dict_jthn, ov_dict_graf, ov_dict_hoff]
annot_names = ["Coco", "Giov", "Graf", "Hoff"]

In [139]:
#create list with empty df per recipe 
recipe_table_dict = {example["text"] : token_table_per_recipe(example) for example in ov_recipes_coco}

In [140]:
for person in ov_all_dicts:                 #loop through all annotators
    for example_recipe in person.keys():    #loop through all recipes of each annotator example_recipe = text
        
        try: 
            dataframe = recipe_table_dict[example_recipe]

            for key in person[example_recipe].keys():  #loop through label_per_tok

                    #print(key)
                        
                    i = dataframe.index[dataframe["Tokens"] == key]
                    #print(i)
                    col_i = dataframe.columns.get_loc(person[example_recipe][key])
                    #print(col_i)
                    dataframe.iloc[i, col_i] += 1
                    
        except: 
            print("Recipe was skipped.")


In [141]:
example_text = ov_recipes_coco[6]["text"]
recipe_table_dict[example_text].head(10)

Unnamed: 0,Tokens,Z,TOOL,V,ATTR,PRÄP,ZEITP,DAUER,TEMP,None
0,"(0, 1, 2, 3)",0,0,0,0,0,0,0,0,0
1,"(4, 5, 6, 7, 8, 9, 10, 11)",0,0,0,0,0,0,0,0,0
2,"(12, 13, 14)",0,0,0,0,0,0,0,0,0
3,"(15, 16, 17, 18, 19, 20)",0,0,0,0,0,0,0,0,0
4,"(21, 22, 23, 24, 25, 26, 27)",0,0,0,0,0,0,0,0,0
5,"(28, 29, 30, 31, 32, 33, 34, 35, 36)",0,0,0,0,0,0,0,0,0
6,"(36, 37)",0,0,0,0,0,0,0,0,0
7,"(37, 38, 39, 40, 41, 42, 43, 44, 45)",0,0,0,0,0,0,0,0,0
8,"(46, 47, 48, 49)",0,0,0,0,0,0,0,0,0
9,"(50, 51, 52, 53, 54, 55, 56, 57)",0,0,0,0,0,0,0,0,0


In [90]:
dict = label_per_token(ov_recipes_coco[0])
df = token_table_per_recipe(ov_recipes_coco[0])
df.head(10)

Amount tokens: 150 vs. Length dict: 150


Unnamed: 0,Tokens,Z,TOOL,V,ATTR,PRÄP,ZEITP,DAUER,TEMP,None
0,"(0, 1, 2, 3)",0,0,0,0,0,0,0,0,0
1,"(4, 5, 6, 7)",0,0,0,0,0,0,0,0,0
2,"(8, 9, 10, 11, 12, 13, 14, 15, 16)",0,0,0,0,0,0,0,0,0
3,"(17, 18, 19, 20, 21, 22)",0,0,0,0,0,0,0,0,0
4,"(23, 24, 25, 26)",0,0,0,0,0,0,0,0,0
5,"(27, 28, 29)",0,0,0,0,0,0,0,0,0
6,"(30, 31, 32, 33, 34, 35, 36, 37, 38)",0,0,0,0,0,0,0,0,0
7,"(38, 39)",0,0,0,0,0,0,0,0,0
8,"(40, 41, 42, 43, 44, 45, 46, 47, 48)",0,0,0,0,0,0,0,0,0
9,"(49, 50, 51, 52, 53, 54, 55)",0,0,0,0,0,0,0,0,0


In [97]:
#fill table for each recipe
for key in dict.keys():

    i = df.index[df["Tokens"] == key]
    col_i = df.columns.get_loc(dict[key])

    df.iloc[i, col_i] += 1

Int64Index([0], dtype='int64')
9
Int64Index([1], dtype='int64')
9
Int64Index([2], dtype='int64')
9
Int64Index([3], dtype='int64')
9
Int64Index([4], dtype='int64')
9
Int64Index([5], dtype='int64')
1
Int64Index([6], dtype='int64')
3
Int64Index([7], dtype='int64')
3
Int64Index([8], dtype='int64')
1
Int64Index([9], dtype='int64')
4
Int64Index([10], dtype='int64')
3
Int64Index([11], dtype='int64')
3
Int64Index([12], dtype='int64')
1
Int64Index([13], dtype='int64')
1
Int64Index([14], dtype='int64')
1
Int64Index([15], dtype='int64')
1
Int64Index([16], dtype='int64')
1
Int64Index([17], dtype='int64')
9
Int64Index([18], dtype='int64')
1
Int64Index([19], dtype='int64')
3
Int64Index([20], dtype='int64')
9
Int64Index([21], dtype='int64')
7
Int64Index([22], dtype='int64')
7
Int64Index([23], dtype='int64')
3
Int64Index([24], dtype='int64')
9
Int64Index([25], dtype='int64')
9
Int64Index([26], dtype='int64')
1
Int64Index([27], dtype='int64')
1
Int64Index([28], dtype='int64')
1
Int64Index([29], dtype='

In [93]:
x = (1, 2)
y = (1, 2)

print(x == y)

True


In [98]:
df.head(20)

Unnamed: 0,Tokens,Z,TOOL,V,ATTR,PRÄP,ZEITP,DAUER,TEMP,None
0,"(0, 1, 2, 3)",0,0,0,0,0,0,0,0,1
1,"(4, 5, 6, 7)",0,0,0,0,0,0,0,0,1
2,"(8, 9, 10, 11, 12, 13, 14, 15, 16)",0,0,0,0,0,0,0,0,1
3,"(17, 18, 19, 20, 21, 22)",0,0,0,0,0,0,0,0,1
4,"(23, 24, 25, 26)",0,0,0,0,0,0,0,0,1
5,"(27, 28, 29)",1,0,0,0,0,0,0,0,0
6,"(30, 31, 32, 33, 34, 35, 36, 37, 38)",0,0,1,0,0,0,0,0,0
7,"(38, 39)",0,0,1,0,0,0,0,0,0
8,"(40, 41, 42, 43, 44, 45, 46, 47, 48)",1,0,0,0,0,0,0,0,0
9,"(49, 50, 51, 52, 53, 54, 55)",0,0,0,1,0,0,0,0,0
