In [1]:
import jsonlines
import spacy
import spacy.language
from spacy.tokens import Doc
from spacy.scorer import Scorer
from spacy.vocab import Vocab
import statsmodels
import pandas as pd
import numpy as np

In [2]:
# Paths to the jsonl overlap files, one export per annotator.
# NOTE(review): overlap_dir is an absolute, machine-specific location; adjust it to run elsewhere.
overlap_dir = "/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total"

path_coco = overlap_dir + "/overlap_coco.jsonl"
path_graf = overlap_dir + "/overlap_leo.jsonl"
path_hoff = overlap_dir + "/overlap_jona.jsonl"
path_jthn = overlap_dir + "/overlap_jonathan.jsonl"

In [3]:
def jsonl_to_list(path):
    """Read the jsonl file at ``path`` and return its records as a list.

    The records are returned in file order; for the overlap files used in
    this notebook each record is a dict.
    """
    with jsonlines.open(path) as jsonl_reader:
        records = [record for record in jsonl_reader]
    return records

In [4]:
def rel_per_entcomb(recipe):
    """Map each annotated relation of a recipe to its label.

    Parameters
    ----------
    recipe : dict
        Annotated recipe with a "relations" list. Every relation carries a
        "head_span" and a "child_span" (each with "token_start" and
        "token_end") and a "label".

    Returns
    -------
    dict
        ``{(head_tokens, child_tokens): label}`` where both token entries are
        tuples of consecutive token indices, ``token_end`` included. If the
        same pair occurs more than once, the last label wins.
    """

    def token_indices(span):
        # token_end is inclusive, hence the +1
        return tuple(range(span["token_start"], span["token_end"] + 1))

    return {
        (token_indices(rel["head_span"]), token_indices(rel["child_span"])): rel["label"]
        for rel in recipe["relations"]
    }

In [5]:
def ents_per_recipe(recipe):
    """Collect the annotated entities of a recipe as a set.

    Every element of ``recipe["spans"]`` becomes one ``(tokens, label)`` pair,
    where ``tokens`` is the tuple of token indices from ``token_start`` up to
    and including ``token_end``.
    """
    return {
        (tuple(range(span["token_start"], span["token_end"] + 1)), span["label"])
        for span in recipe["spans"]
    }

In [6]:
def matching_entities_set(list_of_annotator_dicts):
    """Find, per recipe, the entities on which all annotators agree.

    Parameters
    ----------
    list_of_annotator_dicts : list of dict
        One dict per annotator, mapping a recipe key to a tuple whose first
        element is that annotator's set of entities for the recipe. The
        recipes of the first annotator determine which recipes are examined.

    Returns
    -------
    dict
        Recipe key -> set of entities present in every annotator's set.
        Recipes missing from at least one annotator are left out (a notice is
        printed for each of them).

    Besides returning the dict, the function prints three numbers: the total
    number of entities over all annotators, the number of agreed entities
    counted once per annotator, and the ratio of the two (``nan`` when there
    is nothing to compare).
    """
    n_annotators = len(list_of_annotator_dicts)
    recipes = list_of_annotator_dicts[0] if list_of_annotator_dicts else {}

    amount_entities = 0
    amount_match = 0
    match_ent_per_recipe = {}

    for rec in recipes:
        try:
            all_sets_per_rec = [an[rec][0] for an in list_of_annotator_dicts]
        except KeyError:
            # Only a missing recipe is expected here; any other error should surface.
            print("Recipe wasn't annotated by all.")
            continue

        intersec = set.intersection(*all_sets_per_rec)
        match_ent_per_recipe[rec] = intersec

        amount_entities += sum(len(ent_set) for ent_set in all_sets_per_rec)
        # An agreed entity appears once in every annotator's set, so it is
        # weighted by the actual number of annotators.
        amount_match += len(intersec) * n_annotators

    print(amount_entities)
    print(amount_match)
    print(amount_match / amount_entities if amount_entities else float("nan"))

    return match_ent_per_recipe

In [7]:
def ent_comb_dataframe(m_entities_dict):
    """Build one zero-initialised vote table per recipe.

    Parameters
    ----------
    m_entities_dict : dict
        Recipe key -> collection of ``(tokens, label)`` entities.

    Returns
    -------
    dict
        Recipe key -> DataFrame with one row per ordered pair of distinct
        entities. Column "EntComb" holds ``(tokens_a, tokens_b)``; the columns
        "ARG0", "ARG1", "ARG" and "ARGNone" are filled with zeros.
    """
    count_columns = ("ARG0", "ARG1", "ARG", "ARGNone")
    frames_by_recipe = {}

    for rec, matched in m_entities_dict.items():
        ents = list(matched)

        # every ordered pair of two different entities, token tuples only
        pairs = [
            (head[0], child[0])
            for head_pos, head in enumerate(ents)
            for child_pos, child in enumerate(ents)
            if head_pos != child_pos
        ]

        zeros = np.zeros(len(pairs))
        table = {"EntComb": pairs}
        for column in count_columns:
            table[column] = zeros

        frames_by_recipe[rec] = pd.DataFrame(table)

    return frames_by_recipe

In [8]:
# Load every annotator's overlap file into its own list of records.
ov_recipes_coco, ov_recipes_graf, ov_recipes_hoff, ov_recipes_jthn = (
    jsonl_to_list(overlap_path)
    for overlap_path in (path_coco, path_graf, path_hoff, path_jthn)
)

In [9]:
def _entities_and_relations_by_text(recipes):
    """Index annotated recipes by their text: text -> (entity set, relation dict)."""
    return {
        example["text"]: (ents_per_recipe(example), rel_per_entcomb(example))
        for example in recipes
    }


# One lookup per annotator: recipe text -> (entities, relation per entity combination).
ov_dict_coco = _entities_and_relations_by_text(ov_recipes_coco)
ov_dict_graf = _entities_and_relations_by_text(ov_recipes_graf)
ov_dict_hoff = _entities_and_relations_by_text(ov_recipes_hoff)
ov_dict_jthn = _entities_and_relations_by_text(ov_recipes_jthn)

In [10]:
# All annotator dictionaries in one list; the first entry supplies the recipe
# keys that matching_entities_set iterates over.
ov_all_dicts = [ov_dict_coco, ov_dict_jthn, ov_dict_graf, ov_dict_hoff]
# NOTE(review): presumably parallel to ov_all_dicts ("Giov" <-> ov_dict_jthn) — confirm;
# annot_names is not used in the cells visible here.
annot_names = ["Coco", "Giov", "Graf", "Hoff"]

In [11]:
# Create dict with the entities that all annotators agree on, per recipe
# (recipe text -> set of (tokens, label) entities found in every annotator's set).
# The call also prints the entity total, the matched total and their ratio.
matching_entities_per_rec = matching_entities_set(ov_all_dicts)

Recipe wasn't annotated by all.
Recipe wasn't annotated by all.
5428
5088
0.9373618275607959


In [13]:
# Create dict with one entity-combination DataFrame per recipe (based on the
# matching entities); all count columns are still zero at this point.
dict_with_dfs = ent_comb_dataframe(matching_entities_per_rec)
# NOTE(review): this prints every DataFrame of every recipe; consider showing a single entry instead.
print(dict_with_dfs)

Hitze kochen.2/3 vom Weißbrot grob, 1/3 fein würfeln. Die groben Brotwürfel zur Suppe geben und nochmals 10 Min. kochen lassen. 4 EL vom Parmesan einrühren und die Suppe etwas abkühlen lassen. Petersilienblätter und Sellerieblätter abzupfen, waschen und hacken. Die Suppe pürieren und mit Salz, Pfeffer und Muskat abschmecken. 3/4 der Suppe beiseite stellen. Den Rest mit den Kräutern pürieren. Die helle Suppe mit 200 g Schlagsahne aufkochen. Die kleinen Brotwürfel in 5 EL Olivenöl rösten. Die helle Suppe auf Tellern verteilen, in die Mitte etwas von der grünen Suppe geben.Mit den Brotwürfeln und 2 EL Parmesan garnieren.':                        EntComb  ARG0  ARG1  ARG  ARGNone
0               ((26,), (10,))   0.0   0.0  0.0      0.0
1               ((26,), (11,))   0.0   0.0  0.0      0.0
2               ((26,), (78,))   0.0   0.0  0.0      0.0
3               ((26,), (65,))   0.0   0.0  0.0      0.0
4     ((26,), (109, 110, 111))   0.0   0.0  0.0      0.0
...                        ...

In [14]:
# Inspect the raw per-annotator dictionaries (produces a very large output).
ov_all_dicts

    ((52,), 'V'),
    ((54,), 'V'),
    ((56,), 'Z'),
    ((57,), 'V'),
    ((59,), 'V'),
    ((61, 62), 'ATTR'),
    ((63,), 'V'),
    ((65,), 'Z'),
    ((66,), 'V'),
    ((68, 69), 'ATTR'),
    ((70,), 'V'),
    ((72,), 'V'),
    ((75,), 'Z'),
    ((76,), 'V'),
    ((78,), 'ATTR'),
    ((79,), 'V'),
    ((82,), 'Z'),
    ((83,), 'V'),
    ((85,), 'Z'),
    ((86,), 'PRÄP'),
    ((88,), 'TOOL'),
    ((89,), 'V'),
    ((91, 92, 93, 94, 95, 96), 'Z'),
    ((97,), 'V'),
    ((100,), 'Z'),
    ((101,), 'ATTR'),
    ((102, 103, 104), 'DAUER'),
    ((105,), 'V'),
    ((107,), 'Z'),
    ((109,), 'Z'),
    ((110, 111, 112, 113, 114, 115, 116), 'DAUER'),
    ((117,), 'V'),
    ((119,), 'Z'),
    ((120,), 'PRÄP'),
    ((121,), 'Z'),
    ((123,), 'Z'),
    ((124,), 'V'),
    ((126, 127), 'ATTR'),
    ((128,), 'V'),
    ((130,), 'PRÄP'),
    ((131,), 'Z'),
    ((133,), 'Z'),
    ((134, 135, 136, 137, 138, 139, 140), 'ZEITP'),
    ((141,), 'PRÄP'),
    ((142,), 'Z'),
    ((143,), 'V'),
    ((145,),

In [15]:
# Tally, for every entity combination of every recipe, how many annotators
# chose each relation label. An annotator who has no relation for a
# combination counts towards "ARGNone".
for recipe_text, counts_df in dict_with_dfs.items():     # loop through all recipes

    # Start from zero so that re-running this cell does not add the same votes
    # a second time on top of the counts left by the previous run.
    counts_df.iloc[:, 1:] = 0.0

    # Column 0 holds the (head_tokens, child_tokens) tuples; read them once.
    ent_combs = counts_df.iloc[:, 0].tolist()

    for annotator_dict in ov_all_dicts:

        # element 1 of the per-recipe tuple is the {(head, child): label} dict
        rels_dict = annotator_dict[recipe_text][1]

        for row_idx, ent_comb in enumerate(ent_combs):

            # A label without a matching column raises KeyError in get_loc.
            label = rels_dict.get(ent_comb, "ARGNone")
            col_idx = counts_df.columns.get_loc(label)

            counts_df.iloc[row_idx, col_idx] += 1
        
        

In [16]:
# Stack the per-recipe vote tables into one table with a fresh running index.
only_dfs = list(dict_with_dfs.values())

merged_df = pd.concat(only_dfs, ignore_index=True)
merged_df

Unnamed: 0,EntComb,ARG0,ARG1,ARG,ARGNone
0,"((83,), (11,))",0.0,0.0,0.0,4.0
1,"((83,), (63,))",0.0,0.0,0.0,4.0
2,"((83,), (38, 39, 40))",0.0,0.0,0.0,4.0
3,"((83,), (47,))",0.0,0.0,0.0,4.0
4,"((83,), (76, 77, 78, 79))",0.0,0.0,0.0,4.0
...,...,...,...,...,...
67345,"((43,), (116,))",0.0,0.0,0.0,4.0
67346,"((43,), (131,))",0.0,0.0,0.0,4.0
67347,"((43,), (132,))",0.0,0.0,0.0,4.0
67348,"((43,), (31,))",0.0,0.0,0.0,4.0


In [17]:
# Keep only the vote-count columns; the entity-combination column is removed.
merged_df_dropped = merged_df.drop(columns=["EntComb"])
merged_df_dropped

Unnamed: 0,ARG0,ARG1,ARG,ARGNone
0,0.0,0.0,0.0,4.0
1,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,4.0
3,0.0,0.0,0.0,4.0
4,0.0,0.0,0.0,4.0
...,...,...,...,...
67345,0.0,0.0,0.0,4.0
67346,0.0,0.0,0.0,4.0
67347,0.0,0.0,0.0,4.0
67348,0.0,0.0,0.0,4.0


In [18]:
# Calculate Fleiss' kappa over all relation labels at once.
# Each row of merged_df_dropped is one entity combination; each column holds
# the number of annotators that chose that label for it.
from statsmodels.stats.inter_rater import fleiss_kappa

kappa = fleiss_kappa(merged_df_dropped, method="fleiss")
print(f"Overall: {round(kappa, 3)}")

Overall: 0.963


In [19]:
# Calculate kappa for individual relations.
# For each relation label the ratings are collapsed to two categories: that
# label versus everything else. Every annotator casts exactly one vote per
# row, so the "everything else" count is the number of annotators minus the
# votes for the label. This is computed for the whole column at once instead
# of patching rows one by one.
rels = ["ARG0", "ARG1", "ARG"]
n_annotators = len(ov_all_dicts)
for rel in rels:
    rel_votes = merged_df_dropped[rel]
    # A fresh frame per relation; merged_df_dropped itself is never modified.
    rel_table = pd.DataFrame({rel: rel_votes, "ARGNone": n_annotators - rel_votes})
    kap = fleiss_kappa(rel_table)
    print(f"{rel}: {round(kap, 3)}")

ARG0: 0.929
ARG1: 0.976
ARG: 0.993


In [81]:
# Re-run of the entity-agreement computation from the earlier cell that first
# builds matching_entities_per_rec; it prints the entity total, the matched
# total and their ratio, followed by a German caption for those numbers.
matching_entities_per_rec = matching_entities_set(ov_all_dicts)
print("Übereinstimmung entities")

Recipe wasn't annotated by all.
Recipe wasn't annotated by all.
5428
5088
0.9373618275607959
Übereinstimmung entities
