In [56]:
import jsonlines
import spacy
import spacy.language
from spacy.tokens import Doc
from spacy.scorer import Scorer
from spacy.vocab import Vocab
import statsmodels
import pandas as pd
import numpy as np

In [57]:
# path to jsonl overlap files
path_coco = "/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_coco.jsonl"
path_graf = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_leo.jsonl"
path_hoff = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jona.jsonl"
path_jthn = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jonathan.jsonl"

In [58]:
def jsonl_to_list(path):
    """takes path to jsonl file and returns list of dicts"""
    
    with jsonlines.open(path) as reader:
        list_of_dicts = list(reader)

    return list_of_dicts

In [59]:
def rel_per_entcomb(recipe):
    """takes annotated recipe as input and returns dict {(ent1, ent2) : rel, (ent1, entx): ...}"""
    
    entcomb_rel_dict = {}
    
    
    for rel in recipe["relations"]:
        
        head_span = tuple(range(rel["head_span"]["token_start"], rel["head_span"]["token_end"]+1))
        
        child_span = tuple(range(rel["child_span"]["token_start"], rel["child_span"]["token_end"]+1))
        
        rel_type = rel["label"]
        
        entcomb_rel_dict[(head_span, child_span)] = rel_type
        
    print(entcomb_rel_dict)
    
    
    return entcomb_rel_dict

In [60]:
def ents_per_recipe(recipe):
    """takes annotated recipe as input and returns set of entities for recipes"""
    
    ent_set = set()
    
    for ent in recipe["spans"]:
        
        tokens = tuple(list(range(ent["token_start"], ent["token_end"]+1)))
       
        entity = tuple([tokens, ent["label"]])
        
        ent_set.add(entity)
    
    
    return ent_set

In [79]:
def matching_entities_set(list_of_annotator_dicts): 
    """takes certain recipe as input and returns dict with text as key and matching entities as set"""
    
    amount_entities = 0 
    amount_match = 0
    
    match_ent_per_recipe = {}
    recipes = list_of_annotator_dicts[0]
    
    for rec in recipes: 
        try: 
            all_sets_per_rec = [ an[rec][0] for an in list_of_annotator_dicts]

            intersec = set.intersection(*all_sets_per_rec)
            
            match_ent_per_recipe[rec] = intersec
            
            tot_ents = 0
            for rec in all_sets_per_rec: 
                tot_ents += len(rec)
                
            amount_entities += tot_ents
            amount_match += len(intersec)*4
        
        except: 
            print("Recipe wasn't annotated by all.")
    
    print(amount_entities)
    print(amount_match)
    print(amount_match/amount_entities)
    
    
    return match_ent_per_recipe

In [62]:
def ent_comb_dataframe(m_entities_dict):
    """takes matching entites dict and returns dict with recipe_text as key and df with all ent_combs as value """
    
    entcomb_per_recipe_dict = {}
    
    for rec in m_entities_dict.keys():
        
        ents = list(m_entities_dict[rec])
        
        comb_list = []
        
        for i in range(len(ents)):
            for j in range(len(ents)):
                
                if i != j: 

                    comb_list.append((ents[i][0], ents[j][0]))
                else: 
                    pass
        
        zs = np.zeros(len(comb_list))
        
        df = pd.DataFrame( {"EntComb": comb_list, "ARG0": zs, "ARG1": zs, "ARG": zs, "ARGNone": zs})

        entcomb_per_recipe_dict[rec] = df
        
        
    return entcomb_per_recipe_dict

In [63]:
# list of individual overlap dicts
ov_recipes_coco = jsonl_to_list(path_coco)
ov_recipes_graf = jsonl_to_list(path_graf)
ov_recipes_hoff = jsonl_to_list(path_hoff)
ov_recipes_jthn = jsonl_to_list(path_jthn)

In [64]:
# create dict with all_ents rel per ent_comb for each recipe for each annotator
ov_dict_coco = {example["text"] : (ents_per_recipe(example), rel_per_entcomb(example)) for example in ov_recipes_coco}
ov_dict_graf = {example["text"] : (ents_per_recipe(example), rel_per_entcomb(example)) for example in ov_recipes_graf}
ov_dict_hoff = {example["text"] : (ents_per_recipe(example), rel_per_entcomb(example)) for example in ov_recipes_hoff}
ov_dict_jthn = {example["text"] : (ents_per_recipe(example), rel_per_entcomb(example)) for example in ov_recipes_jthn}

{((6,), (5,)): 'ARG0', ((10,), (8,)): 'ARG0', ((10,), (9,)): 'ARG', ((19,), (12,)): 'ARG0', ((19,), (14,)): 'ARG0', ((19,), (16,)): 'ARG0', ((19,), (18,)): 'ARG0', ((23,), (21, 22)): 'ARG', ((23,), (12,)): 'ARG0', ((23,), (8,)): 'ARG0', ((34,), (26, 27, 28, 29, 30, 31, 32, 33)): 'ARG0', ((36,), (26, 27, 28, 29, 30, 31, 32, 33)): 'ARG0', ((41,), (39, 40)): 'ARG0', ((41,), (44, 45, 46, 47)): 'ARG', ((51,), (50,)): 'ARG1', ((51,), (49,)): 'ARG', ((51,), (26, 27, 28, 29, 30, 31, 32, 33)): 'ARG0', ((57,), (53,)): 'ARG0', ((57,), (56,)): 'ARG1', ((57,), (54,)): 'ARG', ((70,), (62, 63)): 'ARG0', ((70,), (66,)): 'ARG', ((70,), (69,)): 'ARG1', ((70,), (67,)): 'ARG', ((80,), (62, 63)): 'ARG0', ((80,), (79,)): 'ARG1', ((80,), (77,)): 'ARG', ((80,), (74, 75, 76)): 'ARG', ((80,), (73,)): 'ARG', ((80,), (72,)): 'ARG', ((104,), (100,)): 'ARG0', ((104,), (101, 102)): 'ARG', ((109,), (106, 107)): 'ARG', ((109,), (100,)): 'ARG0', ((111,), (100,)): 'ARG0'}
{((2,), (1,)): 'ARG0', ((7,), (1,)): 'ARG0', ((7

{((2,), (0,)): 'ARG0', ((2,), (1,)): 'ARG', ((18,), (10,)): 'ARG0', ((18,), (17,)): 'ARG1', ((18,), (14,)): 'ARG1', ((18,), (12,)): 'ARG', ((18,), (15,)): 'ARG', ((23,), (21, 22)): 'ARG0', ((32,), (25,)): 'ARG0', ((32,), (26, 27, 28)): 'ARG', ((32,), (29,)): 'ARG', ((32,), (30, 31)): 'ARG', ((37,), (25,)): 'ARG0', ((37,), (34, 35, 36)): 'ARG', ((46,), (41,)): 'ARG0', ((46,), (39,)): 'ARG', ((46,), (42, 43, 44, 45)): 'ARG', ((58,), (54,)): 'ARG0', ((58,), (57,)): 'ARG1', ((58,), (55,)): 'ARG', ((58,), (48, 49, 50, 51)): 'ARG', ((63,), (61,)): 'ARG0', ((63,), (62,)): 'ARG', ((66,), (61,)): 'ARG0', ((66,), (65,)): 'ARG', ((72,), (68,)): 'ARG0', ((72,), (71,)): 'ARG1', ((72,), (69,)): 'ARG', ((81,), (75,)): 'ARG0', ((81,), (76, 77, 78)): 'ARG', ((81,), (79,)): 'ARG', ((81,), (80,)): 'ARG', ((93,), (83,)): 'ARG0', ((93,), (86, 87, 88)): 'ARG1', ((93,), (84,)): 'ARG', ((93,), (92,)): 'ARG1', ((93,), (90,)): 'ARG', ((93,), (89,)): 'ARG', ((107,), (95, 96, 97, 98, 99, 100)): 'ARG0', ((107,), (

In [65]:
ov_all_dicts = [ov_dict_coco, ov_dict_jthn, ov_dict_graf, ov_dict_hoff]
annot_names = ["Coco", "Giov", "Graf", "Hoff"]

In [66]:
# create dict with matching entities between all annotators per recipe
matching_entities_per_rec = matching_entities_set(ov_all_dicts)

Recipe wasn't annotated by all.
Recipe wasn't annotated by all.


In [67]:
# create dict with ent_comb_df for all recipes (based on matching entities)
dict_with_dfs = ent_comb_dataframe(matching_entities_per_rec)
print(dict_with_dfs)

{'Kartoffeln mit Schale garen.Tomaten nach dem Häuten entkernen und würfeln. Pinienkerne ohne Fett goldbraun rösten, Salat nach dem Waschen trocken schütteln. Essig, Pesto, Kapern, Gewürze (nach Geschmack) und 6 EL Öl verrühren, beiseite stellen.Kartoffeln pellen und in Scheiben schneiden. 3/4 der Vinaigrette darüber geben, restliches Viertel über den Rucola träufeln.Doradenfilets waschen, trocknen und würzen. Die restlichen 4 EL Öl erhitzen, die Dorade auf der weißen Seite ca. 4 Min. braten. Wenden, auf der anderen Seite ca. 1 Min. fertig braten.Kartoffeln und Rucola abschmecken, mit Tomatenwürfel und Pinienkernen bestreuen und sofort mit dem Fisch servieren.':               EntComb  ARG0  ARG1  ARG  ARGNone
0      ((58,), (16,))   0.0   0.0  0.0      0.0
1      ((58,), (65,))   0.0   0.0  0.0      0.0
2      ((58,), (23,))   0.0   0.0  0.0      0.0
3      ((58,), (94,))   0.0   0.0  0.0      0.0
4     ((58,), (104,))   0.0   0.0  0.0      0.0
...               ...   ...   ...  ...   

In [68]:
ov_all_dicts

[{'Für die Barbecue Sauce das Öl erhitzen, Zwiebeln glasig anbraten. Knoblauch, Nelken, Tabasco und Kümmel zugeben und 2 Minuten brutzeln lassen. Alle anderen Zutaten (außer den Rippchen) zugeben, aufkochen lassen und 20 Minuten köcheln lassen, bis die Sauce eindickt. Vom Herd nehmen und alles durch ein Sieb passieren.Bei normalen, dünnen Rippchen, diese kurz auf dem Grill anbraten und dann großzügig nach jedem wenden mit der Sauce einpinseln. Die entstehende Kruste darf ruhig etwas schwarz werden, schmeckt extrem lecker. Wichtig ist es, die Rippchen in Bewegung zu halten und im Minutentakt zu wenden und einzupinseln!Dicke Rippchen vor dem Einpinseln länger vorbraten. Oder vorher in heißem Wasser, nicht kochendem, 30 Minuten bis eine Stunde ziehen lassen.Dazu passt ein guter schwäbischer Kartoffelsalat und jede Menge kaltes Bier.': ({((5,),
     'Z'),
    ((6,), 'V'),
    ((8,), 'Z'),
    ((9,), 'ATTR'),
    ((10,), 'V'),
    ((12,), 'Z'),
    ((14,), 'Z'),
    ((16,), 'Z'),
    ((18,)

In [69]:
# loop through each recipe in ent_comb_df dict
# loop through each annotator for each recipe (if ent_comb exists in his dict += 1, else None += 1)

for recipe_text in dict_with_dfs.keys():     # loop through all recipes
    
    for annotator_dict in ov_all_dicts: 
        
        rels_dict = annotator_dict[recipe_text][1]
        
        for i in range(len(dict_with_dfs[recipe_text])):
            
            if dict_with_dfs[recipe_text].iloc[i, 0] in rels_dict.keys():
                
                col_i = dict_with_dfs[recipe_text].columns.get_loc(rels_dict[dict_with_dfs[recipe_text].iloc[i, 0]])
                
                dict_with_dfs[recipe_text].iloc[i, col_i] += 1
                
                
            else: 
                
                dict_with_dfs[recipe_text].iloc[i, 4] += 1
        
        

In [70]:
# merge all df
only_dfs = [v for v in dict_with_dfs.values()]

merged_df = pd.concat(only_dfs, ignore_index=True)
merged_df

Unnamed: 0,EntComb,ARG0,ARG1,ARG,ARGNone
0,"((58,), (16,))",0.0,0.0,0.0,4.0
1,"((58,), (65,))",0.0,0.0,0.0,4.0
2,"((58,), (23,))",0.0,0.0,0.0,4.0
3,"((58,), (94,))",0.0,0.0,0.0,4.0
4,"((58,), (104,))",0.0,0.0,0.0,4.0
...,...,...,...,...,...
67345,"((68,), (19,))",0.0,0.0,0.0,4.0
67346,"((68,), (58,))",0.0,0.0,0.0,4.0
67347,"((68,), (76,))",0.0,0.0,0.0,4.0
67348,"((68,), (125,))",0.0,0.0,0.0,4.0


In [73]:
# drop token column
merged_df_dropped = merged_df.drop(["EntComb"], axis=1)
merged_df_dropped

Unnamed: 0,ARG0,ARG1,ARG,ARGNone
0,0.0,0.0,0.0,4.0
1,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,4.0
3,0.0,0.0,0.0,4.0
4,0.0,0.0,0.0,4.0
...,...,...,...,...
67345,0.0,0.0,0.0,4.0
67346,0.0,0.0,0.0,4.0
67347,0.0,0.0,0.0,4.0
67348,0.0,0.0,0.0,4.0


In [74]:
# calculate kappa for all relations
from statsmodels.stats.inter_rater import fleiss_kappa

kappa = fleiss_kappa(merged_df_dropped, method="fleiss")
print(f"Overall: {round(kappa, 3)}")

Overall: 0.963


In [76]:
# calculate kappa for individual relations
import numpy as np
rels = ["ARG0", "ARG1", "ARG"]
for rel in rels: 
    df2 = merged_df_dropped.iloc[:, np.r_[merged_df_dropped.columns.get_loc(rel), merged_df_dropped.columns.get_loc("ARGNone")]]
    #print(df2)
    for i in range(len(df2)):
        if df2.iloc[i, :].sum() != 4:
            missing = 4 - (df2.iloc[i, 0] + df2.iloc[i, 1])
            df2.iloc[i, 1] += missing
    #print(df2)
    kap = fleiss_kappa(df2)
    print(f"{rel}: {round(kap, 3)}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.iloc[i, 1] += missing


ARG0: 0.929
ARG1: 0.976
ARG: 0.993


In [81]:
matching_entities_per_rec = matching_entities_set(ov_all_dicts)
print("Übereinstimmung entities")

Recipe wasn't annotated by all.
Recipe wasn't annotated by all.
5428
5088
0.9373618275607959
Übereinstimmung entities
