In [1]:
import jsonlines
import spacy
import spacy.language
from spacy.tokens import Doc
from spacy.scorer import Scorer
from spacy.vocab import Vocab
import statsmodels
import pandas as pd

In [40]:
# path to jsonl overlap files
path_coco = "/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_coco.jsonl"
path_graf = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_leo.jsonl"
path_hoff = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jona.jsonl"
path_jthn = r"/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding/01-Data/20_overlap/overlap_total/overlap_jonathan.jsonl"

In [41]:
def jsonl_to_list(path):
    """takes path to jsonl file and returns list of dicts"""
    
    with jsonlines.open(path) as reader:
        list_of_dicts = list(reader)

    return list_of_dicts

In [42]:
def label_per_token(recipe):
    """takes annotated recipes as input and returns a dict that maps label to every token"""

    amount_tokens = len(recipe["tokens"])

    all_token_dict = { tuple(range(token["start"], token["end"]+1)): {"ent_label" : "None", "rel_label": "ARGNone"} for token in recipe["tokens"]}    # each token as list and value "None"

    for span in recipe["spans"]:

        all_chars = list(range(span["start"], span["end"]+1))

        label = span["label"]

        for char in all_chars:              #compare all characters with all token characters
            for tok in all_token_dict.keys():
                if char in tok: 
                    all_token_dict[tok]["ent_label"] = label
    
    for relation in recipe["relations"]: 

        all_chars2 = list(range(relation["child_span"]["start"], relation["child_span"]["end"]+1))
        print(all_chars2)

        label = relation["label"]

        for char2 in all_chars2: 
            for tok in all_token_dict.keys():
                if char2 in tok: 
                    all_token_dict[tok]["rel_label"] = label
    
    #print(f"Amount tokens: {amount_tokens} vs. Length dict: {len(all_token_dict.keys())}")

    return all_token_dict #{token: {ent_label : label, rel_label : label}, ...}

In [43]:
def token_table_per_recipe(recipe):
    """takes example annotated recipe and creates empty dict with token_start_char as indices and ent classes as column labels"""

    columns = ["Tokens", "Z", "TOOL", "V", "ATTR", "PRÄP", "ZEITP", "DAUER", "TEMP", "None", "ARG0", "ARG1", "ARG", "ARGNone"]

    token_table = pd.DataFrame(columns=columns)

    toks = [tuple(range(token["start"], token["end"]+1)) for token in recipe["tokens"]]

    token_table["Tokens"] = toks

    token_table.fillna(0, inplace=True)

    return token_table  #pd style table

In [44]:
def calculate_kappa(table):
    """takes table with tokens and labels as input and returns kappa"""

    from statsmodels.stats.inter_rater import fleiss_kappa 

    
    kappa = statsmodels.stats.inter_rater.fleiss_kappa(table, method="fleiss")


    return kappa

In [45]:
# list of individual overlap dicts
ov_recipes_coco = jsonl_to_list(path_coco)
ov_recipes_graf = jsonl_to_list(path_graf)
ov_recipes_hoff = jsonl_to_list(path_hoff)
ov_recipes_jthn = jsonl_to_list(path_jthn)

In [46]:
# create dict for each annotator: key=text of recipe, value = ent_set
ov_dict_coco = {example["text"] : label_per_token(example) for example in ov_recipes_coco}
ov_dict_graf = {example["text"] : label_per_token(example) for example in ov_recipes_graf}
ov_dict_hoff = {example["text"] : label_per_token(example) for example in ov_recipes_hoff}
ov_dict_jthn = {example["text"] : label_per_token(example) for example in ov_recipes_jthn}

[27, 28, 29]
[40, 41, 42, 43, 44, 45, 46, 47, 48]
[49, 50, 51, 52, 53, 54, 55]
[66, 67, 68, 69, 70, 71, 72, 73, 74, 75]
[77, 78, 79, 80, 81, 82, 83]
[85, 86, 87, 88, 89, 90, 91, 92]
[97, 98, 99, 100, 101, 102, 103]
[116, 117, 118, 119, 120, 121, 122, 123, 124, 125]
[66, 67, 68, 69, 70, 71, 72, 73, 74, 75]
[40, 41, 42, 43, 44, 45, 46, 47, 48]
[143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184]
[143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184]
[215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225]
[242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264]
[270, 271, 272, 273, 274]
[266, 267, 268, 269]
[143, 144, 145,

[115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131]
[132, 133, 134, 135, 136]
[138, 139, 140, 141, 142, 143, 144, 145, 146]
[160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177]
[178, 179, 180, 181]
[182, 183, 184, 185, 186, 187]
[198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214]
[182, 183, 184, 185, 186, 187]
[230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247]
[266, 267, 268, 269, 270, 271]
[272, 273, 274, 275, 276, 277]
[318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333]
[295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313]
[295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313]
[318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333]
[295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309,

[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]
[4, 5, 6, 7, 8, 9, 10, 11, 12]
[0, 1, 2, 3]
[73, 74, 75, 76, 77, 78, 79, 80, 81]
[91, 92, 93]
[87, 88, 89, 90]
[82, 83, 84, 85, 86]
[119, 120, 121, 122, 123, 124, 125, 126, 127]
[73, 74, 75, 76, 77, 78, 79, 80, 81]
[134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156]
[134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156]
[185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198]
[178, 179, 180, 181, 182, 183, 184]
[168, 169, 170, 171]
[217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227]
[228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239]
[217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227]
[267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278]
[260, 261, 262]
[3

[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
[0, 1, 2, 3, 4, 5]
[6, 7, 8]
[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58]
[68, 69, 70, 71, 72]
[73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]
[59, 60, 61]
[100, 101, 102, 103, 104, 105, 106]
[100, 101, 102, 103, 104, 105, 106]
[100, 101, 102, 103, 104, 105, 106]
[130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172]
[181, 182, 183, 184, 185, 186, 187, 188, 189]
[190, 191, 192, 193, 194, 195, 196, 197, 198, 199]
[181, 182, 183, 184, 185, 186, 187, 188, 189]
[250, 251, 252, 253, 254, 255, 256]
[234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249]
[230, 231, 232, 233]
[223, 224, 225, 226, 227, 228, 229]
[214, 215, 216]
[272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285]
[301, 302, 

[139, 140, 141, 142, 143, 144, 145, 146]
[176, 177, 178, 179, 180, 181, 182, 183, 184, 185]
[199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209]
[211, 212, 213, 214, 215, 216, 217, 218, 219]
[221, 222, 223, 224, 225, 226, 227, 228, 229]
[231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243]
[250, 251, 252, 253, 254]
[259, 260, 261, 262, 263, 264, 265, 266]
[221, 222, 223, 224, 225, 226, 227, 228, 229]
[231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243]
[199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209]
[211, 212, 213, 214, 215, 216, 217, 218, 219]
[318, 319, 320, 321, 322, 323, 324, 325]
[330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342]
[281, 282, 283, 284, 285, 286, 287, 288, 289, 290]
[291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301]
[357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372]
[429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441]
[410, 411, 412, 413, 414, 415]
[406, 407, 408, 409]


[471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483]
[538, 539, 540, 541, 542, 543, 544, 545]
[538, 539, 540, 541, 542, 543, 544, 545]
[169, 170, 171, 172]
[159, 160, 161]
[190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205]
[162, 163, 164, 165, 166, 167, 168]
[177, 178, 179, 180, 181]
[221, 222, 223, 224]
[225, 226, 227, 228, 229, 230, 231, 232]
[234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248]
[250, 251, 252, 253, 254, 255, 256, 257, 258]
[260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272]
[277, 278, 279, 280, 281, 282, 283, 284, 285]
[322, 323, 324, 325, 326, 327, 328, 329, 330]
[314, 315, 316]
[351, 352, 353, 354, 355, 356, 357, 358]
[382, 383, 384, 385, 386, 387]
[421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431]
[406, 407, 408]
[396, 397, 398, 399, 400, 401, 402, 403, 404, 405]
[409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420]
[484, 485, 486, 487]
[492, 493, 494, 495, 496, 497, 498, 4

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[23, 24, 25, 26, 27, 28, 29, 30, 31]
[46, 47, 48, 49, 50, 51, 52, 53, 54]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[43, 44, 45]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[63, 64, 65, 66]
[71, 72, 73, 74, 75, 76, 77]
[97, 98, 99, 100, 101, 102, 103]
[86, 87, 88, 89, 90, 91]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[121, 122, 123, 124]
[134, 135, 136, 137, 138, 139, 140, 141]
[125, 126, 127, 128, 129]
[163, 164, 165, 166, 167, 168, 169, 170, 171, 172]
[150, 151, 152, 153, 154, 155, 156, 157, 158]
[181, 182, 183]
[184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205]
[150, 151, 152, 153, 154, 155, 156, 157, 158]
[163, 164, 165, 166, 167, 168, 169, 170, 171, 172]
[217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227]
[217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227]
[217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227]
[252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 2

[4, 5, 6, 7, 8, 9, 10, 11]
[12, 13, 14]
[21, 22, 23, 24, 25, 26, 27]
[50, 51, 52, 53, 54, 55, 56, 57]
[58, 59, 60, 61]
[37, 38, 39, 40, 41, 42, 43, 44, 45]
[81, 82, 83, 84, 85, 86, 87]
[72, 73, 74, 75, 76]
[50, 51, 52, 53, 54, 55, 56, 57]
[97, 98, 99, 100]
[101, 102, 103, 104, 105, 106, 107, 108]
[113, 114, 115, 116, 117]
[129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143]
[148, 149, 150, 151, 152, 153, 154]
[125, 126, 127, 128]
[50, 51, 52, 53, 54, 55, 56, 57]
[176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200]
[163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175]
[214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226]
[50, 51, 52, 53, 54, 55, 56, 57]
[240, 241, 242, 243, 244, 245, 246, 247, 248, 249]
[250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260]
[261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278]
[299, 300, 301, 302, 

[349, 350, 351, 352, 353]
[299, 300, 301, 302, 303, 304, 305, 306]
[387, 388, 389, 390, 391, 392]
[383, 384, 385, 386]
[370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382]
[404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416]
[404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416]
[439, 440, 441, 442, 443]
[451, 452, 453, 454, 455]
[444, 445, 446]
[451, 452, 453, 454, 455]
[468, 469, 470, 471, 472]
[477, 478, 479, 480, 481, 482, 483, 484]
[464, 465, 466, 467]
[497, 498, 499, 500, 501, 502, 503]
[512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523]
[504, 505, 506]
[531, 532, 533, 534, 535, 536]
[531, 532, 533, 534, 535, 536]
[566, 567, 568, 569, 570, 571, 572]
[558, 559, 560, 561]
[580, 581, 582, 583, 584]
[597, 598, 599, 600, 601, 602, 603, 604]
[585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596]
[531, 532, 533, 534, 535, 536]
[618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639]

[150, 151, 152, 153, 154, 155, 156, 157, 158]
[163, 164, 165, 166, 167, 168, 169, 170, 171, 172]
[150, 151, 152, 153, 154, 155, 156, 157, 158]
[163, 164, 165, 166, 167, 168, 169, 170, 171, 172]
[184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205]
[181, 182, 183]
[217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227]
[217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227]
[217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227]
[252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263]
[265, 266, 267, 268, 269, 270, 271, 272, 273]
[275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285]
[287, 288, 289, 290, 291, 292, 293, 294]
[296, 297, 298, 299, 300]
[305, 306, 307, 308, 309, 310, 311, 312]
[313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324]
[248, 249, 250, 251]
[337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347]
[356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367]
[348, 349, 350]
[337, 338, 339, 340, 341,

[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
[49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87]
[23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
[41, 42, 43]
[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
[113, 114, 115, 116, 117, 118, 119, 120, 121]
[122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141]
[156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166]
[156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166]
[191, 192, 193, 194, 195, 196, 197, 198, 199]
[183, 184, 185]
[207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217]
[219, 220, 221, 222, 223]
[225, 226, 227, 228, 229, 230, 231, 232, 233, 234]
[236, 237, 238, 239, 240, 241, 242, 243]
[245, 246, 247, 248, 249]
[254, 255, 256, 257, 258, 259, 260, 261]
[277, 278, 279, 280, 281, 282]
[283, 284, 285, 286, 2

[754, 755, 756, 757, 758]
[746, 747, 748, 749]
[728, 729, 730, 731, 732, 733, 734]
[770, 771, 772, 773, 774, 775, 776, 777]
[779, 780, 781, 782, 783, 784, 785]
[787, 788, 789, 790, 791, 792, 793]
[795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806]
[808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829]
[834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857]
[858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869]
[870, 871, 872]
[877, 878, 879, 880, 881, 882, 883]
[1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028]
[981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993]
[1013, 1014, 1015]
[998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012]
[958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976]
[1043, 1044, 1045, 1046, 1047]
[1035, 1036, 1037, 1038]
[877, 878, 879, 8

[294, 295, 296, 297]
[311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324]
[413, 414, 415, 416, 417, 418, 419, 420, 421, 422]
[404, 405, 406]
[349, 350, 351, 352, 353]
[342, 343, 344, 345, 346, 347, 348]
[342, 343, 344, 345, 346, 347, 348]
[474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493]
[456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473]
[342, 343, 344, 345, 346, 347, 348]
[438, 439, 440, 441]
[430, 431, 432, 433]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[15, 16, 17, 18, 19, 20, 21]
[11, 12, 13, 14]
[28, 29, 30, 31, 32, 33, 34, 35]
[36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51]
[28, 29, 30, 31, 32, 33, 34, 35]
[75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]
[87, 88, 89, 90, 91, 92, 93, 94, 95, 96]
[97, 98, 99, 100, 101, 102, 103, 104, 105, 106]
[115, 116, 117, 118, 119, 120]
[138, 139, 140, 141, 142, 143, 144, 145]
[121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 

[0, 1, 2, 3, 4]
[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
[5, 6, 7]
[0, 1, 2, 3, 4]
[65, 66, 67, 68, 69, 70, 71, 72, 73]
[56, 57, 58, 59, 60]
[50, 51, 52, 53, 54]
[35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45]
[46, 47, 48, 49]
[32, 33, 34]
[0, 1, 2, 3, 4]
[84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94]
[0, 1, 2, 3, 4]
[125, 126, 127, 128, 129]
[130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157]
[125, 126, 127, 128, 129]
[175, 176, 177, 178, 179]
[125, 126, 127, 128, 129]
[194, 195, 196, 197, 198]
[125, 126, 127, 128, 129]
[217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234]
[209, 210, 211, 212]
[242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253]
[262, 263, 264, 265, 266, 267, 268, 269]
[254, 255, 256, 257]
[242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253]
[282, 283, 284, 285, 286, 287, 288, 289, 290]
[303, 304, 305, 306, 307, 308, 30

In [47]:
ov_all_dicts = [ov_dict_coco, ov_dict_jthn, ov_dict_graf, ov_dict_hoff]
annot_names = ["Coco", "Giov", "Graf", "Hoff"]

In [48]:
#create list with empty df per recipe 
recipe_table_dict = {example["text"] : token_table_per_recipe(example) for example in ov_recipes_coco}

In [49]:
ov_dict_coco

{'Für die Barbecue Sauce das Öl erhitzen, Zwiebeln glasig anbraten. Knoblauch, Nelken, Tabasco und Kümmel zugeben und 2 Minuten brutzeln lassen. Alle anderen Zutaten (außer den Rippchen) zugeben, aufkochen lassen und 20 Minuten köcheln lassen, bis die Sauce eindickt. Vom Herd nehmen und alles durch ein Sieb passieren.Bei normalen, dünnen Rippchen, diese kurz auf dem Grill anbraten und dann großzügig nach jedem wenden mit der Sauce einpinseln. Die entstehende Kruste darf ruhig etwas schwarz werden, schmeckt extrem lecker. Wichtig ist es, die Rippchen in Bewegung zu halten und im Minutentakt zu wenden und einzupinseln!Dicke Rippchen vor dem Einpinseln länger vorbraten. Oder vorher in heißem Wasser, nicht kochendem, 30 Minuten bis eine Stunde ziehen lassen.Dazu passt ein guter schwäbischer Kartoffelsalat und jede Menge kaltes Bier.': {(0,
   1,
   2,
   3): {'ent_label': 'None', 'rel_label': 'ARGNone'},
  (4, 5, 6, 7): {'ent_label': 'None', 'rel_label': 'ARGNone'},
  (8, 9, 10, 11, 12, 13

In [50]:
for person in ov_all_dicts:                 #loop through all annotators
    for example_recipe in person.keys():    #loop through all recipes of each annotator example_recipe = text
        
        try: 
            dataframe = recipe_table_dict[example_recipe]

            for key in person[example_recipe].keys():  #loop through label_per_tok
                        
                    i = dataframe.index[dataframe["Tokens"] == key]
                    col_i = dataframe.columns.get_loc(person[example_recipe][key]["ent_label"])
                    dataframe.iloc[i, col_i] += 1
                    
                    col_i2 = dataframe.columns.get_loc(person[example_recipe][key]["rel_label"])
                    dataframe.iloc[i, col_i2] += 1
                    

        except: 
            pass
            #print("Recipe was skipped.")


In [101]:
total_len = 0
for rec in recipe_table_dict.values():
    total_len += len(rec)

print(total_len)

3198


In [52]:
lst = []
for recipe in recipe_table_dict.values():
    if recipe[["Z", "TOOL", "V", "ATTR", "PRÄP", "ZEITP", "DAUER", "TEMP", "None"]].iloc[0].sum() == 4:
        lst.append(recipe)
        print
    else:
        print("skip")
print(len(lst))

skip
skip
25


In [102]:
le_t = 0 
for rec in lst:
    le_t += len(rec)

print(le_t)

2945


In [53]:
# concat
df_con = pd.concat(lst, ignore_index=True)

In [96]:
# drop column
df_con_dropped = df_con.drop(["Tokens"], axis=1)

In [86]:
# ner df
df_ner = df_con_dropped[["Z", "TOOL", "V", "ATTR", "PRÄP", "ZEITP", "DAUER", "TEMP", "None"]]

In [87]:
# token-level kappa (NER)
from statsmodels.stats.inter_rater import fleiss_kappa

kappa = fleiss_kappa(df_ner, method="fleiss")
print(f"Overall: {round(kappa, 3)}")

Overall: 0.95


In [88]:
# rel df
df_rel = df_con_dropped[["ARG0", "ARG1", "ARG", "ARGNone"]]

In [89]:
# token-level kappa (REL unfiltered)
kappa = fleiss_kappa(df_rel, method="fleiss")
print(f"Overall: {round(kappa, 3)}")

Overall: 0.938


In [97]:
# drop uneinige
filt = df_con_dropped.iloc[:, 0:9].max(axis=1) < 4
df_rel_filtered = df_con_dropped.drop(df_con_dropped[filt].index)
df_rel_filtered2 = df_rel_filtered[["ARG0", "ARG1", "ARG", "ARGNone"]]

In [90]:
# token-level kappa (REL filtered)

kappa = fleiss_kappa(df_rel_filtered2, method="fleiss")
print(f"Overall: {round(kappa, 3)}")

Overall: 0.979


In [103]:
agreement_by_all = len(df_rel_filtered2)
annotated_by_all = 2945 

percent_agreement = agreement_by_all / annotated_by_all
percent_agreement

0.9273344651952462

In [95]:
import numpy as np
rels = ["ARG0", "ARG1", "ARG"]
for rel in rels: 
    df2 = df_rel_filtered2.iloc[:, np.r_[df_rel_filtered2.columns.get_loc(rel), df_rel_filtered2.columns.get_loc("ARGNone")]]
    #print(df2)
    for i in range(len(df2)):
        if df2.iloc[i, :].sum() != 4:
            missing = 4 - (df2.iloc[i, 0] + df2.iloc[i, 1])
            df2.iloc[i, 1] += missing
    #print(df2)
    kap = fleiss_kappa(df2)
    print(f"{rel}: {round(kap, 3)}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.iloc[i, 1] += missing


ARG0: 0.964
ARG1: 0.935
ARG: 0.989
