In [6]:
import glob
import spacy
import json
import sys
import os
from pathlib import Path
from collections import namedtuple
from spacy.tokens.token import Token
from typing import List

Annotation = namedtuple('Annotation', 'start end label')
TokenRange = namedtuple('TokenRange', 'start end')
LabeledTokenSpan = namedtuple('LabeledTokenSpan', 'start end label annotator doc')

def annoation2tokenrange(annotation: Annotation,
                         tokens: List[Token]) -> TokenRange:
    start = next(t for t in tokens if start_is_close(
        t, annotation.start, 1))
    end = next(t for t in tokens if end_is_close(
        t, annotation.end, 1))
    return TokenRange(start.i, end.i + 1)


def start_is_close(token: Token, offset: int, N_fudge_characters: int) -> bool:
    acceptable_range = range(offset - N_fudge_characters,
                             offset + N_fudge_characters + 1)
    return token.idx in acceptable_range


def end_is_close(token: Token, offset: int, N_fudge_characters: int) -> bool:
    acceptable_range = range(offset - N_fudge_characters,
                             offset + N_fudge_characters + 1)
    return token.idx + len(token) in acceptable_range


def doccano_output2_annotation(doccanooutput):
    if type(doccanooutput) == list: # if annoation is just entities/spans
        start_offset = doccanooutput[0]
        end_offset = doccanooutput[1]
        label = doccanooutput[2]
    if type(doccanooutput) == dict: # if annoation is entities and relations
        start_offset = doccanooutput['start_offset']
        end_offset = doccanooutput['end_offset']
        label = doccanooutput['label']

    return Annotation(start_offset, 
                      end_offset,
                      label)

def labels2token_spans(labels: list,
                       tokens: List[Token],
                       annotator: str,
                       id_: str) -> List[LabeledTokenSpan]:
    output: List[LabeledTokenSpan] = []
    for label in labels:
        annotation = doccano_output2_annotation(label)
        if can_resolve(annotation=annotation, tokens=tokens):
            indexes = annoation2tokenrange(annotation, tokens)
            span = LabeledTokenSpan(
                indexes.start, indexes.end, label=annotation.label, annotator=annotator, doc=id_)
            output.append(span)
        else:
            sys.stderr.write("issue")
    return output


def token2label(token: Token,
                labeled_token_spans: List[LabeledTokenSpan]) -> str:
    for span in labeled_token_spans:
        if token.i in range(span.start, span.end):
            return span.label
    return "O"


def can_resolve(annotation: Annotation,
                tokens: List[Token]) -> bool:
    if not any(t for t in tokens if start_is_close(t, annotation.start, 1)):
        return False
    if not any(t for t in tokens if end_is_close(t, annotation.end, 1)):
        return False
    return True

for dir_ in ["sean", "oren", "analysis", "tmp", "sharmila"]:
    for fn in Path(dir_).iterdir():
        os.remove(fn)

nlp = spacy.load("en_core_web_sm")
for fn in glob.glob("*jsonl"):
    with open(fn, "r") as inf:
        lines = [i for i in inf]
        lines = lines[0:50]
        for i in lines:
            line = json.loads(i)
            id_ = line['id']
            doc = nlp(line['text'])
            labels = line['entities']
            labeled_token_spans = labels2token_spans(labels, [o for o in doc], annotator=fn, id_=id_)
            connl_file = fn.replace(".jsonl", "") + "/" + id_ + ".connl"
            with open(connl_file.replace(" ", "_"), "w") as of:
                for token in doc:
                    label = token2label(token, labeled_token_spans)
                    of.write(f"{token},{label}\n")

issueissueissueissueissueissueissueissueissue

In [7]:
files = []

anno1 = "sharmila"
anno2 = "sean"

for fn in Path(anno1).iterdir():
    dir_name, filename = str(fn).split("/")
    files.append(filename)
files.sort()

both_positive = 0
both_negative = 0
oren_pos_sean_neg = 0
oren_neg_sean_pos = 0
total = 0

flag_list = []

for f in files:
    seanfn = f"{anno2}/" + f
    orenfn = f"{anno1}/" + f
    orentokens = []
    seantokens = []
    with open(orenfn, "r") as inf:
        for i in inf:
            orentokens.append(i.replace("\n", ""))
    with open(seanfn, "r") as inf:
        for i in inf:
            seantokens.append(i.replace("\n", ""))

    flag = False
    for ino, oren_token in enumerate(orentokens):
        sean_token = seantokens[ino]
        try:
            oren_word, oren_label = oren_token.split(",")
            sean_word, sean_label = sean_token.split(",")
            if oren_label == sean_label == "O":
                both_negative += 1
            if oren_label == sean_label != "O":
                both_positive += 1
            if oren_label != sean_label and oren_label != "O":
                oren_pos_sean_neg += 1
                flag = True
            if oren_label != sean_label and oren_label == "O":
                oren_neg_sean_pos += 1
                flag = True
            total += 1
            #print(oren_token, sean_token)
        except ValueError:
            pass
            # pass, this is what happens when there are commas
            
    if flag:
        flag_list.append(f)
assert both_positive + both_negative + oren_pos_sean_neg + oren_neg_sean_pos == total

In [8]:
2 * both_positive / ((2 * both_positive)  + oren_pos_sean_neg + oren_neg_sean_pos)

0.4827586206896552

In [9]:
import os
for flagged in flag_list:
    cmd = f"paste {anno1}/{flagged} {anno2}/{flagged} > tmp/{flagged}"
    os.system(cmd)

In [15]:
    
for fn in Path("tmp").iterdir():
    lines = [o for o in open(str(fn))]
    agreement = []
    for l in lines:
        a, b = l.replace("\n","").split("\t")
        agreement.append(a ==  b)
    with open("analysis/" + fn.name + ".tsv", "w") as of:
        of.write(f"{anno1}\t tag \t {anno2} \t tag \t agreement \t comment \n")
        for line, agreement in zip(lines, agreement):
            line = line.replace('\n', '').replace(",", "\t")
            line = line + '\t' + str(agreement) + "\t" + " "
            of.write(line + "\n")

In [14]:
! cat analysis/Akobeng.json.connl.tsv 

sharmila	 tag 	 sean tag 	 agreement 	 comment 
This	O	This	O	True	 
paper	O	paper	O	True	 
uses	O	uses	O	True	 
a	O	a	O	True	 
nationally	O	nationally	O	True	 
representative	O	representative	O	True	 
household	O	household	O	True	 
pseudo	O	pseudo	O	True	 
-	O	-	O	True	 
panel	O	panel	O	True	 
dataset	O	dataset	O	True	 
for	O	for	O	True	 
Ghana	O	Ghana	O	True	 
		O			O	True	 
a	O	a	O	True	 
rain	O	rain	O	True	 
-	O	-	O	True	 
fed	O	fed	O	True	 
agriculture	O	agriculture	O	True	 
economy	O	economy	O	True	 
		O			O	True	 
to	O	to	O	True	 
investigate	O	investigate	O	True	 
whether	O	whether	O	True	 
there	O	there	O	True	 
is	O	is	O	True	 
a	O	a	O	True	 
positive	O	positive	O	True	 
relationship	O	relationship	O	True	 
between	O	between	O	True	 
rainfall	O	rainfall	O	True	 
-	O	-	O	True	 
driven	O	driven	O	True	 
agricultural	O	agricultural	O	True	 
income	O	income	O	True	 
and	O	and	O	True	 
household	O	household	O	True	 
per	O	per	O	True	 
capita	

True