In [34]:
import spacy
from spacy.tokens import DocBin
# File name of the serialized DocBin that is read from disk in a later cell.
path = "ned.spacy"
# spaCy pipeline; its vocab is used to rebuild the stored docs, and it is also
# called on their raw text further down to produce a second set of annotations.
nlp = spacy.load("nl_core_news_md")

In [2]:
# Deserialize the DocBin stored at `path`.
docbin = DocBin().from_disk(path)

In [3]:
# Materialize every doc held by the DocBin as a list, using the loaded pipeline's vocab.
docs = list(docbin.get_docs(nlp.vocab))

In [116]:
def relabel(ent_list: list):
    """
    Translate the labels of (text, label) entries into the PER / ORG / LOC / MISC label set.

    Entries whose label is numeric or temporal (CARDINAL, ORDINAL, DATE, PERCENT,
    QUANTITY, TIME, MONEY) are dropped. Every remaining label must have a mapping;
    an unmapped label raises KeyError. The input order is preserved.
    """
    dropped_labels = {"CARDINAL", "ORDINAL", "DATE", "PERCENT", "QUANTITY", "TIME", "MONEY"}
    target_label = {
        # labels that are renamed
        "PERSON": "PER",
        "COMPANY": "ORG",
        "GPE": "LOC",
        # labels that collapse into MISC
        "EVENT": "MISC",
        "FAC": "MISC",
        "LANGUAGE": "MISC",
        "LAW": "MISC",
        "NORP": "MISC",
        "PRODUCT": "MISC",
        "WORK_OF_ART": "MISC",
        # labels that are already in the target set
        "MISC": "MISC",
        "PER": "PER",
        "ORG": "ORG",
        "LOC": "LOC",
    }

    relabeled = []
    for entry in ent_list:
        text, label = entry[0], entry[1]
        if label in dropped_labels:
            continue
        relabeled.append((text, target_label[label]))
    return relabeled

def ent_tup(ent):
    """
    Reduce an entity to the 2-tuple (text, label).

    The text is read from ent.text and the label string from ent.label_.
    """
    text = ent.text
    label = ent.label_
    return text, label

def ent_list(doc):
    """
    Collect the entities in doc.ents as (text, label) tuples and pass them through relabel().

    The result is whatever relabel() returns for those tuples.
    """
    return relabel([ent_tup(span) for span in doc.ents])

def score(docs: list, truth: list):
    """
    Compute entity-level precision, recall and F1 of docs, using truth as ground truth.

    docs[i] is compared with truth[i]. Both are reduced to (text, label) tuples by
    ent_list(), so the comparison is made on the relabeled entities.

    Each predicted entity is aligned with the first gold entity of the same document
    that has not yet been credited to an earlier prediction and whose text contains,
    or is contained in, the predicted text:

    * labels equal     -> true positive; the gold entity is marked as found
    * labels differ    -> false positive; the gold entity stays unfound
    * no such entity   -> false positive (spurious prediction)

    Gold entities that are never marked as found are false negatives. Matching is
    done per document, so a hit in one document never hides a miss in another.

    Returns a dict with the keys "precision", "recall" and "f1". A ratio whose
    denominator is zero is reported as 0.0 instead of raising ZeroDivisionError.
    Raises IndexError if truth has fewer items than docs.
    """
    tp = 0
    fp = 0
    fn = 0

    for i, doc in enumerate(docs):
        pred = ent_list(doc)
        true = ent_list(truth[i])
        # found[j] is True once gold entity j has been credited to a prediction.
        found = [False] * len(true)

        for pred_text, pred_label in pred:
            for j, (text, label) in enumerate(true):
                if found[j]:
                    # Already matched; lets repeated mentions pair up one-to-one.
                    continue
                if pred_text in text or text in pred_text:
                    if pred_label == label:
                        found[j] = True
                        tp += 1
                    else:
                        fp += 1
                    break
            else:
                # No overlapping gold entity at all: a spurious prediction.
                fp += 1

        fn += found.count(False)

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * (recall * precision) / (recall + precision) if recall + precision else 0.0
    return {"precision" : precision, "recall" : recall, "f1" : f1}

# score() treats its first argument as the predictions and its second as the
# ground truth, so the stored annotations are evaluated here against the
# pipeline's own output on the same text.
# NOTE(review): confirm this argument order is the intended one.
testdocs = docs
# nlp.pipe streams the texts through the pipeline in batches, which is much
# faster than calling nlp() once per document.
otherdocs = list(nlp.pipe(doc.text for doc in testdocs))
score(testdocs, otherdocs)

KeyboardInterrupt: 

In [118]:
# Run the loaded pipeline on the raw text of the stored doc at index 3.
show = nlp(docs[3].text)

In [119]:
import skweak
# Presumably renders the entities of `show` in the cell output — confirm against the skweak docs.
skweak.utils.display_entities(show)