In [1]:
import spacy
import os
import sys
import skweak
import evaluation as evn
import itertools
from skweak.spacy import ModelAnnotator
nlp = spacy.load("nl_core_news_md")

In [2]:
#TODAY I'LL TRAIN SOME SPACY MODELS - LET'S SEE... SONAR-1 AND CONLL-2002? 

#I ALSO HAVE TO MAKE A PRETRAINED CONLL-2002 XLM-R

In [3]:
from functools import reduce

def scream(strings: list):
  return [string.upper() for string in strings]

def double(strings: list):
  return [string + " " + string for string in strings]

In [4]:
funcs = [scream, double]
docs = ["hoi", "hoe", "gaat", "het"]
streams = itertools.tee(docs, len(funcs)+1)
[func(string) for func, string in zip(funcs, streams[:1])]

[['HOI', 'HOE', 'GAAT', 'HET']]

In [6]:
train = evn.to_spacy("data/ned_train.spacy")
sample = train[0]

In [5]:
def relabel(ent_label: str) -> str:
  """
  returns ConLL-2002 label of Spacy labelled entity
  """
  mappings = {"PERSON":"PER", "COMPANY":"ORG", "GPE":"LOC", 'EVENT':"MISC", 'FAC':"MISC", 'LANGUAGE':"MISC", 'LAW':"MISC", 'NORP':"MISC", 'PRODUCT':"MISC",'WORK_OF_ART':"MISC", "MISC":"MISC", "PER":"PER", "ORG":"ORG", "LOC":"LOC"}    
  exclude = {"CARDINAL", "ORDINAL", "DATE", "PERCENT", "QUANTITY", "TIME", "MONEY"}

  return mappings[ent_label] if ent_label != "" and ent_label not in exclude else None

class RelabelledModelAnnotator(ModelAnnotator):
  def __init__(self, name: str, model_path: str):
    super(RelabelledModelAnnotator, self).__init__(name, model_path)
    
  def find_spans(self, doc):
    # Create a new document (to avoid conflicting annotations)
    doc2 = self.create_new_doc(doc)
    # And run the model
    for _, proc in self.model.pipeline:
        doc2 = proc(doc2)
    # Add the annotation
    for ent in doc2.ents:
        #just put relabel function here
        yield ent.start, ent.end, relabel(ent.label_)

In [7]:
#model annotators
spacy_nl = RelabelledModelAnnotator("spacy", "nl_core_news_md")
spacy_conll = RelabelledModelAnnotator("conll", "models/conll2002_spacy")

In [8]:
#gazetteers 
geonames = skweak.gazetteers.extract_json_data("data/geonames.json")
geonames_annotator = skweak.gazetteers.GazetteerAnnotator("geonames", geonames)

nederlocs = skweak.gazetteers.extract_json_data("data/nl.json")
dutch_loc_annotator = skweak.gazetteers.GazetteerAnnotator("nederlocs", nederlocs)

crunchbase = skweak.gazetteers.extract_json_data("data/crunchbase_alt.json")
crunchbase_annotator = skweak.gazetteers.GazetteerAnnotator("crunchbase", crunchbase)

Extracting data from data/geonames.json
Populating trie for class LOC (number: 15205)
Extracting data from data/nl.json
Populating trie for class LOC (number: 22819)
Extracting data from data/crunchbase_alt.json
Populating trie for class PER (number: 1062669)
Populating trie for class ORG (number: 789205)


In [9]:
#functions 
import json
import spacy_wrapper

NAME_PREFIXES = {"-", "von", "van", "de", "di", "le", "la", "het", "'t'", "dem", "der", "den", "d'", "ter"}

class SpanGenerator:
    """Generate spans that satisfy a token-level constratint. From Lison et al. 2020"""
    
    def __init__(self, constraint, label="ENT", exceptions=("'s", "-")):
        """annotation with a constraint (on spacy tokens). Exceptions are sets of tokens that are allowed
        to violate the constraint inside the span"""
        
        self.constraint = constraint
        self.label = label
        self.exceptions = set(exceptions)
        
    def __call__(self, spacy_doc):    

        i = 0
        while i < len(spacy_doc):
            tok = spacy_doc[i]
                # We search for the longest span that satisfy the constraint
            if self.constraint(tok):
                j = i+1
                while True:
                    if j < len(spacy_doc) and self.constraint(spacy_doc[j]):
                        j += 1
                    # We relax the constraint a bit to allow genitive and dashes
                    elif j < (len(spacy_doc)-1) and spacy_doc[j].text in self.exceptions and self.constraint(spacy_doc[j+1]):
                        j += 2
                    else:
                        break

                # To avoid too many FPs, we only keep entities with at least 3 characters (excluding punctuation)
                if len(spacy_doc[i:j].text.rstrip(".")) > 2:
                    yield i, j, self.label
                i = j
            else:
                i += 1

class FullNameGenerator:
    """Search for occurrences of full person names (first name followed by at least one title token). From Lison et al. 2020"""

    def __init__(self):
        fd = open("data/first_names.json")
        self.first_names = set(json.load(fd))
        fd.close()
        self.suggest_generator = SpanGenerator(lambda x: is_likely_proper(x), 
                                               exceptions=NAME_PREFIXES)
        
    def __call__(self, spacy_doc):
        for start, end, _ in self.suggest_generator(spacy_doc):  
            # We assume full names are between 2 and 4 tokens
            if (end-start) < 2 or (end-start) > 5:
                continue
                
            elif (spacy_doc[start].text in self.first_names and spacy_doc[end-1].is_alpha 
                  and spacy_doc[end-1].is_title): 
                yield start, end, "PER"

def in_compound(tok):
    """Returns true if the spacy token is part of a compound phrase"""
    if tok.dep_=="compound":
        return True
    elif tok.i > 0 and tok.nbor(-1).dep_=="compound":
        return True
    return False
  
def is_likely_proper(tok):
    """Returns true if the spacy token is a likely proper name, based on its form."""
    if len(tok)< 2:
        return False
    
    # If the lemma is titled, just return True
    elif tok.lemma_.istitle():
        return True
       
    # Handling cases such as iPad
    elif len(tok)>2 and tok.text[0].islower() and tok.text[1].isupper() and tok.text[2:].islower():
        return True
    
    elif (tok.is_upper and tok.text not in spacy_wrapper.CURRENCY_CODES 
          and tok.text not in spacy_wrapper.NOT_NAMED_ENTITIES):
        return True
    
    # Else, check whether the surface token is titled and is not sentence-initial
    elif (tok.i > 0 and tok.is_title and not tok.is_sent_start and tok.nbor(-1).text not in {'\'', '"', '‘', '“', '”', '’'} 
          and not tok.nbor(-1).text.endswith(".")):
        return True
    return False  

fullname = FullNameGenerator()
fullname_annotator = skweak.heuristics.FunctionAnnotator("fullname_detector", fullname)

In [10]:
#testcell
test = nlp("Ik, William Bos, liep in Nederland over het Drijberse Veld wat te vinden is in Zwormertorenbrug op weg naar mijn werk bij Gemeente Amsterdam")

processed = spacy_conll(dutch_loc_annotator(geonames_annotator(crunchbase_annotator(fullname_annotator(spacy_nl(test))))))
hmm = skweak.aggregation.HMM("hmm", ["PER", "ORG", "LOC", "MISC"])
hmm.fit_and_aggregate([processed])
spans = processed.spans["hmm"]

processed.ents = spans
for span in processed.ents:
  print(span, span.label)
  
skweak.utils.display_entities(processed)

Starting iteration 1
Finished E-step with 1 documents
Starting iteration 2
Finished E-step with 1 documents
Starting iteration 3
Finished E-step with 1 documents
Starting iteration 4
Finished E-step with 1 documents
William Bos 4317129024397789502
Nederland 385
Drijberse Veld 385
Zwormertorenbrug 385
Gemeente Amsterdam 385


         1         -59.0496             +nan
         2         -54.4113          +4.6383
         3         -52.0182          +2.3932
         4         -51.9985          +0.0197


In [45]:

from datasets import Dataset, ClassLabel, Sequence
doc = nlp(sample.text)

def name_detector_fn(doc):
    names = ["David", "Cameron", "William", "Bos"]
    for token in doc: 
        if token.text in names:
            yield token.i, token.i+1, "PER"

name_detector = skweak.heuristics.FunctionAnnotator("name_detector", name_detector_fn)

geonames = skweak.gazetteers.extract_json_data("data/geonames.json", spacy_model="en_core_web_sm")
geonames_annotator = skweak.gazetteers.GazetteerAnnotator("locations", geonames)


#hmm = skweak.aggregation.HMM("hmm", ["PER", "ORG", "LOC", "MISC"])
#hmm.fit_and_aggregate([doc])
#spans = doc.spans["hmm"]
#doc.ents = spans

#for span in doc.ents:
#  print(span, span.label, span.start, span.end, span.doc[span.start : span.end])

Extracting data from data/geonames.json
Populating trie for class LOC (number: 15205)


De tekst van het arrest is nog niet schriftelijk beschikbaar maar het bericht werd alvast bekendgemaakt door een communicatiebureau dat Floralux inhuurde . In '81 regulariseert de toenmalige Vlaamse regering de toestand met een BPA dat het bedrijf op eigen kosten heeft laten opstellen . publicatie Vandaag is Floralux dus met alle vergunningen in orde , maar het BPA waarmee die konden verkregen worden , was omstreden omdat zaakvoerster Christiane Vandenbussche haar schepenambt van ... In eerste aanleg werd Vandenbussche begin de jaren '90 veroordeeld wegens belangenvermenging maar later vrijgesproken door het hof van beroep in Gent . SP zit moeilijk -- ( Eric , sd ) Derycke wil BPA goedkeuren op voorwaarde dat BPA ' De Hoogte ' beperkt wordt aanvaard ( burgemeester van Moorslede Walter Ghekiere zal 'n voorstel doen ) . Onvoldoende om een zware straf uit te spreken , luidt het . Dit hof verbindt nu geen straf aan de schuld die ze vaststelt . Die groeit al snel uit tot een heus tuincentru

In [87]:
type(geonames_annotator)

skweak.gazetteers.GazetteerAnnotator

In [58]:
from datasets import Dataset, ClassLabel, Sequence

def relabel(ent_label: str) -> str:
  """
  returns ConLL-2002 label of Spacy labelled entity
  """
  mappings = {"PERSON":"PER", "COMPANY":"ORG", "GPE":"LOC", 'EVENT':"MISC", 'FAC':"MISC", 'LANGUAGE':"MISC", 'LAW':"MISC", 'NORP':"MISC", 'PRODUCT':"MISC",'WORK_OF_ART':"MISC", "MISC":"MISC", "PER":"PER", "ORG":"ORG", "LOC":"LOC"}    
  exclude = {"CARDINAL", "ORDINAL", "DATE", "PERCENT", "QUANTITY", "TIME", "MONEY"}

  return mappings[ent_label] if ent_label != "" and ent_label not in exclude else None

def convert_ent(token) -> str:
  """
  returns ConLL-2002 IOB style entity label of Spacy token
  """
  return token.ent_iob_ + "-" + relabel(token.ent_type_) if relabel(token.ent_type_) else token.ent_iob_

def process_spacy(docs: list):
  store = []
  tokens = []
  ids = []

  c = 0
  classlabels = ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'])
  for doc in docs:
    ents = [classlabels.str2int(convert_ent(tok)) for tok in doc]
    toks = [token.text for token in doc]
    store.append(ents)
    tokens.append(toks)
    ids.append(str(c))
    c += 1 
    
  d = {"ids" : ids,
       "ner_tags" : store,
       "tokens" : tokens}

  class_sequence = Sequence(feature =  classlabels, id = None)
  ds = Dataset.from_dict(d)
  ds.features["ner_tags"] = class_sequence
  return ds

ds = process_spacy([doc])
ds

Dataset({
    features: ['ids', 'ner_tags', 'tokens'],
    num_rows: 1
})

In [59]:
ds[0]

{'ids': '0',
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [None]:
import tarfile

# We retrieve the texts from Wablieft tarfile
texts = [] 
archive_file = tarfile.open("data/plain.tar.xz")
for archive_member in archive_file.getnames():
    if archive_member.endswith(".txt"):
        #weird encoding because of Turkish letters
        text = archive_file.extractfile(archive_member).read().decode("cp850")
        texts.append(text)

In [None]:
docs = list(nlp.pipe(texts))

In [95]:
def name_detector_fn(doc):
    names = ["David", "Cameron", "William", "Bos"]
    for token in doc: 
        if token.text in names:
            yield token.i, token.i+1, "PER"

name_detector = skweak.heuristics.FunctionAnnotator("name_detector", name_detector_fn)

def address_detector_fn(doc):
    for token in doc: 
        if token.text[0].isupper() and doc[token.i+1].is_digit:
            yield token.i, token.i+2, "LOCATION"

address_detector = skweak.heuristics.FunctionAnnotator("address_detector", address_detector_fn)

def company_detector_fn(doc):
    companies = ["Microsoft", "Apple", "Gemeente Amsterdam", "Universiteit van Amsterdam", "UvA"]
    for token in doc:
        if token.text in companies:
            yield token.i, token.i+1, "ORG"

company_detector = skweak.heuristics.FunctionAnnotator("company_detector", company_detector_fn)

names = skweak.gazetteers.extract_json_data("data/geonames.json", spacy_model="en_core_web_sm")
name_annotator = skweak.gazetteers.GazetteerAnnotator("locations", names)

Extracting data from data/geonames.json
Populating trie for class GPE (number: 15205)


In [98]:
type(address_detector) == skweak.heuristics.FunctionAnnotator

True

In [None]:
processed = list(name_detector.pipe(name_annotator.pipe(docs[:25])))
hmm = skweak.aggregation.HMM("hmm", ["PERSON", "LOCATION", "ORG", "GPE"])

#skweak.utils.display_entities(processed[7], "locations")
hmm.fit_and_aggregate(processed)
skweak.utils.display_entities(processed[7], "hmm", add_tooltip=True)

In [None]:
for doc in processed:
    if "hmm" in doc.spans.keys():
        doc.ents = doc.spans["hmm"]
    else: 
        doc.ents = ()

docs = docs[:100]
skweak.utils.docbin_writer(docs, "data/wablieft.spacy")

In [None]:
!spacy init fill-config base_config.cfg config.cfg

In [None]:
!spacy train config.cfg --paths.train data/wablieft.spacy --paths.dev data/wablieft.spacy --initialize.vectors nl_core_news_md --output output

In [None]:
model = spacy.load("output/model-best")

In [None]:
doc = processed[7]

In [None]:
skweak.utils.display_entities(model("Hoi ik ben William en ik woon in New York. Ik kom uit Flanders en ben geboren in Antwerpen."))

In [None]:
from lison2020 import annotations
from spacy.tokens import DocBin
test = DocBin().from_disk("data/ned_testb.spacy")
sys.path.insert(0, './lison2020')

In [None]:
annotator = annotations.FullAnnotator().add_all()

In [None]:
import evaluation

In [None]:
evaluation.spacy_benchmark("data/ned_testb.spacy")