In [2]:
import spacy
import os
import sys
import skweak
nlp = spacy.load("nl_core_news_md")

In [3]:
import tarfile

# We retrieve the texts from Wablieft tarfile
texts = [] 
archive_file = tarfile.open("data/plain.tar.xz")
for archive_member in archive_file.getnames():
    if archive_member.endswith(".txt"):
        #weird encoding because of Turkish letters
        text = archive_file.extractfile(archive_member).read().decode("cp850")
        texts.append(text)

In [4]:
docs = list(nlp.pipe(texts))

In [5]:
def name_detector_fn(doc):
    names = ["David", "Cameron", "William", "Bos"]
    for token in doc: 
        if token.text in names:
            yield token.i, token.i+1, "PERSON"

name_detector = skweak.heuristics.FunctionAnnotator("name_detector", name_detector_fn)

def address_detector_fn(doc):
    for token in doc: 
        if token.text[0].isupper() and doc[token.i+1].is_digit:
            yield token.i, token.i+2, "LOCATION"

address_detector = skweak.heuristics.FunctionAnnotator("address_detector", address_detector_fn)

def company_detector_fn(doc):
    companies = ["Microsoft", "Apple", "Gemeente Amsterdam", "Universiteit van Amsterdam", "UvA"]
    for token in doc:
        if token.text in companies:
            yield token.i, token.i+1, "ORG"

company_detector = skweak.heuristics.FunctionAnnotator("company_detector", company_detector_fn)

names = skweak.gazetteers.extract_json_data("data/geonames.json", spacy_model="en_core_web_sm")
name_annotator = skweak.gazetteers.GazetteerAnnotator("locations", names)

Extracting data from data/geonames.json
Populating trie for class GPE (number: 15205)


In [6]:
processed = list(name_detector.pipe(name_annotator.pipe(docs[:25])))
hmm = skweak.aggregation.HMM("hmm", ["PERSON", "LOCATION", "ORG", "GPE"])

#skweak.utils.display_entities(processed[7], "locations")
hmm.fit_and_aggregate(processed)
skweak.utils.display_entities(processed[7], "hmm", add_tooltip=True)

Starting iteration 1
Finished E-step with 25 documents
Starting iteration 2
Finished E-step with 25 documents
Starting iteration 3
Finished E-step with 25 documents


         1        -147.6267             +nan
         2        -140.8670          +6.7598
         3        -140.8670          -0.0000


In [8]:
for doc in processed:
    if "hmm" in doc.spans.keys():
        doc.ents = doc.spans["hmm"]
    else: 
        doc.ents = ()

docs = docs[:100]
skweak.utils.docbin_writer(docs, "data/wablieft.spacy")

Write to data/wablieft.spacy...done


In [9]:
!spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [10]:
!spacy train config.cfg --paths.train data/wablieft.spacy --paths.dev data/wablieft.spacy --initialize.vectors nl_core_news_md --output output

[38;5;4mℹ Using CPU[0m
[1m
[2021-05-07 14:55:36,908] [INFO] Set up nlp object from config
[2021-05-07 14:55:36,921] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-05-07 14:55:36,927] [INFO] Created vocabulary
[2021-05-07 14:55:40,870] [INFO] Added vectors: nl_core_news_md
[2021-05-07 14:55:40,870] [INFO] Finished initializing nlp object
[2021-05-07 14:55:45,408] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    129.29    0.00    0.00    0.00    0.00
  2     200       3048.25   5624.07   60.25   56.15   65.01    0.60
  4     400       1185.68   3344.61   47.17   80.06   33.44    0.47
  6     600        124.85   2236.79   69.49   87.38   57.68    0.69
  8     800        144.04   1632.22   87.83 

In [13]:
model = spacy.load("output/model-best")

In [15]:
doc = processed[7]

In [17]:
skweak.utils.display_entities(model("Hoi ik ben William en ik woon in New York. Ik kom uit Flanders en ben geboren in Antwerpen."))