In [None]:
import spacy
import os
import sys
# Directory holding the plain-text corpus files, given relative to the
# notebook's working directory.
data_path = "data/wablieft/text/plain/"
# Put ./skweak_git first on the module search path so that the following
# `import skweak` picks it up from there (presumably a local checkout of
# skweak -- confirm). This line must stay before the import.
sys.path.insert(0, "./skweak_git")
import skweak

In [None]:
# Read every corpus file into memory, one string per file.
# os.listdir() returns names in arbitrary order, so the listing is sorted to
# keep the order of `texts` (and any later positional indexing into the parsed
# documents) reproducible across machines and runs.
files = sorted(os.listdir(data_path))
texts = []
for file_name in files:
    # assumes the corpus files are cp850-encoded -- TODO confirm
    with open(os.path.join(data_path, file_name), "r", encoding="cp850") as file_text:
        texts.append(file_text.read())

In [None]:
# Load the "nl_core_news_md" spaCy pipeline and run it over all texts,
# materialising the streamed results into a list.
nlp = spacy.load("nl_core_news_md")
docs = [*nlp.pipe(texts)]

In [None]:
# Pick a single parsed document to experiment on. The index 34 is hard-coded;
# which text it refers to depends on the order in which the files were read.
doc = docs[34]

In [None]:
def url_detector_fn(doc):
    """Yield a one-token "URL" span for every non-punctuation token containing a dot.

    Each result is a ``(start, end, label)`` tuple built from the token's
    index ``i``, with ``end`` being ``i + 1``.
    """
    for tok in doc:
        # Skip tokens without a dot, and tokens flagged as punctuation.
        if "." not in tok.text or tok.is_punct:
            continue
        yield tok.i, tok.i + 1, "URL"
        
# Wrap the detector function in a skweak FunctionAnnotator created with the
# name "url_detector"; the resulting object is later called directly on a doc.
url_detector = skweak.heuristics.FunctionAnnotator("url_detector", url_detector_fn)

def number_detector_fn(doc):
    """Yield a one-token "DIGIT" span for every token that consists of digits.

    Each result is a ``(start, end, label)`` tuple built from the token's
    index ``i``, with ``end`` being ``i + 1``.

    The label is spelled "DIGIT" (upper case) so that it matches the label
    list the HMM aggregator in this notebook is configured with; labels are
    compared as exact strings, so "Digit" would not line up with "DIGIT".
    """
    for token in doc:
        if token.is_digit:
            yield token.i, token.i + 1, "DIGIT"

# Wrap the detector function in a skweak FunctionAnnotator created with the
# name "number_detector"; the resulting object is later called directly on a doc.
number_detector = skweak.heuristics.FunctionAnnotator("number_detector", number_detector_fn)

In [None]:
# Run the two labelling functions over the document, URL detector first.
doc = url_detector(doc)
doc = number_detector(doc)

# Build an HMM aggregator named "hmm" over the two labels and fit it on this
# single document.
hmm = skweak.aggregation.HMM("hmm", ["URL", "DIGIT"])
hmm.fit_and_aggregate([doc])

# Show the entities stored under "hmm" for this document.
skweak.utils.display_entities(doc, "hmm")

In [2]:
import spacy, re
from skweak import heuristics, gazetteers, aggregation, utils

# LF 1: heuristic to detect occurrences of MONEY entities
def money_detector(doc):
    """Yield a two-token "MONEY" span for each amount preceded by a currency symbol.

    Starting from the second token, a token whose text begins with a digit and
    whose left neighbour is flagged ``is_currency`` produces
    ``(i - 1, i + 1, "MONEY")``, i.e. the currency token plus the amount.
    """
    for amount in doc[1:]:
        if not amount.text[0].isdigit():
            continue
        if amount.nbor(-1).is_currency:
            start = amount.i - 1
            yield start, amount.i + 1, "MONEY"
lf1 = heuristics.FunctionAnnotator("money", money_detector)

# LF 2: detection of years with a regex.
# The pattern is written as a raw string so that "\d" reaches the regex engine
# as the digit class instead of being an invalid string escape (which triggers
# a warning on recent Python versions). The pattern itself is unchanged:
# a token starting with 19 or 20 followed by exactly two more digits.
lf2 = heuristics.TokenConstraintAnnotator(
    "years", lambda tok: re.match(r"(19|20)\d{2}$", tok.text), "DATE"
)

# LF 3: gazetteer lookup over a short list of (first name, last name) pairs
NAMES = [
    ("Barack", "Obama"),
    ("Donald", "Trump"),
    ("Joe", "Biden"),
]
trie = gazetteers.Trie(NAMES)
lf3 = gazetteers.GazetteerAnnotator("presidents", {"PERSON": trie})

# Build a one-document corpus with the "en_core_web_sm" spaCy pipeline
nlp = spacy.load("en_core_web_sm")
doc = nlp("Donald Trump paid $750 in federal income taxes in 2016")

# Run the three labelling functions in sequence: money, then years, then names
doc = lf1(doc)
doc = lf2(doc)
doc = lf3(doc)

# Fit the HMM aggregator on the corpus and aggregate the labelling functions
hmm = aggregation.HMM("hmm", ["PERSON", "DATE", "MONEY"])
hmm.fit_and_aggregate([doc])

# Visualise the final result stored under "hmm" (rendered inline in Jupyter)
utils.display_entities(doc, "hmm")

Starting iteration 1
Finished E-step with 1 documents
Starting iteration 2
Finished E-step with 1 documents


         1         -18.9513             +nan
         2         -19.0673          -0.1160
