# 03 Train

In [79]:
from __future__ import print_function, unicode_literals
import spacy
import warnings
from spacy.util import minibatch, compounding
import de_core_news_lg
import pandas as pd
import numpy as np
from random import sample
import io, csv
import re
import random
import json
from spacy.training import Example
from spacy.tokens import Doc
from tqdm import tqdm
nlp = spacy.load('de_core_news_lg')
#nlp = spacy.load('de_dep_news_trf')

In [51]:
# helper function for incrementing the revision counters
def increment_revision_counters(entity_counter, entities):
    for entity in entities:
        label = entity[2]
        if label in entity_counter:
            entity_counter[label] += 1
        else:
            entity_counter[label] = 1

In [92]:
with open('input/tagged_sentences_latest.json') as json_file:
    out_sentences = json.load(json_file)
print("LENGTH OF DATASET: ",len(out_sentences))
dataset_dict={}

for sent in out_sentences:
    entities = sent[1]["entities"]
    increment_revision_counters(dataset_dict, entities)

print(dataset_dict)


LENGTH OF DATASET:  1994
{'LEVEL_LOC': 276, 'TIME': 1363, 'GRAN': 1975, 'SINGLE_LOC': 886}


was: 
LENGTH OF DATASET:  1020
{'DATA': 1010, 'GRAN': 279}

is: 
LENGTH OF DATASET:  1994
{'GRAN': 1975, 'PLACE': 1108, 'TIME': 1797}

now:
LENGTH OF DATASET:  1994
{'GRAN': 1975, 'TIME': 1363, 'LEVEL_LOC': 265, 'SINGLE_LOC': 1035}




In [53]:
npr_df = pd.read_csv("external/deu_news_2015_3M-sentences.txt", delimiter = "\t")
npr_df = npr_df.sample(frac=1)
npr_df.head()

Unnamed: 0,1,­
903331,947855,Die Königsblauen einigten sich mit dem 41-Jähr...
83590,87108,"Allerdings meint Lange auch, dass wir bis zum ..."
609337,637012,"Demnach soll die Behörde veranlassen, dass die..."
1199967,1262217,"Epson warnt, dass gefälschte Projektorlampen e..."
16099,16452,3 Leserempfehlungen ein Star.


In [54]:
# *** <- löschen?
# create an nlp object as we'll use this to seperate the sentences and identify existing entities
#loaded already above
#nlp = spacy.load('de_core_news_lg')

In [55]:
revision_texts = []

#STAT: Important: THIS IS A HYPER-PARAMETER: Changing it will affect the accuracy of the result
hyper_para_how_many = 100000

# convert the articles to spacy objects to better identify the sentences. Disabled unneeded components. # takes ~ 4 minutes
for doc in tqdm(nlp.pipe(npr_df.iloc[:hyper_para_how_many,1], batch_size=30, disable=["tagger", "ner"])):
    for sentence in doc.sents:

        if  40 < len(sentence.text) < 80:
            # some of the sentences had excessive whitespace in between words, so we're trimming that
            revision_texts.append(" ".join(re.split("\s+", sentence.text, flags=re.UNICODE)))

100000it [04:23, 380.20it/s]


In [56]:
revisions = []

# Use the existing spaCy model to predict the entities, then append them to revision
for doc in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):
    
    # don't append sentences that have no entities
    if len(doc.ents) > 0:
        revisions.append((doc.text, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}))

In [57]:
# print an example of the revision sentence
print(revisions[0][0])

# print an example of the revision data
print(revisions[0][1])


Die Königsblauen einigten sich mit dem 41-Jährigen auf einen Vertrag bis
{'entities': [(4, 16, 'ORG')]}


In [58]:
# create arrays to store the revision data
TRAIN_REVISION_DATA = []
TEST_REVISION_DATA = []

# create dictionaries to keep count of the different entities
TRAIN_ENTITY_COUNTER = {}
TEST_ENTITY_COUNTER = {}

# This will help distribute the entities (i.e. we don't want 1000 PERSON entities, but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 100



random.shuffle(revisions)
for revision in revisions:
    # get the entities from the revision sentence
    entities = revision[1]["entities"]

    # simple hack to make sure spaCy entities don't get too one-sided
    should_append_to_train_counter = 0
    for _, _, label in entities:
        if label in TRAIN_ENTITY_COUNTER and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT:
            should_append_to_train_counter -= 1
        else:
            should_append_to_train_counter += 1

    # simple switch for deciding whether to append to train data or test data
    if should_append_to_train_counter >= 0:
        TRAIN_REVISION_DATA.append(revision)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, entities)
    else:
        TEST_REVISION_DATA.append(revision)
        increment_revision_counters(TEST_ENTITY_COUNTER, entities)

In [59]:
TRAIN_ENTITY_COUNTER

{'PER': 102, 'ORG': 101, 'MISC': 104, 'LOC': 102}

In [60]:
TEST_ENTITY_COUNTER

{'MISC': 7427, 'LOC': 7940, 'PER': 7141, 'ORG': 6141}

In [61]:
#TRAIN_REVISION_DATA

In [93]:
random.shuffle(out_sentences)
TRAIN_STAT_DATA = out_sentences[:int(len(out_sentences)*0.8)]
TEST_STAT_DATA = out_sentences[int(len(out_sentences)*0.8):]

In [94]:
print(len(out_sentences))
print(len(TRAIN_STAT_DATA))
print(len(TEST_STAT_DATA))
print("REVISION", len(TRAIN_REVISION_DATA))
TRAIN_DATA = TRAIN_REVISION_DATA + TRAIN_STAT_DATA
print("COMBINED", len(TRAIN_DATA))

1994
1595
399
REVISION 294
COMBINED 1889


In [95]:
#STAT: below is the heart piece of this script, and the code was heavily changed compared to the original
#script taken out of the code on deepnote.com. The reason is thaat this code has been adapted to spacy 3 -
#while the old code was running on spacy 2.X
#central command is nlp-update

ner = nlp.get_pipe("ner")

ner.add_label("GRAN")
ner.add_label("LEVEL_LOC")
ner.add_label("SINGLE_LOC")
ner.add_label("TIME")



# get the names of the components we want to disable during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# start the training loop, only training NER
epochs = 100
#optimizer = nlp.resume_training()
#optimizer = nlp.initialize()
example_problem_counter = 0

with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    sizes = compounding(1.0, 4.0, 1.001)
    
    # batch up the examples using spaCy's minibatc
    for epoch in range(epochs):
        random.shuffle(TRAIN_DATA)
        #text = []
        #annots=[]
        examples=[]


        for text,annots in TRAIN_DATA:
            #text.append(t)
            #annots.append(a)
            doc = nlp.make_doc(text)
               
            example = Example.from_dict(doc, annots)
            examples.append(example)
               # example_problem_counter += 1
        
        losses = {}
        
        nlp.update(examples, drop=0.35, losses=losses)#,sgd=optimizer)

        print("Losses ({}/{})".format(epoch + 1, epochs), losses)

Losses (1/100) {'ner': 7709.121697278492}
Losses (2/100) {'ner': 7325.782699823606}
Losses (3/100) {'ner': 7022.092537118897}
Losses (4/100) {'ner': 6723.181297273011}
Losses (5/100) {'ner': 6473.82089904329}
Losses (6/100) {'ner': 6432.725299287558}
Losses (7/100) {'ner': 6173.575053762179}
Losses (8/100) {'ner': 5940.919400341809}
Losses (9/100) {'ner': 5809.082900935784}
Losses (10/100) {'ner': 5787.836839877069}
Losses (11/100) {'ner': 5736.401120273396}
Losses (12/100) {'ner': 5623.617335284129}
Losses (13/100) {'ner': 5612.845858054236}
Losses (14/100) {'ner': 5490.982026003301}
Losses (15/100) {'ner': 5552.689499744214}
Losses (16/100) {'ner': 5433.233781544957}
Losses (17/100) {'ner': 5367.753615433816}
Losses (18/100) {'ner': 5365.793095958186}
Losses (19/100) {'ner': 5195.073264249368}
Losses (20/100) {'ner': 4993.02339792384}
Losses (21/100) {'ner': 4929.054347472309}
Losses (22/100) {'ner': 4770.709079275752}
Losses (23/100) {'ner': 4748.097087970644}
Losses (24/100) {'ner'

In [97]:
statbot_colors = {"LEVEL_LOC": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
                  "SINGLE_LOC": "linear-gradient(90deg, #ffff00, #ff8c00)",
                  "GRAN": "linear-gradient(90deg, #ffff00, #ff8c00)",
                  "TIME": "linear-gradient(90deg, #aaf6b1, #99dd9f)"}
statbot_options = {"ents": ["PER","LOC","ORG","MISC","LEVEL_LOC","SINGLE_LOC", "TIME"], "colors": statbot_colors}
spacy.displacy.render(nlp("Ich heisse Christian und war heute in Zürich bei IBM im Internet."), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie viele Kühe hat die Gemeinde Bülach aktuell?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie hoch ist Eigenkapital auf Bezirksebene?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Ich brauche die Daten pro Bezirk"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Ich brauche die Daten für den gesamten Kanton älteste."), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie viel Bauinv. EFH 5 Jahre  hat  in Regensdorf  neueste?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Was ist der Anteil an MIV-Anteil (Modal Split)  älteste  auf Bezirksebene ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Was ist der Anteil an Geb.Vol. Dienstleistungen: Zunahme in Flaach in 2017 ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Welches ist das Schül. Sekundarstufe II für den gesamten Kanton von 2013 bis 2018?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Welche Gemeinde hat die grösste Bevölkerung?"), style="ent",options=statbot_options)

In [98]:
#now check the accuracy of our NERs in this plus the next code chunks!


# dictionary to hold our evaluation data
stat_evaluation = {
    "LEVEL_LOC": {
        "correct": 0,
        "total": 0,
    },
    "SINGLE_LOC": {
        "correct": 0,
        "total": 0,
    },
    "TIME": {
        "correct": 0,
        "total": 0,
    }
}

word_evaluation = {
    "LEVEL_LOC": {
        "correct": 0,
        "total": 0
    },
    "SINGLE_LOC": {
        "correct": 0,
        "total": 0,
    },
    "TIME": {
        "correct": 0,
        "total": 0,
    }

}


for stat in TEST_STAT_DATA:
    # extract the sentence and correct stat entities according to our test data
    sentence = stat[0]
    entities = stat[1]["entities"]

    # for each entity, use our updated model to make a prediction on the sentence
    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]
        n_worded_stat =  len(correct_text.split())
        print(n_worded_stat)

        # if we find that there's a match for predicted entity and predicted text, increment correct counters
        for ent in doc.ents:
            print("ENT_LABEL",ent.label_)
            print("ENTITY2",entity[2])
            print("ENT_TEXT",ent.text)
            print("CORRECT:TEXT",correct_text)
            if ent.label_ == entity[2] and ent.text == correct_text:
                
                stat_evaluation[entity[2]]["correct"] += 1
                if n_worded_stat > 0:
                    word_evaluation[entity[2]]["correct"] += 1

                # this break is important, ensures that we're not double counting on a correct match
                break

        #  increment total counters after each entity loop
        stat_evaluation[entity[2]]["total"] += 1
        if n_worded_stat > 0:
            word_evaluation[entity[2]]["total"] += 1

3
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT NRW
CORRECT:TEXT NRW Waehleranteil SVP
ENT_LABEL SINGLE_LOC
ENTITY2 GRAN
ENT_TEXT Dorf
CORRECT:TEXT NRW Waehleranteil SVP
ENT_LABEL SINGLE_LOC
ENTITY2 GRAN
ENT_TEXT Erlenbach
CORRECT:TEXT NRW Waehleranteil SVP


KeyError: 'GRAN'

In [100]:
for key in word_evaluation:
    correct = word_evaluation[key]["correct"]
    total = word_evaluation[key]["total"]

    print(f"{key}: {correct / total * 100:.2f}%")

stat_total_sum = 0
stat_correct_sum = 0

print("---")
for key in stat_evaluation:
    correct = stat_evaluation[key]["correct"]
    total = stat_evaluation[key]["total"]
    
    stat_total_sum += total
    stat_correct_sum += correct

    print(f"{key}: {correct / total * 100:.2f}%")

print(f"\nTotal: {stat_correct_sum/stat_total_sum * 100:.2f}%")

ZeroDivisionError: division by zero

In [101]:
#now test the accuracy of all the old NERs - was there amnesia on them?

# dictionary which will be populated with the entities and result information
entity_evaluation = {}

# helper function to udpate the entity_evaluation dictionary
def update_results(entity, metric):
    if entity not in entity_evaluation:
        entity_evaluation[entity] = {"correct": 0, "total": 0}
    
    entity_evaluation[entity][metric] += 1

# same as before, see if entities from test set match what spaCy currently predicts
for data in TEST_REVISION_DATA:
    sentence = data[0]
    entities = data[1]["entities"]

    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]

        for ent in doc.ents:
            if ent.label_ == entity[2] and ent.text == correct_text:
                update_results(ent.label_, "correct")
                break

        update_results(entity[2], "total")

In [102]:
sum_total = 0
sum_correct = 0

for entity in entity_evaluation:
    total = entity_evaluation[entity]["total"]
    correct = entity_evaluation[entity]["correct"]

    sum_total += total
    sum_correct += correct
    
    print("{} | {:.2f}%".format(entity, correct / total * 100))

print()
print("Overall accuracy: {:.2f}%".format(sum_correct / sum_total * 100))

MISC | 63.73%
LOC | 66.25%
PER | 83.36%
ORG | 80.05%

Overall accuracy: 72.82%


In [30]:
nlp.meta["name"] = "stat_entity_extractor_v0"
nlp.to_disk("./models/v0")

In [31]:
TRAIN_STAT_DATA[5]


['Daten für Steuerb. Einkommen natürl. Pers. 25%Quantil aktuellste',
 {'entities': [[10, 53, 'GRAN'], [54, 64, 'TIME']]}]

In [32]:
doc = nlp(u'Welche Gemeinde hat die grösste Bevölkerung und welche hatte im 2019 den höchsten Ausländeranteil?')

# show universal pos tags
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.pos_) for t in doc))
# output: Ich/PRON bin/AUX ein/DET Berliner/NOUN ./PUNCT

# show German specific pos tags (STTS)
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.tag_) for t in doc))
# output: Ich/PPER bin/VAFIN ein/ART Berliner/NN ./$.

# show dependency arcs
print('\n'.join('{child:<8} <{label:-^7} {head}'.format(child=t.orth_, label=t.dep_, head=t.head.orth_) for t in doc))
# output: (sb: subject, nk: noun kernel, pd: predicate)

#named entities
print("Named Entity Recognition:")
for ent in doc.ents:
    print(ent.text)
print("Noun chunks:")
for chunk in doc.noun_chunks:
    print(chunk.text)

Welche/DET Gemeinde/NOUN hat/AUX die/DET grösste/ADJ Bevölkerung/NOUN und/CCONJ welche/PRON hatte/AUX im/ADP 2019/PROPN den/DET höchsten/ADJ Ausländeranteil/NOUN ?/PUNCT
Welche/PWAT Gemeinde/NN hat/VAFIN die/ART grösste/ADJA Bevölkerung/NN und/KON welche/PDS hatte/VAFIN im/APPRART 2019/CARD den/ART höchsten/ADJA Ausländeranteil/NN ?/$.
Welche   <--nk--- Gemeinde
Gemeinde <--sb--- hat
hat      <-ROOT-- hat
die      <--nk--- Bevölkerung
grösste  <--nk--- Bevölkerung
Bevölkerung <--oa--- hat
und      <--cd--- Bevölkerung
welche   <--cj--- und
hatte    <--cj--- hat
im       <--mo--- hatte
2019     <--nk--- im
den      <--nk--- Ausländeranteil
höchsten <--nk--- Ausländeranteil
Ausländeranteil <--oa--- hatte
?        <-punct- hat
Named Entity Recognition:
Noun chunks:
Welche Gemeinde
die grösste Bevölkerung
welche
2019
den höchsten Ausländeranteil
