In [1]:
# Imports Train
PATH = './tweet_model'
from tweet_ner_data_label import train_data

In [2]:
# Imports & Dependencies
from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [3]:
# Update original data to include Normal NER Tags to prevent forgetting problem
nlp = spacy.load('en_core_web_sm')

In [4]:
# Adds old NER tags to train data
def append_old_ner(train_data):
    for i in range(0,len(train_data)):
        doc = nlp(train_data[i][0])
        for ent in doc.ents:
            ents_arr = train_data[i][1]['entities']
            
            start = int(ent.start_char)
            end = int(ent.end_char)
            
            # Only comparing for first truck location, if multiple not account for!
            if len(ents_arr) and ents_arr[0][2] == 'TRUCK_LOCATION':
                truck_start = int(ents_arr[0][0])
                truck_end = int(ents_arr[0][1])

                if (start <= truck_start and end >= truck_end) or (start <= truck_start and end <= truck_end and end >= truck_start):
                    continue
                else:
                    ents_arr.append((start, end, ent.label_))
            else:
                ents_arr.append((start, end, ent.label_))
    return train_data

train_data = append_old_ner(train_data)

In [5]:
# Validate old NER Tags
print(train_data[425])

('Stop by our Bethesda location, right in Metro Center!#bethesda #metrocenter #farmtotable #foods4thought #goodeats… https://t.co/kBC9Ue3E0O', {'entities': [(12, 20, 'TRUCK_LOCATION'), (40, 52, 'TRUCK_LOCATION'), (40, 62, 'FAC'), (76, 88, 'MONEY'), (76, 88, 'MONEY'), (76, 88, 'MONEY'), (89, 103, 'MONEY'), (89, 103, 'MONEY'), (89, 103, 'MONEY'), (104, 105, 'ORG'), (104, 105, 'ORG'), (104, 105, 'ORG')]})


In [22]:
# Load model if exists
def load_model(model=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    return nlp

# Train update Spacy's NER
def train_NER(train_data, iterations, model=None):
    TRAIN_DATA = train_data
    
    # Set language to only Text in tweets
    nlp = load_model(model)
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pi
        
    # add new labels eg: using VENDOR_LOCATION
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(iterations):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
            
    return nlp

In [23]:
# Train Data
nlp = train_NER(train_data, 2)

Created blank 'en' model
Losses {'ner': 587.8558307971144}
Losses {'ner': 374.28660156096333}


In [8]:
# Review NER Results
def show_ents(doc):
    doc = nlp(doc[0])
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_)
    else:
        print('No entities found')

In [24]:
# Test across subset of tweets
# for i in range(0,10):
#     show_ents(train_data[i])
#     print("")
    
print(train_data[3])
show_ents(train_data[3])


('Great events today @NavyFederal HQ in Vienna and @pacsontrack', {'entities': [(20, 44, 'TRUCK_LOCATION'), (13, 18, 'DATE'), (38, 44, 'GPE')]})
today - DATE
HQ - GPE
Vienna - GPE


In [25]:
# Manually test the trained model
test_text = "Today we are at Farragut Square"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

Entities in 'Today we are at Farragut Square'
DATE Today
FAC Farragut Square


In [26]:
# Save model to output directory
def save_model(nlp, output_dir, name):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta[name] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

In [27]:
save_model(nlp,'./tweet_model','ner_1')

NameError: name 'new_model_name' is not defined