In [1]:
# Imports Train
PATH = './tweet_model'
from tweet_ner_data_label import train_data

In [2]:
# Imports & Dependencies
from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [3]:
# Update original data to include Normal NER Tags to prevent forgetting problem
nlp = spacy.load('en_core_web_sm')

In [43]:
# Checks entities if multiple truck locations
def count_locations(ents_arr):
    count = 0
    
    for i in range(0,len(ents_arr)):
        cur_ent_type = ents_arr[i][2]
        if cur_ent_type == 'TRUCK_LOCATION':
            count += 1
    
    return count
    
# Adds old NER tags to train data
def append_old_ner(train_data):
    for i in range(0,len(train_data)):
        doc = nlp(train_data[i][0])
        ents_arr = train_data[i][1]['entities']
        locations_count = count_locations(ents_arr)

        for ent in doc.ents:
            start = int(ent.start_char)
            end = int(ent.end_char)

            # Cases
            # < Start---End > Bound entire word and more
            case1 = lambda: start <= truck_start and end >= truck_end
            # < Start>---End< Bound entire left up to end
            case2 = lambda: start <= truck_start and end <= truck_end and end >= truck_start
            # >Start---<End > Bound entire right up to start
            case3 = lambda: end >= truck_end and start >= truck_start and start <= truck_end
            
            if locations_count == 0:
                ents_arr.append((start, end, ent.label_))
            else:
                continue_outer = False
                for j in range(0,locations_count):
                    truck_start = int(ents_arr[j][0])
                    truck_end = int(ents_arr[j][1])
                    
                    if case1() or case2() or case3():
                        continue_outer = True
                        break
                
                if continue_outer:
                    continue
                else:
                    ents_arr.append((start, end, ent.label_))
                    
    return train_data

train_data = append_old_ner(train_data)

1
1
1
1
1
[('Hash Tag galoreWe are serving Lunch at Franklin Square today. Features: Southern Fried Fish, Shrimp &amp; Cheese Grits… https://t.co/iX7xxjEvER', {'entities': [(39, 54, 'TRUCK_LOCATION'), (30, 35, 'NORP'), (55, 60, 'DATE'), (93, 101, 'ORG'), (106, 118, 'PERSON')]})]


In [27]:
# Validate old NER Tags
print(train_data[562])

("Come celebrate the holidays tonight with El Capitan @ the Annapolis Towne Centre's Fire &amp; Ice Festivals from 4-8. Mmmmm coookieeee! 😍🍪😋", {'entities': [(54, 80, 'TRUCK_LOCATION'), (15, 27, 'TIME'), (28, 35, 'TIME'), (41, 51, 'GPE'), (54, 82, 'ORG'), (83, 89, 'ORG'), (94, 107, 'ORG'), (113, 114, 'CARDINAL')]})


In [28]:
# Load model if exists
def load_model(model=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    return nlp

# Train update Spacy's NER
def train_NER(train_data, iterations, model=None):
    TRAIN_DATA = train_data
    
    # Set language to only Text in tweets
    nlp = load_model(model)
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pi
        
    # add new labels eg: using TRUCK_LOCATION
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(iterations):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
            
    return nlp

In [29]:
# Train Data
nlp = train_NER(train_data, 5)

Created blank 'en' model
Losses {'ner': 568.0590568321777}
Losses {'ner': 372.42172231524745}
Losses {'ner': 296.9194610048229}
Losses {'ner': 249.23931410783047}
Losses {'ner': 207.7893758952677}


In [30]:
# Review NER Results
def show_ents(doc):
    doc = nlp(doc[0])
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_)
    else:
        print('No entities found')

In [36]:
# Test across subset of tweets
# for i in range(0,10):
#     show_ents(train_data[i])
#     print("")
    
print(train_data[89])
print('')
show_ents(train_data[89])


('Hash Tag galoreWe are serving Lunch at Franklin Square today. Features: Southern Fried Fish, Shrimp Po Boy. You ca… https://t.co/7Tah0KkPAa', {'entities': [(39, 54, 'TRUCK_LOCATION'), (30, 35, 'NORP'), (39, 54, 'FAC'), (55, 60, 'DATE')]})

Lunch - NORP
Franklin Square - FAC
today - DATE
Buildings, airports, highways, bridges, etc.


In [11]:
# Manually test the trained model
test_text = "Today we are at Farragut Square"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

Entities in 'Today we are at Farragut Square'
DATE Today
TRUCK_LOCATION Farragut Square


In [26]:
# Save model to output directory
def save_model(nlp, output_dir, name):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta[name] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

In [27]:
save_model(nlp,'./tweet_model','ner_1')

NameError: name 'new_model_name' is not defined