In [1]:
# Imports Train
PATH = './tweet_model'
from tweet_ner_data_label import train_data

In [2]:
# Imports & Dependencies
from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [3]:
# Update original data to include Normal NER Tags to prevent forgetting problem
nlp = spacy.load('en_core_web_sm')

In [4]:
# Checks entities if multiple truck locations
def count_locations(ents_arr):
    count = 0
    
    for i in range(0,len(ents_arr)):
        cur_ent_type = ents_arr[i][2]
        if cur_ent_type == 'TRUCK_LOCATION':
            count += 1
    
    return count
    
# Adds old NER tags to train data
def append_old_ner(train_data):
    for i in range(0,len(train_data)):
        doc = nlp(train_data[i][0])
        ents_arr = train_data[i][1]['entities']
        locations_count = count_locations(ents_arr)

        for ent in doc.ents:
            start = int(ent.start_char)
            end = int(ent.end_char)

            # Cases
            # < Start---End > Bound entire word and more
            case1 = lambda: start <= truck_start and end >= truck_end
            # < Start>---End< Bound entire left up to end
            case2 = lambda: start <= truck_start and end <= truck_end and end >= truck_start
            # >Start---<End > Bound entire right up to start
            case3 = lambda: end >= truck_end and start >= truck_start and start <= truck_end
            
            if locations_count == 0:
                ents_arr.append((start, end, ent.label_))
            else:
                continue_outer = False
                for j in range(0,locations_count):
                    truck_start = int(ents_arr[j][0])
                    truck_end = int(ents_arr[j][1])
                    
                    if case1() or case2() or case3():
                        continue_outer = True
                        break
                
                if continue_outer:
                    continue
                else:
                    ents_arr.append((start, end, ent.label_))
                    
    return train_data

train_data = append_old_ner(train_data)

In [5]:
# Validate old NER Tags
print(train_data[638])

('Serving the BBQ at the China Town Metro. Across from The Capital One Areena. Southern Fried Fish + Shrimp &amp; Cheesy… https://t.co/cG1aPYOBvr', {'entities': [(23, 39, 'TRUCK_LOCATION'), (57, 75, 'TRUCK_LOCATION'), (12, 15, 'ORG'), (77, 107, 'LAW'), (112, 118, 'ORG')]})


In [45]:
# Load model if exists
def load_model(model=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    
    return nlp

# List of Names Entities
# To be referenced later on
move_names = []

# Train update Spacy's NER
def train_NER(train_data, iterations, model=None):
    TRAIN_DATA = train_data
    
    # Set language to only Text in tweets
    nlp = load_model(model)
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pi
        
    # add new labels eg: using TRUCK_LOCATION
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(iterations):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
            
    return nlp

In [27]:
# Train Data
nlp = train_NER(train_data, 5)

Created blank 'en' model
Losses {'ner': 516.6613109470339}
Losses {'ner': 328.3013192364618}
Losses {'ner': 255.6622861891636}
Losses {'ner': 217.89849494819507}
Losses {'ner': 176.3370517709092}


In [28]:
# Review NER Results
def show_ents(doc):
    doc = nlp(doc[0])
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_)
    else:
        print('No entities found')

In [29]:
# Test across subset of tweets
# for i in range(0,10):
#     show_ents(train_data[i])
#     print("")
    
print(train_data[74])
print('')
show_ents(train_data[74])

('Bringing the smoothies to ballston', {'entities': [(26, 34, 'TRUCK_LOCATION')]})

ballston - TRUCK_LOCATION


In [35]:
# Manually test the trained model
def test_model(nlp, test_text):
    doc = nlp(test_text)
    print("Entities in '%s' \n" % test_text )
    for ent in doc.ents:
        print(ent.label_, ent.text)
        
test_model(nlp, "Today we are at 2nd Ave Parrish")

Entities in 'Today we are at 2nd Ave Parrish' 

DATE Today
TRUCK_LOCATION 2nd Ave Parrish


In [46]:
# Save model to output directory
def save_model(nlp, output_dir, name):
    if output_dir is not None:
        output_dir = Path(output_dir)
        
        if not output_dir.exists():
            output_dir.mkdir()
            
        nlp.meta[name] = name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

def test_saved_model(model_dir, test_text):
    # test the saved model
    print("Loading from", model_dir)
    nlp = spacy.load(model_dir)

    # Check the classes have loaded back consistently
    # assert nlp.get_pipe("ner").move_names == move_names
    if nlp.get_pipe('ner').move_names[0] != 'B-TRUCK_LOCATION':
        print('NER Pipe doesn\'t have Truck Location')
        return
    
    test_model(nlp, test_text)

In [48]:
save_model(nlp,'./tweet_ner_model','ner_1')

Saved model to tweet_ner_model


In [49]:
test_saved_model('./tweet_ner_model', "Today we are at 2nd Ave Parrish")

Loading from ./tweet_ner_model
Entities in 'Today we are at 2nd Ave Parrish' 

DATE Today
TRUCK_LOCATION 2nd Ave Parrish


In [26]:
# Imports & Dependencies
from __future__ import unicode_literals, print_function

from numpy import load

from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import class_weight

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


In [51]:
# Import Assertion Model
# Import NER Model
# Injest tweet
# Init Blank response model
# Assert it
#     If Pass
#         NER it
#           If NER has Locations
#             build location NER response
#           Else
#             build failed NER response
#     Else
#         build failed response
# Return response

class AssertModel:
    def __init__(self, ner_model_path, affirmation_model_path, scaler_data_path, max_len):
        self.max_len = max_len # Max Tweet Length
        self.scaler_data = self.__load_scaler_data(scaler_data_path, 'scaler_data.npy')
        print(self.scaler_data)
        try:
            self.nlp_ner = spacy.load(ner_model_path)
        except ModuleNotFoundError:
            raise
            
        try:
            self.nlp_affirmation = keras.models.load_model(affirmation_model_path, compile=False)
        except ModuleNotFoundError:
            raise
    
    #  Tokenization Methods
    def __tokenize_tweets(self, tokenizer, data):
        tokenizer.fit_on_texts(data)

        return tokenizer.texts_to_sequences(data)
    
    def __create_zeros_array(self, length):
        zeros_arr = []

        i = 0
        while i < length:
            zeros_arr.append(0)
            i += 1

        return zeros_arr

    def __pad_array(self,data, max_len):
        zeros_len = self.max_len - len(data)
        zeros_arr = self.__create_zeros_array(zeros_len)

        return [*data,*zeros_arr]
    
    def __load_scaler_data(self, directory, file):
        try:
            path = f'{directory}/{file}'
            data = load(path)
            
            return data
        except ModuleNotFoundError:
            raise

    def __scale_test_data(self, data):
        scaler_object = MinMaxScaler()
        scaler_object.fit(self.scaler_data)
        scale_data = scaler_object.transform(data)
        
        return scale_data
    
    def __predict_assertion(self,tweet):
        tokenizer = Tokenizer()
        tokenized_tweets = self.__tokenize_tweets(tokenizer, [tweet])
        predict_tweets = [self.__pad_array(data, self.max_len) for data in tokenized_tweets]

        scaled_test_data = self.__scale_test_data(predict_tweets)
        print(scaled_test_data)
        prediction = self.nlp_affirmation.predict_classes(scaled_test_data)
        
        print(prediction)
        if prediction[0] == 0:
            return True
        else:
            return False

    def predict_tweet(self, tweet):
        affirmation = self.__predict_assertion(tweet)
        print(affirmation)

In [53]:
model = AssertModel('./tweet_ner_model', './tweet_affirmation_model/test', './tweet_affirmation_model', 35)

model.predict_tweet("nononononononono")

[[ 17  21   4 ...   0   0   0]
 [  4  20  22 ...   0   0   0]
 [  9 252 725 ...   0   0   0]
 ...
 [  3  48  83 ...   0   0   0]
 [  7  28  90 ...   0   0   0]
 [ 17  21  35 ...   0   0   0]]
[[-0.00094073  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.        ]]
[0]
True
