In [1]:
import pickle
import en_core_web_sm
import spacy
import random
from spacy import displacy

In [2]:
filename = '../training_data'
infile = open(filename,'rb')
TRAIN_DATA = pickle.load(infile)
infile.close()

In [3]:
nlp=spacy.load('en_core_web_sm')

In [4]:
def train_model(train_data):
    if 'ner' not in nlp.pipe_names:
        ner=nlp.create_pipe('ner')
        nlp.add_pipe(ner,last=True)
    else:
        ner = nlp.get_pipe("ner")
    for _,annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])
    


    other_pipes = [pipe for pipe in nlp.pipe_names if pipe !='ner']
    # only train NER
    with nlp.disable_pipes(*other_pipes):
        optimizer=nlp.entity.create_optimizer()
        for itn in range(10):
            random.shuffle(train_data)
            losses = {}
            index=0
            for text,annotations in train_data:
                try:
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.2,
                        sgd=optimizer,  # dropout - make it harder to memorise data
                        losses=losses)
                except Exception as e:
                    pass
            print(losses)
        
    

In [5]:
train_model(TRAIN_DATA)

{'ner': 815.0031266910136}
{'ner': 649.6529669092852}
{'ner': 641.6542264722084}
{'ner': 684.9925741915504}
{'ner': 887.0387362321744}
{'ner': 776.2033724473877}
{'ner': 658.8202474864702}
{'ner': 633.8664638452638}
{'ner': 745.1894530615427}
{'ner': 691.9560195447184}


In [8]:
nlp.to_disk('../nlp_model')

In [2]:
nlp = spacy.load('../nlp_model')
doc = nlp("BAMPSL Securities is quoting ex-rights today. The company has announced a rights in the ratio of 2:1 at Re 1 per share on March 31, 2011. The record date has been fixed at April 20, 2011.")
for ent in doc.ents:
    print(ent.text,ent.label_)

ex-rights CA_TYPE
ratio of 2:1 at Re 1 per share on March 31, 2011. PURPOSE


In [4]:
displacy.serve(doc,style='ent')


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
