In [10]:
import pandas as pd
import numpy as np
import spacy
import random
import json
import pickle
from spacy.training import Example
from spacy import displacy

In [2]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,sentence1,sentence2,sentence3,sentence4,sentence5
0,Sven did not like to drink alcohol under any c...,He often excused himself from events that woul...,"One evening, his girlfriend offered him a smal...","Sven decided to drink the beverage, hoping he ...","Unfortunately, the wine was not tasty enough t..."
1,Maddie went to work today.,"However, she didn't eat for most of the day.",Jen invited her to a party.,She tried to go but ended up passing out while...,Her friend had to take her home.
2,Heather needed to go back to school shopping f...,She doesn't get out of the house much because ...,Heather decided to start looking at online sales.,She found some really good deals shopping online.,Because of the good deals she got her shopping...
3,Andrew was taking his little cousins to ride g...,He called them to see if they were ready to be...,They were ready to be picked up.,So Andrew picked them all up and drove them to...,They all had fun riding go carts.
4,The grass in the front yard is dying.,It gets too much sun throughout the day.,We need to plant a tree so that the grass does...,I think we will look to get a maple tree.,We get in the car and head to get the tree.


In [3]:
#Combining all sentences into one paragraph

data['paragraph'] = data['sentence1']+' '+data['sentence2']+' '+data['sentence3']+' '+data['sentence4']+' '+data['sentence5']
data.drop(['sentence1','sentence2','sentence3','sentence4','sentence5'], axis=1, inplace=True)
data.iloc[0]['paragraph']

"Sven did not like to drink alcohol under any circumstances. He often excused himself from events that would include alcohol. One evening, his girlfriend offered him a small glass of wine. Sven decided to drink the beverage, hoping he may actually like it. Unfortunately, the wine was not tasty enough to change Sven's mind."

In [4]:
data.head()

Unnamed: 0,paragraph
0,Sven did not like to drink alcohol under any c...
1,"Maddie went to work today. However, she didn't..."
2,Heather needed to go back to school shopping f...
3,Andrew was taking his little cousins to ride g...
4,The grass in the front yard is dying. It gets ...


In [6]:
# Approach is to manually annotating the entities and train on the given dataset. 
# For annotation I used this tool https://manivannanmurugavel.github.io/annotating-tool/spacy-ner-annotator/ 
# This tool returns a docs and their entity labels with their offsets but we want list of tuples as inputs for spacy training pipeline.  

data.to_csv('data.txt',header=False, index=False)  # Using this file for annotation of entities.

In [12]:
# After annotation we got the following file 'doc.json'.
# Converting it into as per our requirement i.e. list of tuples.
# Following code snippet was referred from 
# https://github.com/ManivannanMurugavel/spacy-ner-annotator/blob/master/convert_spacy_train_data.py

with open('doc.json') as f:
    doc = json.load(f)

train_data = []
for para in doc:
    entities = [tuple(entity[:3]) for entity in para['entities']]
    train_data.append((para['content'],{'entities':entities}))
    
with open('doc.pkl','wb') as f:
     pickle.dump(train_data, f)
        
print(train_data[:5])

[('"Sven did not like to drink alcohol under any circumstances. He often excused himself from events that would include alcohol. One evening, his girlfriend offered him a small glass of wine. Sven decided to drink the beverage, hoping he may actually like it. Unfortunately, the wine was not tasty enough to change Sven\'s mind."', {'entities': [(312, 316, 'living_entity'), (276, 280, 'object'), (215, 223, 'object'), (183, 187, 'object'), (174, 179, 'object'), (143, 153, 'living_entity'), (117, 124, 'object'), (28, 35, 'object'), (1, 5, 'living_entity')]}), ('"Maddie went to work today. However, she didn\'t eat for most of the day. Jen invited her to a party. She tried to go but ended up passing out while she was there. Her friend had to take her home."', {'entities': [(190, 194, 'place'), (167, 173, 'living_entity'), (73, 76, 'living_entity'), (1, 7, 'living_entity')]}), ("Heather needed to go back to school shopping for her children. She doesn't get out of the house much because she do

In [19]:
nlp = spacy.blank('en')  # create blank Language class
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    nlp.add_pipe('ner', last=True)

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
n_iter = 20  #no of iterations
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        print("Statring iteration " + str(itn))
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update(
                [example],  # batch of annotations
                drop=0.1,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)
        
nlp.to_disk('model')

Statring iteration 0
{'ner': 1107.792653946908}
Statring iteration 1
{'ner': 420.27882592773597}
Statring iteration 2
{'ner': 277.41279000492517}
Statring iteration 3
{'ner': 337.1649602761418}
Statring iteration 4
{'ner': 161.67681647928381}
Statring iteration 5
{'ner': 90.71223637205419}
Statring iteration 6
{'ner': 61.49427974651371}
Statring iteration 7
{'ner': 59.87141201097892}
Statring iteration 8
{'ner': 50.055224549122904}
Statring iteration 9
{'ner': 42.201763373343546}
Statring iteration 10
{'ner': 44.51495187496945}
Statring iteration 11
{'ner': 33.28897600072449}
Statring iteration 12
{'ner': 35.904692413002856}
Statring iteration 13
{'ner': 93.56050995231955}
Statring iteration 14
{'ner': 16.0156660355891}
Statring iteration 15
{'ner': 15.62840182301043}
Statring iteration 16
{'ner': 76.19191825789858}
Statring iteration 17
{'ner': 20.64396235470604}
Statring iteration 18
{'ner': 20.896974480465396}
Statring iteration 19
{'ner': 20.718920559607334}


In [20]:
para = nlp("Hugo pushed a smaller man for no reason. The man hit Hugo with a stick. Hugo grabbed the man roughly. He threw the man to the ground. The man begged Hugo for mercy.")

displacy.render(para, style='ent', jupyter=True)

In [44]:
#final pipeline
nlp = spacy.load('model')
with open('example_input.txt','r') as f:
    paragraph = f.read()

doc = nlp(paragraph)

name_entity = []   #to store entity and its label
labels = ['living_entity', 'place', 'object']
for ent in doc.ents:
    name_entity.append([str(ent),ent.label_])
    
name_entity = np.array(name_entity)
name_entity = np.unique(name_entity, axis=0)

entities = {}
for i in labels:
    rows,cols = np.where(name_entity == i)
    entities[i] = list(name_entity[rows,0])

res = {'entities':entities}
with open('eg_output.json','w') as f:
    json.dump(res,f)

with open('eg_output.json','r') as f:
    a = json.load(f)

print(a)

{'entities': {'living_entity': ['Hugo', 'man', 'reason', 'smaller'], 'place': ['ground'], 'object': ['stick']}}


In [43]:
doc.ents

()