In [None]:
import spacy
import random
from spacy.util import minibatch, compounding

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

In [None]:
# Fetch the restaurant training corpus and cache its text in data.txt so the
# following cells can work from the local copy.
url = "https://groups.csail.mit.edu/sls/downloads/restaurant/restauranttrain.bio"
# A timeout keeps the cell from hanging forever on a stalled connection.
html = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer; otherwise the body of an error page would be
# written to data.txt and parsed as if it were training data.
html.raise_for_status()
html_page = html.content

# NOTE(review): the payload is run through the HTML parser only to obtain its
# text; kept as-is so the decoded content written to disk is unchanged.
text = BeautifulSoup(html_page, 'html.parser')
with open('data.txt','w') as f:
  f.write(text.text)

In [None]:
# Split data.txt into sentences. Consecutive non-blank lines belong to one
# sentence; a blank line ends it. Each sentence is stored as the concatenation
# of its raw lines (newlines included) for the next cell to parse.
sentences = []
sentence = []
with open('data.txt','r') as f:
  for line in f:
    if line.strip():
      sentence.append(line)
    elif sentence:
      # Blank line closes the current sentence. The `elif sentence` guard
      # ignores repeated blank lines so no empty string is ever emitted
      # (an empty sentence would make the CSV parse in the next cell fail).
      sentences.append(''.join(sentence))
      sentence = []

# The file may not end with a blank line; flush the sentence still being
# collected so the final example is not silently dropped.
if sentence:
  sentences.append(''.join(sentence))
  sentence = []


In [None]:
# Convert every raw sentence (tab-separated "Tag<TAB>Word" lines) into a
# training example of the form (text, {'entities': [(start, end, tag), ...]})
# where start/end are character offsets into the space-joined text.
# `label` collects every distinct tag in order of first appearance.
train_data = []
label = []
for sentence in sentences:
  # Read every field verbatim as a string:
  #   dtype=str             -> numeric-looking tokens ("5", "007") stay text
  #   keep_default_na=False -> tokens such as "nan", "null", "n/a" are not
  #                            turned into float NaN (which breaks the join)
  #   quoting=QUOTE_NONE    -> a '"' inside a token is data, not a quote mark
  res = pd.read_csv(
      StringIO(sentence),
      delimiter='\t',
      header=None,
      dtype=str,
      keep_default_na=False,
      quoting=csv.QUOTE_NONE,
  )
  res.columns = ['Tag','Word']
  res['Word_Length'] = res['Word'].str.len()
  # Each word is followed by one joining space, so a word starts at the sum of
  # (length + 1) over all preceding words; the first word starts at 0.
  res['Word_Start'] = ((res['Word_Length']+1).cumsum()).shift(1,fill_value=0)
  res['Word_End'] = res['Word_Start'] + res['Word_Length']
  # All words are strings now, so single-word sentences need no special case.
  text = ' '.join(res['Word'])
  list_entities = []
  for tag, start, end in zip(res['Tag'], res['Word_Start'], res['Word_End']):
    # NOTE(review): 'O' is recorded in `label` too, and every tagged token
    # becomes its own span carrying the raw B-/I- tag; both kept unchanged so
    # downstream cells see the same labels as before.
    if tag not in label:
      label.append(tag)
    if tag != 'O':
      # Plain Python ints instead of numpy integers for the offsets.
      list_entities.append((int(start), int(end), tag))
  train_data.append((text,{'entities':list_entities}))


In [None]:
# Start from an empty English pipeline and attach a fresh NER component.
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

# Register every tag gathered while the training examples were built.
for tag_name in label:
  ner.add_label(tag_name)

optimizer = nlp.begin_training()

# Names of all pipes except NER; they are disabled while training runs.
other_pipes = list(filter(lambda name: name != 'ner', nlp.pipe_names))
with nlp.disable_pipes(*other_pipes):
    for epoch in range(30):
        # New example order and a fresh loss accumulator for every pass.
        random.shuffle(train_data)
        losses = {}
        # The batch-size schedule is recreated each pass, so it restarts at 4.
        batch_sizes = compounding(4., 32., 1.001)
        for batch in minibatch(train_data, size=batch_sizes):
            # Separate the (text, annotation) pairs into two parallel tuples.
            texts = tuple(example[0] for example in batch)
            annotations = tuple(example[1] for example in batch)
            nlp.update(
                texts,
                annotations,
                sgd=optimizer,
                drop=0.35,
                losses=losses,
            )
        print('Losses', losses)


Losses {'ner': 21978.942205010993}
Losses {'ner': 15454.985520238266}
Losses {'ner': 13581.910265806877}
Losses {'ner': 12373.159804072391}
Losses {'ner': 11634.135050100598}
Losses {'ner': 11134.175764689579}
Losses {'ner': 10547.605561620534}
Losses {'ner': 10291.67309443081}
Losses {'ner': 9764.054894283905}
Losses {'ner': 9601.554304712074}
Losses {'ner': 9186.407680712146}
Losses {'ner': 9017.84045280131}
Losses {'ner': 8734.428664283116}
Losses {'ner': 8548.251742810628}
Losses {'ner': 8488.69488681472}
Losses {'ner': 8278.635534887526}
Losses {'ner': 8221.475480006113}
Losses {'ner': 8040.667739010552}
Losses {'ner': 7634.014516340962}
Losses {'ner': 7660.055042908127}
Losses {'ner': 7487.649212316109}
Losses {'ner': 7416.833368636181}
Losses {'ner': 7310.52988831355}
Losses {'ner': 7234.063362982268}
Losses {'ner': 7007.7794007783905}
Losses {'ner': 7049.773356238586}
Losses {'ner': 6819.635488878275}
Losses {'ner': 6877.326699053507}
Losses {'ner': 6700.064797591431}
Losses {'

In [None]:
# Smoke test: run the trained pipeline on one sample query and print each
# predicted entity as "<label> <text>".
test_text = 'a four star restaurant with a bar'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for entity in doc.ents:
  print(entity.label_, entity.text)


Entities in 'a four star restaurant with a bar'
B-Rating four
I-Rating star
B-Amenity bar


In [None]:
# Write the trained pipeline to disk under the path 'NLP Training'.
save_path = 'NLP Training'
nlp.to_disk(save_path)


In [None]:
# Reload the pipeline saved under 'NLP Training' and repeat the prediction on
# the same sample text using the restored copy.
nlp2 = spacy.load('NLP Training')
doc2 = nlp2(test_text)
for entity in doc2.ents:
  print(entity.label_, entity.text)

B-Rating four
I-Rating star
B-Amenity bar
