spaCy is an open-source software library for advanced natural language processing.

In [82]:
import spacy

Load a pre-trained spaCy model

In [83]:

# "en_core_web_sm" is a small English pipeline trained on written web text
# (blogs, news, comments); it includes vocabulary, syntax and entities.
#
# If the model is not installed yet, download it first with:
#     python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

Named Entity Recognition

In [84]:
# Show the names of the components in the loaded pipeline, in order.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [85]:
# Show the entity labels the pipeline's "ner" component currently has.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

The default NER model has no FOOD label, so we add it as a new label.

In [86]:

# Fetch the named-entity recognizer component from the loaded pipeline so it
# can be inspected and updated in the following cells.
ner=nlp.get_pipe("ner")

 New label to add

In [87]:
# Entity label for food mentions. It is not among the model's default NER
# labels, and it must match the label string used in the TRAIN_DATA annotations.
LABEL = "FOOD"

Training examples in the required format

In [88]:

# Food NER corpus: short sentences about foods commonly eaten in Bangladesh and
# in Bangladeshi restaurants, grouped by category.
#
# Each row pairs a sentence with the food mention that opens it. Every mention
# starts at character 0, so its annotated span is (0, len(mention)); deriving
# the end offset from the mention keeps the annotation in step with the text.
_FOOD_SENTENCES = [
    # Staple foods
    ("Rice is Common Staple food in Bangladesh.", "Rice"),
    ("Khichuri is Staple food in Bangladesh.", "Khichuri"),
    ("Puffed rice is Staple food.", "Puffed rice"),
    ("Flattened Rice is a good probiotic food.", "Flattened Rice"),
    ("Dal is a split grain used in Bangladeshi cookery.", "Dal"),
    # Fish
    ("Ilish is National Fish in Bangladesh.", "Ilish"),
    ("Snakehead murrel is National Fish in Bangladesh.", "Snakehead murrel"),
    # Fruits
    ("Mango is Common Fruits in Bangladesh.", "Mango"),
    ("Jackfruit is National Fruits in Bangladesh.", "Jackfruit"),
    # Pitha
    ("Patishapta is classic pitha in Bangladesh.", "Patishapta"),
    ("Bhapa pitha are street side favorite winter synack for Bangladeshi People.", "Bhapa pitha"),
    ("Nakshi pitha", "Nakshi pitha"),
    ("Chitoi pitha", "Chitoi pitha"),
    ("Malpua pitha are Nobbano Utshob Pitha", "Malpua pitha"),
    # Sweets
    ("Bogurar doi Milk-based Sweets in Bangladesh.", "Bogurar doi"),
    ("Jilapi are common sweets.", "Jilapi"),
    ("Roshogolla", "Roshogolla"),
    ("Kalojam", "Kalojam"),
    # Street foods
    ("Shingara is common street food in Bangladesis.", "Shingara"),
    ("Samosa is common street food in Bangladesis.", "Samosa"),
    ("Chanachur is common street food in Bangladesis.", "Chanachur"),
    # Common restaurant foods
    ("Pizza is a common Restaurants food in bangladesh.", "Pizza"),
    ("Pasta is an italian recipe", "Pasta"),
    ("noodles", "noodles"),
    ("Sushi is extemely famous and expensive Japanese dish", "Sushi"),
    ("Chocolate soufflé is extremely famous french cuisine", "Chocolate soufflé"),
    ("Burgers are the most commonly consumed fastfood", "Burgers"),
    ("Frenchfries are considered too oily", "Frenchfries"),
]

# Training examples in the (text, {"entities": [(start, end, label)]}) format.
TRAIN_DATA = [
    (sentence, {"entities": [(0, len(mention), "FOOD")]})
    for sentence, mention in _FOOD_SENTENCES
]


In [89]:

# Register the new entity label with the NER component. The pretrained model
# has no "FOOD" label, so it must be added before any update that uses it.
ner.add_label(LABEL)

# Resume training from the existing weights and keep the optimizer it returns.
optimizer = nlp.resume_training()
# Snapshot of the NER transition names (not used further in the cells shown here).
move_names = list(ner.move_names)

# Pipes that are allowed to be updated during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# Every other pipe should remain unaffected, so it is disabled while training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [90]:
from spacy.training.example import Example

# Accumulator for per-component losses; nlp.update() adds to it in place.
# It must exist before the first update call, otherwise this cell raises
# NameError on a fresh kernel.
losses = {}

# One preliminary pass over the data, updating the model one example at a time.
for batch in spacy.util.minibatch(TRAIN_DATA, size=2):
    for text, annotations in batch:
        # Pair an unannotated Doc with its gold-standard annotations
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        # Update the model
        nlp.update([example], losses=losses, drop=0.3)



In [91]:

from spacy.util import minibatch, compounding
import random

# Train with every pipe listed in `other_pipes` disabled; they are restored
# when the `with` block exits. `select_pipes` replaces `disable_pipes`, which
# is deprecated in spaCy v3.
with nlp.select_pipes(disable=other_pipes):
    # Batch-size schedule passed to minibatch(): start 1.0, stop 4.0, compound 1.001
    sizes = compounding(1.0, 4.0, 1.001)
    # Training for 30 iterations
    for itn in range(30):
        # Shuffle the examples (in place) before each pass
        random.shuffle(TRAIN_DATA)
        # Batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=sizes)
        # Dictionary to store the losses accumulated during this iteration
        losses = {}
        for batch in batches:
            # Build one Example per (text, annotations) pair in the batch
            examples = [
                Example.from_dict(nlp.make_doc(text), annots)
                for text, annots in batch
            ]
            # Update the model on the whole batch
            nlp.update(examples, drop=0.5, losses=losses)

In [102]:
# Try the updated NER on one sentence and list the entities it finds.
# Alternative input to try: "Rice is common food in Bangladesh"
sample_text = "Bhapa pitha are street side favorite winter synack for Bangladeshi People."
doc = nlp(sample_text)

header = "Entities in '%s'" % sample_text
print(header)
for entity in doc.ents:
    print(entity)

Entities in 'Bhapa pitha are street side favorite winter synack for Bangladeshi People.'
Bhapa pitha


In [103]:
from spacy import displacy

# Visualize the entities found in `doc` using displaCy's entity style.
displacy.render(doc, style='ent')