This notebook explores the named entity recognition (NER) model before integrating it into the final algorithm.

In [1]:
#Importing Libraries 

import spacy
import re
from spacy.tokens import DocBin
from tqdm import tqdm
import json
from spacy.cli.train import train


In [2]:
# Start from a blank English pipeline; the next cell uses it via nlp.make_doc().
nlp = spacy.blank("en")

# DocBin collects the annotated Doc objects so they can be serialized to disk.
db = DocBin()

# Load the annotated examples from the JSON file.
# The context manager closes the file handle as soon as parsing finishes
# (a bare open() leaves it open for the lifetime of the kernel), and the
# explicit encoding keeps decoding independent of the platform default.
with open('training_NER.json', encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

Now we will process the training data, convert it into spaCy Doc objects with annotated entities, and then save these documents in spaCy's DocBin format for training a Named Entity Recognition (NER) model.

In [3]:
# Start from a fresh DocBin so that re-running this cell does not append the
# same documents a second time to a container left over from an earlier run.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    # Create a spaCy Doc from the raw text (tokenization only).
    doc = nlp.make_doc(text)

    # Convert each (start, end, label) character-offset annotation into a Span.
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" shrinks misaligned offsets inward to token boundaries;
        # char_span returns None when no span can be formed from the offsets.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so the data can be corrected.
            print(f"Skipping entity {label!r} at [{start}:{end}]: {text[start:end]!r}")
        else:
            ents.append(span)

    # doc.ents rejects overlapping spans with a ValueError, so resolve overlaps
    # first (filter_spans prefers the longest span) and report any that are dropped.
    kept = spacy.util.filter_spans(ents)
    if len(kept) != len(ents):
        print(f"Dropped {len(ents) - len(kept)} overlapping entity span(s) in: {text!r}")
    doc.ents = kept
    db.add(doc)

# Serialize all annotated docs to the file consumed by `spacy train`.
db.to_disk("./training_data.spacy")

100%|██████████| 13/13 [00:00<00:00, 1405.74it/s]


In [4]:
# One-off setup step, left commented out: this command writes config.cfg for an
# English NER-only pipeline. The training cell below reads config.cfg, so
# uncomment and run this once if that file does not exist yet.
#! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency 

In [30]:
#Let's start our training
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores in the log are computed on the training data
# itself. They show the model has fit those examples, not that it generalizes;
# supply a held-out dev set for a meaningful evaluation.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     45.21    0.00    0.00    0.00    0.00
 33     200        191.76   1222.34  100.00  100.00  100.00    1.00
 75     400          0.00      0.00  100.00  100.00  100.00    1.00
126     600          0.82      0.27  100.00  100.00  100.00    1.00
191     800          0.00      0.00  100.00  100.00  100.00    1.00
266    1000          0.00      0.00  100.00  100.00  100.00    1.00
366    1200          0.00      0.00  100.00  100.00  100.00    1.00
466    1400          0.00      0.00  100.00  100.00  100.00    1.00
613    1600          0.00      0.00  100.00  100.00  100.00    1.00
813    1800          0.00      0.00  100.

In [31]:
# Load the trained pipeline from the "model-best" directory in the current
# working directory (the training command was run with --output ./).
nlp_ner = spacy.load("model-best") 

In [32]:
# Sample review text used to try out the trained NER pipeline.
sample_review = '''the waiter had no business being aggressive with me. plus the food was trash i got very sick
'''

# Run the trained pipeline on the sample text.
doc = nlp_ner(sample_review)

In [39]:
# Render the recognised entities inline in the notebook output.
spacy.displacy.render(doc, style="ent", jupyter=True)

# Map each entity's text to its label; if the same text occurs more than once,
# the label of the last occurrence is kept.
entity_dict = {ent.text: ent.label_ for ent in doc.ents}

# Show the resulting mapping.
print(entity_dict)


{'aggressive': 'LEGAL_ISSUES', 'sick': 'HEALTH_ISSUE'}
