In [1]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object


In [2]:
import json
f = open('./data/dataset.json')
TRAIN_DATA = json.load(f)


In [5]:
for text, annot in tqdm(TRAIN_DATA[0: 4500]): 
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents 
    db.add(doc)

db.to_disk("./data/training_data.spacy") # save the docbin object

for text, annot in tqdm(TRAIN_DATA[4500:]): 
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents 
    db.add(doc)

db.to_disk("./data/validating_data.spacy") # save the docbin object


100%|██████████| 4500/4500 [00:03<00:00, 1273.25it/s]
100%|██████████| 1588/1588 [00:00<00:00, 1737.99it/s]


In [6]:
! python -m spacy train config.cfg --output ./dist --paths.train ./data/training_data.spacy --paths.dev ./data/validating_data.spacy

[38;5;4mℹ Saving to output directory: dist[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-03-02 07:51:52,020] [INFO] Set up nlp object from config
[2022-03-02 07:51:52,037] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-03-02 07:51:52,043] [INFO] Created vocabulary
[2022-03-02 07:51:52,044] [INFO] Finished initializing nlp object
[2022-03-02 07:52:37,698] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     48.83    0.12    3.89    0.06    0.00
  0     200         70.04   1934.32   99.80   99.84   99.76    1.00
  0     400         12.56     15.90   99.98  100.00   99.96    1.00
  0     600         11.56     13.16   99.96   99.96   99.96    1.00
  0     800         24.15     20.20   99.98   99.98   99.98 

In [7]:
nlp_ner = spacy.load("./dist/model-best")

In [9]:
%%time
doc = nlp_ner("I want to cook something")
print([(ent.text, ent.label_) for ent in doc.ents])

[]
CPU times: user 7.2 ms, sys: 2.43 ms, total: 9.63 ms
Wall time: 18.2 ms
