In [5]:
#Import all required libraries
import random
import json
from tqdm import tqdm

import spacy
from spacy import displacy
from spacy.tokens import DocBin
from spacy.tokens import Span

from sklearn.model_selection import train_test_split

***********
### Data preparation step

Load the annotated data and convert it to the spaCy v3 format.

In [6]:
def convert_to_NER_format(input_data):
    """
    Convert annotated sentences into spaCy-style NER training pairs.

    Parameters
    ----------
    input_data : iterable of dict
        Each item must provide a "text" entry and a "labels" entry. Every
        label is a sequence whose third element (index 2) is the label name.

    Returns
    -------
    spacy_NER_data : list
        One ``[text, {'entities': [tuple(label), ...]}]`` pair per sentence,
        in input order.
    labels : list
        The distinct label names, in order of first appearance.
    """
    spacy_NER_data = []
    # A dict is used as an ordered set so the label list is reproducible
    # between runs (a plain set has arbitrary iteration order).
    seen_labels = {}
    # Iterate over the argument, not over a notebook-level global, so the
    # function works for any dataset and on a freshly restarted kernel.
    for sentence in input_data:
        annotations = [tuple(label) for label in sentence['labels']]
        for annotation in annotations:
            seen_labels.setdefault(annotation[2], None)
        spacy_NER_data.append([sentence["text"], {'entities': annotations}])

    return spacy_NER_data, list(seen_labels)

def convert_NER_data_spacy3(input_data, path_to_save, pipeline=None):
    """
    Serialize NER training examples into a spaCy v3 ``DocBin`` file.

    Parameters
    ----------
    input_data : iterable
        ``(text, annot)`` pairs where ``annot["entities"]`` holds
        ``(start, end, label)`` character-offset triples.
    path_to_save : str or path-like
        Destination handed to ``DocBin.to_disk``.
    pipeline : optional
        Object whose ``make_doc`` turns a text into a ``Doc``. When omitted,
        the notebook-level ``nlp`` is used, so existing calls keep working.

    Notes
    -----
    Entities that cannot be mapped to a span, or that overlap a longer
    entity after alignment, are reported with ``print`` and left out.
    """
    # Fall back to the global `nlp` for backward compatibility; passing the
    # pipeline explicitly avoids depending on cell execution order.
    doc_factory = nlp if pipeline is None else pipeline

    db = DocBin()  # container for the serialized docs
    for text, annot in tqdm(input_data):
        doc = doc_factory.make_doc(text)  # tokenized doc without annotations
        ents = []
        for start, end, label in annot["entities"]:  # character offsets
            span = doc.char_span(start, end, label=label, alignment_mode="expand")
            if span is None:
                print("Skipping entity", label, text[start:end])
            else:
                ents.append(span)

        # "expand" snaps offsets outwards to token boundaries, so neighbouring
        # annotations can end up sharing a token. `doc.ents` refuses
        # overlapping spans, therefore keep only a non-overlapping subset.
        non_overlapping = spacy.util.filter_spans(ents)
        kept_ids = {id(span) for span in non_overlapping}
        for span in ents:
            if id(span) not in kept_ids:
                print("Skipping overlapping entity", span.label_, span.text)

        doc.ents = non_overlapping  # attach the entity annotations
        db.add(doc)
    db.to_disk(path_to_save)  # write the DocBin to disk

In [8]:
# Location of the annotated corpus, relative to the notebook's working directory.
path_to_train_data = 'input_data.json'

# Read the file as UTF-8 explicitly: without `encoding`, open() uses the
# platform locale (e.g. cp1252 on Windows), which would mis-decode non-ASCII
# characters and shift the character offsets stored with the annotations.
with open(path_to_train_data, encoding="utf-8") as json_file:
    data = json.load(json_file)

In [9]:
# Reshape the raw annotations into (text, entities) pairs and collect the label set.
spacy_NER_data, data_labels = convert_to_NER_format(input_data=data)

# Hold out 20% of the examples for validation; the fixed random_state keeps
# the split identical between runs.
TRAIN_DATA, VALID_DATA = train_test_split(
    spacy_NER_data,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Blank "de" pipeline; convert_NER_data_spacy3 reads it through the global
# name `nlp` to create docs from raw text.
nlp = spacy.blank("de")

# Write one DocBin file per split.
for split_examples, docbin_path in (
    (TRAIN_DATA, "./train.spacy"),
    (VALID_DATA, "./valid.spacy"),
):
    convert_NER_data_spacy3(split_examples, path_to_save=docbin_path)

100%|███████████████████████████████████████████████████████████████████████████████| 171/171 [00:00<00:00, 305.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 271.69it/s]


************
### INIT CONFIG FILES

Create the config files used to train the models.
Additionally, the following were added:

- Wandb logger preferences
- vector lookups for the large (accuracy) model

In [6]:
import warnings
import os
import logging

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Only let WARNING and above through for the "spacy_lefff" logger.
logging.getLogger("spacy_lefff").setLevel(level=logging.WARNING)

# Sets WANDB_SILENT for this process and its children
# (presumably read by the Weights & Biases logger - confirm).
os.environ.update({'WANDB_SILENT': 'true'})

In [7]:
# One-off setup step, left commented out: generates the base training config
# for the accuracy-optimised NER pipeline.
# NOTE(review): config_demo_acc.cfg was presumably edited by hand afterwards
# (logger / vectors, per the notes above) - re-generating it would mean
# re-applying those edits. Confirm before uncommenting.
#!python -m spacy init config config_demo_acc.cfg --lang de --pipeline ner --optimize accuracy

In [8]:
# One-off setup step, left commented out: generates the base training config
# for the efficiency-optimised NER pipeline.
# NOTE(review): config_demo_efficiency.cfg may have been edited by hand
# afterwards (per the notes above) - confirm before uncommenting.
#!python -m spacy init config config_demo_efficiency.cfg --lang de --pipeline ner --optimize efficiency

************
### TRAIN CLI


In [9]:
# Train from config_demo_efficiency.cfg; the --paths.* options override the
# train/dev corpus paths with the DocBin files written earlier, and the
# trained pipelines are stored under ./ner_efficiency/training/.
!python -m spacy train config_demo_efficiency.cfg --output ./ner_efficiency/training/ --paths.train train.spacy --paths.dev valid.spacy

[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001

E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     62.67    0.17    0.14    0.23    0.00
  1     200        154.03   2033.13   90.78   90.27   91.30    0.91
  2     400        121.05    352.87   94.02   94.46   93.59    0.94
  3     600        175.35    341.57   93.76   93.02   94.51    0.94
  5     800        218.10    314.23   94.61   96.88   92.45    0.95
  7    1000       4192.30    459.20   95.71   96.95   94.51    0.96
 10    1200        147.29    123.14   96.06   97.19   94.97    0.96
 13    1400        176.29    111.08   96.30   97.42   95.19    0.96
 17    1600        200.99     84.53   95.32   95.00   95.65    0.95
 22    1800        153.28     71.55   95.14   96.25   94.05    0.95
 28    2000        159.61     61.53   94.80   95.79   93.82    0.95
 35

2021-08-01 13:58:27.808814: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-08-01 13:58:27.808851: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2021-08-01 13:58:31,742] [INFO] Set up nlp object from config
[2021-08-01 13:58:32,546] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-08-01 13:58:32,551] [INFO] Created vocabulary
[2021-08-01 13:58:32,551] [INFO] Finished initializing nlp object
[2021-08-01 13:58:34,439] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [14]:
# Train from config_demo_acc.cfg on the same train/dev DocBin files; the
# trained pipelines are stored under ./ner_acc/training/.
!python -m spacy train config_demo_acc.cfg --output ./ner_acc/training/ --paths.train train.spacy --paths.dev valid.spacy

[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001

E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     62.67    0.00    0.00    0.00    0.00
  1     200         90.35   1562.25   91.70   93.78   89.70    0.92
  2     400         35.55    393.02   95.56   95.02   96.11    0.96
  3     600        405.05    394.51   93.48   91.83   95.19    0.93
  5     800         62.10    285.57   95.16   95.82   94.51    0.95
  7    1000        426.11    216.40   96.58   96.15   97.03    0.97
 10    1200        206.35    243.82   94.70   95.36   94.05    0.95
 13    1400         55.00    112.97   95.18   95.40   94.97    0.95
 17    1600         68.92    111.45   95.19   95.19   95.19    0.95
 22    1800         23.94     31.66   96.66   97.44   95.88    0.97
 28    2000          5.61      3.74   96.67   97.00   96.34    0.97
 35

2021-08-01 14:08:46.714073: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-08-01 14:08:46.714102: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2021-08-01 14:08:49,238] [INFO] Set up nlp object from config
[2021-08-01 14:08:49,681] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-08-01 14:08:49,684] [INFO] Created vocabulary
[2021-08-01 14:08:52,140] [INFO] Added vectors: de_core_news_lg
[2021-08-01 14:08:52,141] [INFO] Finished initializing nlp object
[2021-08-01 14:08:55,063] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


***************
### Visualization Part

In [10]:
# Load the trained pipeline used for the visualisation below.
# NOTE(review): "model" is neither of the training output directories used
# above (./ner_efficiency/training/, ./ner_acc/training/) - confirm that this
# path exists, otherwise the cell fails on a fresh checkout.
ner = spacy.load("model")

In [11]:
# Map every entity label to a "#rrggbb" colour for displaCy.
# A locally seeded generator plus a sorted label order makes the mapping
# identical on every run (the module-level `random` state is left untouched),
# so rendered outputs stay comparable between kernel restarts.
colour_rng = random.Random(42)
colour_dict = {
    label: "#%06x" % colour_rng.randint(0, 0xFFFFFF)
    for label in sorted(data_labels, key=str)
}

In [12]:
# Show the predicted entities for validation examples 10-19, restricted to
# the known labels and coloured with the label -> colour mapping.
for sentence, _ in VALID_DATA[10:20]:
    options = dict(ents=data_labels, colors=colour_dict)
    doc = ner(sentence)
    displacy.render(doc, style="ent", jupyter=True, options=options)