In [1]:
#Training for the NER component done via command line and config file.
# In the directory containing the base_config.cfg file run command:
# python -m spacy train base_config.cfg --output ./base_model
#
# with GPU: python -m spacy train base_config.cfg --output ./base_model --gpu-id 0.

In [1]:
import torch
import spacy
from spacy.scorer import Scorer

In [2]:
spacy.prefer_gpu()
nlp = spacy.load('../base_model/model-last')

In [3]:
print(torch.cuda.memory_reserved(0))
torch.cuda.get_device_properties(0).total_memory

1088421888


8589606912

## Training the entity linker

In [4]:
import json

In [5]:
with open('../data/mp_train_el.json',"r", encoding = "utf8") as f:
    train = json.load(f)

with open('../data/mp_val_el.json',"r", encoding = "utf8") as f:
    val = json.load(f)

In [6]:
# reformatting labelled data into appropriate format for training the NEL component
def data_creator(data):
    dataset = []
    for line in data:
        try:
            if line != []:
                example = line
                text = line[0]
                for entities in line[1]["links"]:
                    qid = entities[3]
                    links_dict = {qid: 1.0}
                    offset = (entities[0], entities[1])
                    entities = [(entities[0], entities[1], entities[2])]
                dataset.append((text,{'links': {offset: links_dict}, 'entities': entities}))
        except:
            pass
    return(dataset)

In [7]:
train_data = data_creator(train)
val_data = data_creator(val)

In [8]:
# adding tagger and parser components to the pipeline. We can use pretrained versions from the en_core_web_trf transformers like our NER and NEL component
source = spacy.load('en_core_web_trf')
tagger = nlp.add_pipe('tagger', before = 'ner', source = source)
parser = nlp.add_pipe('parser', before = 'ner', source = source)

ValueError: [E007] 'tagger' already exists in pipeline. Existing names: ['transformer', 'tagger', 'parser', 'ner']

In [10]:
attr_r = nlp.add_pipe('attribute_ruler', before = 'ner', source = source)

In [11]:
from spacy.training import Example

In [12]:
train_data

[('MINDEF Cyber Work Learn Scheme.',
  {'links': {(0, 6): {'Ministry of Defence ': 1.0}},
   'entities': [(0, 6, 'ORG')]}),
 ("if WP supporters adhered to safe distancing rules and were able to be more organised, it would have been super impressive and boosted WP's image even further haha",
  {'links': {(134, 136): {"Workers' Party": 1.0}},
   'entities': [(134, 136, 'ORG')]}),
 ('Aunty on MRT insulting Singaporeans and complaining about everything under the Sun',
  {'links': {(79, 82): {'Sun Xueling': 1.0}},
   'entities': [(79, 82, 's_pol')]}),
 ("There was only 1 warzone match my ping was at 160+... at least it's better than before.  Hope it gets better for u too.",
  {'links': {(46, 49): {'The Straits Times': 1.0}},
   'entities': [(46, 49, 'ORG')]}),
 ('Government monitoring hospital capacity closely as Covid-19 cases remain high: Ong Ye Kung',
  {'links': {(79, 90): {'Ong Ye Kung': 1.0}},
   'entities': [(79, 90, 's_pol')]}),
 ('Straits Times: Parliament: All bus services in the 

In [23]:
#converting data into Spacy training example format
TRAIN_EXAMPLES = []
for text, annotation in tqdm(train_data):
    example = Example.from_dict(nlp.make_doc(text), annotation)
    example.reference = nlp(example.reference)
    TRAIN_EXAMPLES.append(example)


100%|██████████| 9956/9956 [05:59<00:00, 27.73it/s]


In [24]:
from spacy.ml.models import load_kb

In [25]:
entity_linker = nlp.add_pipe("entity_linker", config={"incl_prior": False}, last=True)


ValueError: [E007] 'entity_linker' already exists in pipeline. Existing names: ['transformer', 'tagger', 'parser', 'attribute_ruler', 'ner', 'entity_linker']

In [26]:
entity_linker.initialize(get_examples=lambda: TRAIN_EXAMPLES, kb_loader=load_kb('../assets/mp_kb'))

In [27]:
import random
from spacy.util import minibatch, compounding

In [28]:
from tqdm import tqdm

In [29]:
%%time

with nlp.select_pipes(enable=["entity_linker"]):
    optimizer = nlp.resume_training()
    for itn in tqdm(range(200)):
        random.shuffle(TRAIN_EXAMPLES)
        batches = minibatch(TRAIN_EXAMPLES, size=compounding(256,528, 1.001))  # increasing batch sizes
        losses = {}
        for batch in batches:
            nlp.update(batch,
                       drop=0.1,      # prevent overfitting with dropout
                       losses=losses,
                       sgd=optimizer)
        if itn % 50 == 0:
            print(itn, "Losses", losses)   # print the training loss
print(itn, "Losses", losses)


  0%|          | 1/200 [00:46<2:33:17, 46.22s/it]

0 Losses {'entity_linker': 39.49820387363434}


 26%|██▌       | 51/200 [35:34<2:03:53, 49.89s/it]

50 Losses {'entity_linker': 39.40390223264694}


 50%|█████     | 101/200 [1:10:03<1:02:41, 38.00s/it]

100 Losses {'entity_linker': 39.46898150444031}


 76%|███████▌  | 151/200 [1:42:25<35:01, 42.90s/it]  

150 Losses {'entity_linker': 39.43627601861954}


100%|██████████| 200/200 [2:11:50<00:00, 39.55s/it]

199 Losses {'entity_linker': 39.45754611492157}
Wall time: 2h 11min 50s





KeyboardInterrupt: 

In [None]:
# Evaluating

In [30]:
VAL_EXAMPLES = []
for text, annotation in val_data:
    example = Example.from_dict(nlp.make_doc(text), annotation)
    example.predicted = nlp(example.predicted)
    #print(example.reference)
    VAL_EXAMPLES.append(example)

In [31]:
scorer = Scorer(nlp)

In [32]:
scores = scorer.score(VAL_EXAMPLES)

In [33]:
scores

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'tag_acc': None,
 'sents_p': None,
 'sents_r': None,
 'sents_f': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_micro_f': None,
 'morph_per_feat': None,
 'lemma_acc': None,
 'ents_p': 0.5030814142069413,
 'ents_r': 0.6519546027742749,
 'ents_f': 0.5679238374221897,
 'ents_per_type': {'ORG': {'p': 0.7853164556962026,
   'r': 0.9961464354527938,
   'f': 0.8782559456398642},
  'PERSON': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  's_pol': {'p': 0.0, 'r': 0.0, 'f': 0.0}},
 'nel_score': 0.9892148403796377,
 'nel_score_desc': 'micro F',
 'nel_micro_p': 1.0,
 'nel_micro_r': 0.9786598378147674,
 'nel_micro_f': 0.9892148403796377,
 'nel_macro_p': 1.0,
 'nel_macro_r': 0.9684343434343434,
 'nel_macro_f': 0.983702737940026,
 'nel_f_per_type': {'ORG': {'p': 1.0, 'r': 1.0, 'f': 1.0},
  's_pol': {'p': 1.0, 'r': 0.9368686868686

In [34]:
nlp.to_disk('../model')