In [119]:
import spacy, random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example


In [120]:
nlp = spacy.blank('en')
source_nlp = spacy.load("en_core_web_sm")
activated = spacy.prefer_gpu()
nlp.add_pipe("ner", source=source_nlp)

<spacy.pipeline.ner.EntityRecognizer at 0x2b9eda80340>

In [121]:
# TODO: add all of the other train_data

train_data = [
    ("Adjusted Mean Change From Baseline in Fasting Plasma Glucose at Week 24 (Last Observation Carried Forward [LOCF])", 
    {"entities":[(0,20, "Change"),(26,34,"Reference"),(38,60,"Variable"), (64,71,"Timepoint")]}),
    ("Number of Participants With Abnormal Electrocardiogram (ECG) Interval", 
    {"entities":[(0,22, "Change"),(28,69,"Variable")]}),
    ("Number of Children With Documented Risk Factors for Type 2 Diabetes",
    {"entities": [(0,18,"Change"),(24,47,"Variable"), (52,67, "Condition")]}),
    ("Validation of SCOUT DS algorithm for detecting known type 2 diabetes",
    {"entities": [(0,10, "Reference"),(14,32, "Variable"),(53,70, "Condition")]})
]

In [122]:


examples = []
losses = {}
for text, annots in train_data:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))

    
optimizer = nlp.initialize(lambda: examples)

for i in range(300):
    random.shuffle(examples)
    for batch in minibatch(examples, size = compounding(1.0, 32.0, 1.001)):
        nlp.update(
            batch,
            sgd=optimizer,
            losses = losses,
        )
        

In [123]:
print("Losses: ", losses)

Losses:  {'ner': 501.94938702087904}


In [124]:
from spacy import displacy

for text, _ in train_data:
        target = nlp(text)
        for entity in target.ents:
                print(entity.text, entity.start_char, entity.end_char, entity.label_)
        displacy.render(nlp(target.text), style='ent')

Adjusted Mean Change 0 20 Change
Baseline 26 34 Reference
Fasting Plasma Glucose 38 60 Variable
Week 24 64 71 Timepoint


Number of Participants 0 22 Change
Abnormal Electrocardiogram (ECG) Interval 28 69 Variable


Number of Children 0 18 Change
Documented Risk Factors 24 47 Variable
Type 2 Diabetes 52 67 Condition


Validation 0 10 Reference
SCOUT DS algorithm 14 32 Variable
type 2 diabetes 53 68 Condition


In [125]:


target = nlp("Number of Participants with undiagnosed type 2 diabetes")
displacy.render(nlp(target.text), style='ent')

In [126]:
target = nlp("Number of Participants With Abnormal Electrocardiogram (ECG) Interval")
displacy.render(nlp(target.text), style='ent')

In [127]:
target = nlp("Change in HbA1c Baseline to End of Trial in TINSAL-T2D Stage 1")
displacy.render(nlp(target.text), style='ent')

In [128]:
nlp.to_disk("./test_model")

### Test to coninue Training

In [129]:
optimizer_resume = nlp.resume_training()


for i in range(500):
    random.shuffle(examples)
    for batch in minibatch(examples, size = compounding(1.0, 32.0, 1.001)):
        nlp.update(
            batch,
            sgd=optimizer_resume,
            losses = losses,
        )

In [130]:
print("Losses: ", losses)


for text, _ in train_data:
        target = nlp(text)
        for entity in target.ents:
                print(entity.text, entity.start_char, entity.end_char, entity.label_)
                displacy.render(nlp(target.text), style='ent')

Losses:  {'ner': 501.94938745875646}
Adjusted Mean Change 0 20 Change


Baseline 26 34 Reference


Fasting Plasma Glucose 38 60 Variable


Week 24 64 71 Timepoint


Number of Participants 0 22 Change


Abnormal Electrocardiogram (ECG) Interval 28 69 Variable


Number of Children 0 18 Change


Documented Risk Factors 24 47 Variable


Type 2 Diabetes 52 67 Condition


Validation 0 10 Reference


SCOUT DS algorithm 14 32 Variable


type 2 diabetes 53 68 Condition


In [131]:
nlp.to_disk("./test_model")