In [9]:
import spacy, random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example


In [10]:
nlp = spacy.blank('en')
source_nlp = spacy.load("en_core_web_sm")
activated = spacy.prefer_gpu()
nlp.add_pipe("ner", source=source_nlp)

<spacy.pipeline.ner.EntityRecognizer at 0x1efa4dc2dc0>

In [11]:
# TODO: add all of the other train_data

train_data = [
    ("Adjusted Mean Change From Baseline in Fasting Plasma Glucose at Week 24 (Last Observation Carried Forward [LOCF])", 
    {"entities":[(0,20, "Change"),(26,34,"Reference"),(38,60,"Variable"), (64,71,"Timepoint")]}),
    ("Number of Participants With Abnormal Electrocardiogram (ECG) Interval", 
    {"entities":[(0,22, "Change"),(28,69,"Variable")]}),
    ("Number of Children With Documented Risk Factors for Type 2 Diabetes",
    {"entities": [(0,18,"Change"),(24,47,"Variable"),(52,67, "Condition")]}),
    ("Validation of SCOUT DS algorithm for detecting known type 2 diabetes",
    {"entities": [(0,10, "Reference"),(14,32, "Variable"),(53,68, "Condition")]}),
    ("The main outcome is the comparison of total volumetric bone mineral density (vBMD) at the tibia and distal radius",
    {"entities":[(9,16, "Change"), (24, 34, "Reference"),(38,76, "Variable")]}),
    ("Number of Participants with undiagnosed type 2 diabetes", {"entities": [(0, 22,"Change"), (40, 55, "Condition")]}),
    ("2 months", {"entities":[(0,8,"Timepoint")]}),
    ("Explore if there was any difference in the number of participants with a high risk of developing type 2 diabetes in the risk test only (RTO)- group and the group that also offered a HbA1c-measurement (HbA1c-group)", 
    {"entities":[(43,65, "Change"), (86,101, "Condition"),(172,204, "Variable")]}),
    ("Baseline", {"entities":[(0,8,"Reference")]}),
    ("Correlation between NF-кB dependent-proinflammation markers and osteoblast-specific gene expression in the MSC to measure the effects of NF-кB dependent-proinflammation on differentiation potential toward osteoblast in type 2 diabetes.",
    {"entities":[(0,11,"Reference"),(20,59,"Variable"), (64,110,"Variable"), (219,234,"Condition")]}),
    ("2-4 weeks", {"entities":[(0,9,"Timepoint")]}),
    ("Characteristic of type 2 diabetes patients, treated with insulin in Guadeloupe", 
    {"entities":[(0,14,"Change"), (18,33,"Condition"), (34,42,"Reference"), (57, 64, "Variable")]}),
    ("Trough study completion, an average of 2 years", {"entities": [(28,46,"Timepoint")]}),
    ("Describe the relation between the rate of HbAC and the plan of insulinothérapie",
    {"entities":[(13,21,"Reference"),(34,46,"Variable"),(55,79,"Variable")]}),
    ("Change in baseline A1C (glycated hemoglobin) at 12 months",{"entities":[(0,6,"Change"),(10,18,"Reference"),(19,44,"Variable"),(48,57, "Timepoint")]})
]

In [12]:


examples = []
losses = {}
for text, annots in train_data:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))

    
optimizer = nlp.initialize(lambda: examples)

for i in range(100):
    random.shuffle(examples)
    for batch in minibatch(examples, size = compounding(1.0, 32.0, 1.001)):
        nlp.update(
            batch,
            sgd=optimizer,
            losses = losses,
        )
        



In [13]:
print("Losses: ", losses)

Losses:  {'ner': 632.7874572003294}


In [14]:
from spacy import displacy

for text, _ in train_data:
        target = nlp(text)
        for entity in target.ents:
                print(entity.text, entity.start_char, entity.end_char, entity.label_)
        displacy.render(nlp(target.text), style='ent')

Adjusted Mean Change 0 20 Change
Baseline 26 34 Reference
Fasting Plasma Glucose 38 60 Variable
Week 24 64 71 Timepoint


Number of Participants 0 22 Change
Abnormal Electrocardiogram (ECG) Interval 28 69 Variable


Number of Children 0 18 Change
Documented Risk Factors 24 47 Variable
Type 2 Diabetes 52 67 Condition


Validation 0 10 Reference
SCOUT DS algorithm 14 32 Variable
type 2 diabetes 53 68 Condition


outcome 9 16 Change
comparison 24 34 Reference


Number of Participants 0 22 Change
type 2 diabetes 40 55 Condition


2 months 0 8 Timepoint


number of participants 43 65 Change
developing type 86 101 Condition


Baseline 0 8 Reference


Correlation 0 11 Reference
NF-кB dependent-proinflammation markers 20 59 Variable
osteoblast-specific gene expression in the MSC 64 110 Variable
type 2 diabetes 219 234 Condition


2-4 weeks 0 9 Timepoint


Characteristic 0 14 Change
type 2 diabetes 18 33 Condition
patients 34 42 Reference
insulin 57 64 Variable


average of 2 years 28 46 Timepoint


relation 13 21 Reference
rate of HbAC 34 46 Variable
plan of insulinothérapie 55 79 Variable


Change 0 6 Change
baseline 10 18 Reference
A1C (glycated hemoglobin) 19 44 Variable
12 months 48 57 Timepoint


In [15]:


target = nlp("Number of Participants with undiagnosed type 2 diabetes")
displacy.render(nlp(target.text), style='ent')

In [16]:
target = nlp("Number of Participants With Abnormal Electrocardiogram (ECG) Interval")
displacy.render(nlp(target.text), style='ent')

In [17]:
target = nlp("Change in HbA1c Baseline to End of Trial in TINSAL-T2D Stage 1")
displacy.render(nlp(target.text), style='ent')

In [18]:
nlp.to_disk("./test_model")

### Test to coninue Training

In [19]:
optimizer_resume = nlp.resume_training()


for i in range(100):
    random.shuffle(examples)
    for batch in minibatch(examples, size = compounding(1.0, 32.0, 1.001)):
        nlp.update(
            batch,
            sgd=optimizer_resume,
            losses = losses,
        )

In [20]:
print("Losses: ", losses)


for text, _ in train_data:
        target = nlp(text)
        for entity in target.ents:
                print(entity.text, entity.start_char, entity.end_char, entity.label_)
                displacy.render(nlp(target.text), style='ent')

Losses:  {'ner': 632.7874616102115}
Adjusted Mean Change 0 20 Change


Baseline 26 34 Reference


Fasting Plasma Glucose 38 60 Variable


Week 24 64 71 Timepoint


Number of Participants 0 22 Change


Abnormal Electrocardiogram (ECG) Interval 28 69 Variable


Number of Children 0 18 Change


Documented Risk Factors 24 47 Variable


Type 2 Diabetes 52 67 Condition


Validation 0 10 Reference


SCOUT DS algorithm 14 32 Variable


type 2 diabetes 53 68 Condition


outcome 9 16 Change


comparison 24 34 Reference


Number of Participants 0 22 Change


type 2 diabetes 40 55 Condition


2 months 0 8 Timepoint


number of participants 43 65 Change


developing type 86 101 Condition


Baseline 0 8 Reference


Correlation 0 11 Reference


NF-кB dependent-proinflammation markers 20 59 Variable


osteoblast-specific gene expression in the MSC 64 110 Variable


type 2 diabetes 219 234 Condition


2-4 weeks 0 9 Timepoint


Characteristic 0 14 Change


type 2 diabetes 18 33 Condition


patients 34 42 Reference


insulin 57 64 Variable


average of 2 years 28 46 Timepoint


relation 13 21 Reference


rate of HbAC 34 46 Variable


plan of insulinothérapie 55 79 Variable


Change 0 6 Change


baseline 10 18 Reference


A1C (glycated hemoglobin) 19 44 Variable


12 months 48 57 Timepoint


In [21]:
nlp.to_disk("./test_model")