In [1]:
import spacy, random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example
from spacy import displacy
import pandas as pd


In [2]:
nlp = spacy.blank('en')
source_nlp = spacy.load("en_core_web_sm")
activated = spacy.prefer_gpu()
nlp.add_pipe("ner", source=source_nlp)

<spacy.pipeline.ner.EntityRecognizer at 0x203ba5b7b20>

In [3]:
def train_ner_model(): 

    train_data = [
    ("Adjusted Mean Change From Baseline in Fasting Plasma Glucose at Week 24 (Last Observation Carried Forward [LOCF])", 
    {"entities":[(0,20, "Change"),(26,34,"Reference"),(38,60,"Variable"), (64,71,"Timepoint")]}),
    ("Number of Participants With Abnormal Electrocardiogram (ECG) Interval", 
    {"entities":[(0,22, "Change"),(28,69,"Variable")]}),
    ("Number of Children With Documented Risk Factors for Type 2 Diabetes",
    {"entities": [(0,18,"Change"),(24,47,"Variable"),(52,67, "Condition")]}),
    ("Validation of SCOUT DS algorithm for detecting known type 2 diabetes",
    {"entities": [(0,10, "Reference"),(14,32, "Variable"),(53,68, "Condition")]}),
    ("The main outcome is the comparison of total volumetric bone mineral density (vBMD) at the tibia and distal radius",
    {"entities":[(9,16, "Change"), (24, 34, "Reference"),(38,82, "Variable")]}),
    ("Number of Participants with undiagnosed type 2 diabetes", {"entities": [(0, 22,"Change"), (40, 55, "Condition")]}),
    ("2 months", {"entities":[(0,8,"Timepoint")]}),
    ("Explore if there was any difference in the number of participants with a high risk of developing type 2 diabetes in the risk test only (RTO)- group and the group that also offered a HbA1c-measurement (HbA1c-group)", 
    {"entities":[(43,65, "Change"), (97,112, "Condition"),(120,147,"Variable"),(182,213, "Variable")]}),
    ("Baseline", {"entities":[(0,8,"Reference")]}),
    ("Correlation between NF-кB dependent-proinflammation markers and osteoblast-specific gene expression in the MSC to measure the effects of NF-кB dependent-proinflammation on differentiation potential toward osteoblast in type 2 diabetes.",
    {"entities":[(0,11,"Reference"),(20,59,"Variable"), (64,110,"Variable"), (219,234,"Condition")]}),
    ("2-4 weeks", {"entities":[(0,9,"Timepoint")]}),
    ("Characteristic of type 2 diabetes patients, treated with insulin in Guadeloupe", 
    {"entities":[(0,14,"Change"), (18,33,"Condition"), (34,42,"Reference"), (57, 64, "Variable")]}),
    ("Trough study completion, an average of 2 years", {"entities": [(28,46,"Timepoint")]}),
    ("Describe the relation between the rate of HbAC and the plan of insulinothérapie",
    {"entities":[(13,21,"Reference"),(34,46,"Variable"),(55,79,"Variable")]}),
    ("Change in baseline A1C (glycated hemoglobin) at 12 months",
    {"entities":[(0,6,"Change"),(10,18,"Reference"),(19,44,"Variable"),(48,57, "Timepoint")]}),
    ("12 months", {"entities":[(0,9,"Timepoint")]}),
    ("Change in baseline lipids at 12 months", {"entities":[(0,6,"Change"),(10,18,"Reference"),(19,25,"Variable"),(29,38,"Timepoint")]}),
    ("Change in baseline blood pressure at 12 months",{"entities":[(0,6,"Change"),(10,18,"Reference"),(19,33,"Variable"),(37,46,"Timepoint")]}),
    ("Change in baseline Quality of life at 12 months",{"entities":[(0,6,"Change"),(10,18,"Reference"),(19,34,"Variable"),(38,47,"Timepoint")]}),
    ("Change in baseline insulin satisfaction at 12 months",{"entities":[(0,6,"Change"),(10,18,"Reference"),(19,39,"Variable"),(43,52,"Timepoint")]}),
    ("Feasibility of implementing the intervention in primary care: Binary outcome (feasible / not feasible), as judged by the investigators",
    {"entities":[(32,44,"Variable"),(62,76,"Change"),(121,134,"Reference")]}),
    ("3 months",{"entities":[(0,8,"Timepoint")]}),
    ("Recall of personalised risk information",{"entities":[(0,6,"Reference"),(10,39,"Variable")]}),
    ("Intentions to make lifestyle changes",{"entities":[(19,36,"Variable")]}),
    ("Change in self-management behaviour",{"entities":[(0,6,"Change"),(10,35,"Variable")]}),
    ("Change in HbA1c Baseline to End of Trial in TINSAL-T2D Stage 1",{"entities":[(0,6,"Change"),(10,15,"Variable"),(16,24,"Reference")]}),
    ("14 week",{"entities":[(0,7,"Timepoint")]}),
    ("Change in HbA1c", {"entities":[(0,15,"Change")]}),
    ("Change From Baseline and Trends in Fasting Glucose Over Time", {"entities":[(0,6,"Change"),(12,20,"Reference"),(25,31,"Reference"),(35,50,"Variable")]}),
    ("Change in Lipids",{"entities":[(0,16,"Change")]}),
    ("Change From Baseline in 14-week Insulin", {"entities":[(0,6,"Change"),(12,20,"Reference"),(24,31,"Timepoint"),(32,39,"Variable")]}),
    ("Change in Insulin",{"entities":[(0,17,"Change")]})
]

    examples = []
    losses = {}
    for text, annots in train_data:
        examples.append(Example.from_dict(nlp.make_doc(text), annots))

        
    optimizer = nlp.initialize(lambda: examples)

    for i in range(100):
        random.shuffle(examples)
        for batch in minibatch(examples, size = compounding(1.0, 16.0, 1.001)):
            nlp.update(
                batch,
                sgd=optimizer,
                losses = losses,
            )
    print("Losses: ", losses)


    for text, _ in train_data:
            target = nlp(text)
            for entity in target.ents:
                    print(entity.text, entity.start_char, entity.end_char, entity.label_)
            displacy.render(nlp(target.text), style='ent')



    nlp.to_disk("../training/ner_model")

In [4]:
def resume_train_ner_model(train_data):
    nlp = spacy.load("../training/ner_model")

    examples = []
    losses = {}
    for text, annots in train_data:
        examples.append(Example.from_dict(nlp.make_doc(text), annots))

    optimizer_resume = nlp.resume_training()

    for i in range(10):
        random.shuffle(examples)
        for batch in minibatch(examples, size = compounding(1.0, 4.0, 1.01)):
            nlp.update(
                batch,
                sgd=optimizer_resume,
                losses = losses,
            )
    print("Losses: ", losses)

    nlp.to_disk("../training/ner_model")

In [5]:
def create_and_train_new_data():
    data_measurements = pd.read_csv("../training/baseline_measurements.csv")
    # TODO: remove unnecessary code if not needed -> replace with clean data
    data_measurements = data_measurements.drop(columns=["Unnamed: 0"], axis=1)

    data_measurements.dropna(inplace=True)

    change_train = from_csv_to_train_data(data_measurements, "Change", "units")
    resume_train_ner_model(change_train)
    print("Done training changes")

In [6]:
def from_csv_to_train_data(datadf, label, colname):
    output = []
    for index, row in datadf.iterrows():
        char_start = 0
        char_end = len(row[colname])
        output_row = (row[colname], {"entities":[(char_start, char_end, label)]})
        output.append(output_row)
    return output
        

In [7]:
train_ner_model()

Losses:  {'ner': 739.2882857930664}
Adjusted Mean Change 0 20 Change
Baseline 26 34 Reference
Fasting Plasma Glucose 38 60 Variable
Week 24 64 71 Timepoint


Number of Participants 0 22 Change
Abnormal Electrocardiogram (ECG) Interval 28 69 Variable


Number of Children 0 18 Change
Documented Risk Factors 24 47 Variable
Type 2 Diabetes 52 67 Condition


Validation 0 10 Reference
SCOUT DS algorithm 14 32 Variable
type 2 diabetes 53 68 Condition


outcome 9 16 Change
comparison 24 34 Reference
total volumetric bone mineral density (vBMD) 38 82 Variable


Number of Participants 0 22 Change
type 2 diabetes 40 55 Condition


2 months 0 8 Timepoint


number of participants 43 65 Change
type 2 diabetes 97 112 Condition
risk test only (RTO)- group 120 147 Variable
HbA1c-measurement (HbA1c-group) 182 213 Variable


Baseline 0 8 Reference


Correlation 0 11 Reference
NF-кB dependent-proinflammation markers 20 59 Variable
osteoblast-specific gene expression in the MSC 64 110 Variable
type 2 diabetes 219 234 Condition


2-4 weeks 0 9 Timepoint


Characteristic 0 14 Change
type 2 diabetes 18 33 Condition
patients 34 42 Reference
insulin 57 64 Variable


average of 2 years 28 46 Timepoint


relation 13 21 Reference
rate of HbAC 34 46 Variable
plan of insulinothérapie 55 79 Variable


Change 0 6 Change
baseline 10 18 Reference
A1C (glycated hemoglobin) 19 44 Variable
12 months 48 57 Timepoint


12 months 0 9 Timepoint


Change 0 6 Change
baseline 10 18 Reference
lipids 19 25 Variable
12 months 29 38 Timepoint


Change 0 6 Change
baseline 10 18 Reference
blood pressure 19 33 Variable
12 months 37 46 Timepoint


Change 0 6 Change
baseline 10 18 Reference
Quality of life 19 34 Variable
12 months 38 47 Timepoint


Change 0 6 Change
baseline 10 18 Reference
insulin satisfaction 19 39 Variable
12 months 43 52 Timepoint


intervention 32 44 Variable
Binary outcome 62 76 Change
investigators 121 134 Reference


3 months 0 8 Timepoint


Recall 0 6 Reference
personalised risk information 10 39 Variable


lifestyle changes 19 36 Variable


Change 0 6 Change
self-management behaviour 10 35 Variable


Change 0 6 Change
HbA1c 10 15 Variable
Baseline 16 24 Reference


14 week 0 7 Timepoint


Change in HbA1c 0 15 Change


Change 0 6 Change
Baseline 12 20 Reference
Trends 25 31 Reference
Fasting Glucose 35 50 Variable


Change in Lipids 0 16 Change


Change 0 6 Change
Baseline 12 20 Reference
14-week 24 31 Timepoint
Insulin 32 39 Variable


Change in Insulin 0 17 Change


In [8]:
create_and_train_new_data()

number of relapses 0 18 Change


cells/ micro liter 0 18 Change


Words 0 5 Change


percent predicted mortality 0 27 Change


nanomoles per milliliter (nmol/mL) 0 34 Change


Kilograms per square meter (kg/m^2) 0 35 Change


cells/µL 0 8 Change


Metered Squares m^2 0 19 Change


cmH2O 0 5 Change


nocturia episodes/24 hours 0 26 Change


ms 0 2 Change


Number of cycles 0 16 Change


µm/day 0 6 Change


Kilograms 0 9 Change


treatments 0 10 Change


microIU/ml 0 10 Change


Percentage of Predicted FVC 0 27 Change


cigarettes smoked per day 0 25 Change


pariticipants 0 13 Change


probability 0 11 Change


percentile 0 10 Change


Percentage of Predicted FEV1 0 28 Change


° 0 1 Change


sq. meter 0 9 Change


pg/mL 0 5 Change


age in months of encounter subjects 0 35 Change


mg/mg 0 5 Change


picograms per milliliter (pg/mL) 0 32 Change


10^9 platelets/L 0 16 Change


Percentile adjusted for age and sex 0 35 Change


cigarette 0 9 Change


month 0 5 Change


mmol/L/min 0 10 Change


kilo Pascals 0 12 Change


L/sec 0 5 Change


Number of prior chemotherapy regimens 0 37 Change


percent reversibility 0 21 Change


seizures/week 0 13 Change


Percentage of Abstinent Days 0 28 Change


percentage of bone surface 0 26 Change


pharmacy claims 0 15 Change


gestational age (weeks) 0 23 Change


Percent of the predicted value 0 30 Change


mg / dL 0 7 Change


composite score 0 15 Change


milligrams/kilograms (mg/kg) 0 28 Change


Kilogram 0 8 Change


units on a scale out of 10 min 1 max 10 0 39 Change


mg/Dl 0 5 Change


Years of Age 0 12 Change


Meters squared 0 14 Change


centimenters (cm) 0 17 Change


Joint Count 0 11 Change


feedings per day 0 16 Change


Child 0 5 Change


# events / subject 0 18 Change


liters/second (L/sec) 0 21 Change


%Gag-specific MIP1B+ CD4+ T-cells 0 33 Change


Percent of items 0 16 Change


Units on a scale 0 16 Change


ng/mg of creatinine 0 19 Change


CYP2D6 drug 0 11 Change


kU/L 0 4 Change


number of comorbidities 0 23 Change


Chair stands per 30 seconds 0 27 Change


percentage of cells 0 19 Change


millimeter/hour (mm/hr) 0 23 Change


(kg) 0 4 Change


gender 0 6 Change


IQ 0 2 Change


mL/min per 1.73 m^2 0 19 Change


(ng/mL)^2/1000 0 14 Change


% predicted normal 0 18 Change


µg 0 2 Change


deployments 0 11 Change


Comorbid conditions 0 19 Change


Number of masses 0 16 Change


Eyes 0 4 Change


cm³ 0 3 Change


standard drinks/day 0 19 Change


gram (g) 0 8 Change


millimeters mercury (mmHg) 0 26 Change


eosinophils per high power field 0 32 Change


morphine milligram equivalents per day 0 38 Change


Picograms per milliliter (pg/mL) 0 32 Change


Done training changes
