In [None]:
import spacy, random
import spacy_annotator as spa
import numpy as np
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example
from spacy import displacy
import pandas as pd


In [None]:
# Ask spaCy to use a GPU if one is available; called before any pipeline is built.
spacy.prefer_gpu()
# Start from an empty English pipeline and copy only the "ner" component
# out of the packaged en_core_web_sm pipeline into it.
nlp = spacy.blank('en')
source_nlp = spacy.load("en_core_web_sm")
# NOTE(review): train_ner_model() later calls nlp.initialize() on this pipeline,
# which presumably re-initialises the sourced weights - confirm that sourcing
# the pretrained component is actually needed.
nlp.add_pipe("ner", source=source_nlp)

In [None]:
train_data_init = [
    ("Adjusted Mean Change From Baseline in Fasting Plasma Glucose at Week 24 (Last Observation Carried Forward [LOCF])", 
    {"entities":[(0,20, "Change"),(26,34,"Reference"),(38,60,"Variable"), (64,71,"Timepoint")]}),
    ("Number of Participants With Abnormal Electrocardiogram (ECG) Interval", 
    {"entities":[(0,22, "Change"),(28,69,"Variable")]}),
    ("Number of Children With Documented Risk Factors for Type 2 Diabetes",
    {"entities": [(0,18,"Change"),(24,47,"Variable"),(52,67, "Condition")]}),
    ("Validation of SCOUT DS algorithm for detecting known type 2 diabetes",
    {"entities": [(0,10, "Reference"),(14,32, "Variable"),(53,68, "Condition")]}),
    ("The main outcome is the comparison of total volumetric bone mineral density (vBMD) at the tibia and distal radius",
    {"entities":[(9,16, "Change"), (24, 34, "Reference"),(38,82, "Variable")]}),
    ("Number of Participants with undiagnosed type 2 diabetes", {"entities": [(0, 22,"Change"), (40, 55, "Condition")]}),
    ("2 months", {"entities":[(0,8,"Timepoint")]}),
    ("Explore if there was any difference in the number of participants with a high risk of developing type 2 diabetes in the risk test only (RTO)- group and the group that also offered a HbA1c-measurement (HbA1c-group)", 
    {"entities":[(43,65, "Change"), (97,112, "Condition"),(120,147,"Variable"),(182,213, "Variable")]}),
    ("Baseline", {"entities":[(0,8,"Reference")]}),
    ("Correlation between NF-кB dependent-proinflammation markers and osteoblast-specific gene expression in the MSC to measure the effects of NF-кB dependent-proinflammation on differentiation potential toward osteoblast in type 2 diabetes.",
    {"entities":[(0,11,"Reference"),(20,59,"Variable"), (64,110,"Variable"), (219,234,"Condition")]}),
    ("2-4 weeks", {"entities":[(0,9,"Timepoint")]}),
    ("Characteristic of type 2 diabetes patients, treated with insulin in Guadeloupe", 
    {"entities":[(0,14,"Change"), (18,33,"Condition"), (34,42,"Reference"), (57, 64, "Variable")]}),
    ("Trough study completion, an average of 2 years", {"entities": [(28,46,"Timepoint")]}),
    ("Describe the relation between the rate of HbAC and the plan of insulinothérapie",
    {"entities":[(13,21,"Reference"),(34,46,"Variable"),(55,79,"Variable")]}),
    ("Change in baseline A1C (glycated hemoglobin) at 12 months",
    {"entities":[(0,6,"Change"),(10,18,"Reference"),(19,44,"Variable"),(48,57, "Timepoint")]}),
    ("12 months", {"entities":[(0,9,"Timepoint")]}),
    ("Change in baseline lipids at 12 months", {"entities":[(0,6,"Change"),(10,18,"Reference"),(19,25,"Variable"),(29,38,"Timepoint")]}),
    ("Change in baseline blood pressure at 12 months",{"entities":[(0,6,"Change"),(10,18,"Reference"),(19,33,"Variable"),(37,46,"Timepoint")]}),
    ("Change in baseline Quality of life at 12 months",{"entities":[(0,6,"Change"),(10,18,"Reference"),(19,34,"Variable"),(38,47,"Timepoint")]}),
    ("Change in baseline insulin satisfaction at 12 months",{"entities":[(0,6,"Change"),(10,18,"Reference"),(19,39,"Variable"),(43,52,"Timepoint")]}),
    ("Feasibility of implementing the intervention in primary care: Binary outcome (feasible / not feasible), as judged by the investigators",
    {"entities":[(32,44,"Variable"),(62,76,"Change"),(121,134,"Reference")]}),
    ("3 months",{"entities":[(0,8,"Timepoint")]}),
    ("Recall of personalised risk information",{"entities":[(0,6,"Reference"),(10,39,"Variable")]}),
    ("Intentions to make lifestyle changes",{"entities":[(19,36,"Variable")]}),
    ("Change in self-management behaviour",{"entities":[(0,6,"Change"),(10,35,"Variable")]}),
    ("Change in HbA1c Baseline to End of Trial in TINSAL-T2D Stage 1",{"entities":[(0,6,"Change"),(10,15,"Variable"),(16,24,"Reference")]}),
    ("14 week",{"entities":[(0,7,"Timepoint")]}),
    ("Change in HbA1c", {"entities":[(0,15,"Change")]}),
    ("Change From Baseline and Trends in Fasting Glucose Over Time", {"entities":[(0,6,"Change"),(12,20,"Reference"),(25,31,"Reference"),(35,50,"Variable")]}),
    ("Change in Lipids",{"entities":[(0,16,"Change")]}),
    ("Change From Baseline in 14-week Insulin", {"entities":[(0,6,"Change"),(12,20,"Reference"),(24,31,"Timepoint"),(32,39,"Variable")]}),
    ("Change in Insulin",{"entities":[(0,17,"Change")]}),
    ("Neuroendocrine Tumor of Pancreas", {"entities": [(15,32, "Condition")]}),
    ("Number of levels assessed", {"entities": [(0,16, "Change")]}),
    ("Instances of hypoglycemia", {"entities": [(13,25, "Variable")]}),
    ("percentage of average nicotine binding", {"entities": [(22,38, "Variable")]}),
    ("Number of unsupervised injections", {"entities": [(0,33, "Change")]}), 
    ("Renal Replacement Therapy", {"entities": [(0,25, "Variable")]}),
    ("Polysomnographic recording of nocturnal sleep", {"entities": [(17,45, "Variable")]}), 
    ("percent predicted of FEV1 per year", {"entities": [(21,25, "Variable"), (30,34, "Timepoint")]}), 
    ("Change in percent time missed", {"entities": [(0,29, "Change")]}), 
    ("Recurrence, Detection rate per patient of PET-PSMA for the detection of biochemical recurrence.", {"entities": [(0,10, "Change"),(12,38, "Reference"), (42,50,"Variable"),(72,94,"Variable")]}),
    ('Circulating endothelial progenitor cells concentration and microparticles derived from endothelial cells',
    {'entities': [(0, 54, 'Variable'), (59, 104, 'Variable')]}),
    ('Peak anti-Xa activity', {'entities': [(5, 21, 'Variable')]}),
    ('Clinical Evolution', {'entities': [(0, 18, 'Reference')]}),
    ('Geometric Mean Ratio|90% confidence intervals', {'entities': []}),
    ('In-segment late lumen loss', {'entities': [(0, 26, 'Change')]}),
    ('Reading speed day 0|Reading speed M3', {'entities': [(0, 13, 'Change')]}),
    ('Number of patients who died for fournier gangrene',
    {'entities': [(32, 49, 'Condition'), (0, 18, 'Change')]}),
    ('Feedback of individual fellow in training on virtual and simulator based learning.',
    {'entities': [(0, 22, 'Reference'), (33, 52, 'Variable'), (57, 81, 'Variable')]}),
    ('Introductory Information Form (questionnaire)|Health Belief Model Scale in Obesity (questionnaire-scale)|Health Belief Model Scale in Obesity-Importance of Health (questionnaire-scale)|Health Belief Model Scale in Obesity- Perceived sensitivity (questionnaire-scale)|Health Belief Model Scale in Obesity- Perceived severity (questionnaire-scale)|Health Belief Model Scale in Obesity- Perceived benefit (questionnaire-scale)|Health Belief Model Scale in Obesity- Perceived barrier (questionnaire-scale)|Height Meter|weighing machine|body mass index calculation',
    {'entities': [(142, 152, 'Reference'), (223, 244, 'Reference'), (384, 401, 'Reference'), (66, 82, 'Change'), (125, 141, 'Change'), (462, 479, 'Variable'),
    (537, 559, 'Variable')]}),
    ('Change of post-traumatic stress symptoms',
    {'entities': [(0, 40, 'Change')]}),
  ('Number of patients without intubation', {'entities': [(0, 18, 'Change'), (27, 37, 'Variable')]}),
 ('Effect of Threat Appraisal on Vaping Intention', {'entities': [(0, 26, 'Variable'), (30, 46, 'Variable')]}),
 ('prevalence of peri-implantitis', {'entities': [(14, 30, 'Condition'), (0, 10, 'Variable')]}),
 ('Evaluation of video', {'entities': [(0, 10, 'Reference'), (14, 19, 'Variable')]}),
 ('correlation of disease severity with results of laboratory test', {'entities': [(0, 11, 'Reference'), (15, 31, 'Variable'), (37, 63, 'Variable')]}),
 ('Cognition-Global cognition|Cognition-Global cognition|Cognition-Global cognition|Cognition-Global cognition|Cognition-Global cognition|Cognition-memory|Cognition-memory|Cognition-memory|Cognition-memory|Cognition-memory|Cognition-attention|Cognition-attention|Cognition-attention|Cognition-attention|Cognition-attention|Cognition-visual/spatial function|Cognition-visual/spatial function|Cognition-visual/spatial function|Cognition-visual/spatial function|Cognition-visual/spatial function|Cognition-executive function|Cognition-executive function|Cognition-executive function|Cognition-executive function|Cognition-executive function',
  {'entities': [(0, 16, 'Variable')]}),
  ('Pelvic Examination of Anxiety', {'entities': [(22, 29, 'Condition'), (0, 18, 'Variable')]}),
 ('Detection rate per patient of PET-PSMA for the detection of biochemical recurrence.', {'entities': [(0, 9, 'Reference'), (47, 56, 'Reference'),
    (10, 26, 'Change'),
    (30, 38, 'Variable'),
    (60, 82, 'Variable')]}),
 ('post-operative pain', {'entities': [(0, 19, 'Condition')]}),
 ('Visual Analog Scale(VAS)|Neck Pain And Disability Scale (NPDS)|Neck and jaw range of motion (ROM)',
  {'entities': [(39, 49, 'Variable'),(50, 67, 'Variable')]}),
 ('Incidence of treatment-emergent adverse events (TEAE) by skin irritation assessments|Incidence of treatment-emergent adverse events (TEAE) by vital signs measurements|Incidence of treatment-emergent adverse events (TEAE) by ECG assessment|Incidence of treatment-emergent adverse events (TEAE) by clinical lab tests|Incidence of study drug related TEAEs',
  {'entities': [(13, 53, 'Variable'),
    (98, 138, 'Variable'),
    (180, 220, 'Variable'),
    (252, 292, 'Variable'),
    (328, 352, 'Variable')]}),
 ('Krebs von den Lungen - 6 (KL-6)',
  {'entities': [(0, 5, 'Condition'), (14, 31, 'Variable')]}),
 ('MMP8, MMP9|IL1, IL6,IL8,IL17|TNF alfa|RANK-L|OPG',
  {'entities': [(0, 4, 'Change')]}) 
]

In [None]:
def train_ner_model(trainig_data_init, n_iter=35, dropout=0.1, output_dir="../training/ner_model"):
    """Train the notebook-level ``nlp`` pipeline from scratch and save it to disk.

    Args:
        trainig_data_init: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
            pairs. (The misspelled parameter name is kept for backward compatibility.)
        n_iter: Number of passes over the examples. Defaults to 35.
        dropout: Dropout rate passed to ``nlp.update``. Defaults to 0.1.
        output_dir: Directory the trained pipeline is written to.

    Raises:
        ValueError: If no training examples are supplied.

    Side effects:
        Initialises and updates the global ``nlp`` in place, prints the losses of
        the final epoch, and writes the pipeline to ``output_dir``.
    """
    examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in trainig_data_init
    ]
    if not examples:
        raise ValueError("train_ner_model needs at least one training example")

    # initialize() receives the examples through a callback and returns the optimizer.
    optimizer = nlp.initialize(lambda: examples)

    losses = {}
    for _epoch in range(n_iter):
        # nlp.update() adds to the dict it is given, so start every epoch with a
        # fresh one; otherwise the printed value is a sum over all epochs.
        losses = {}
        random.shuffle(examples)
        # A new compounding schedule is created per epoch, so the batch size
        # starts again at 1.0 at the beginning of every pass.
        for batch in minibatch(examples, size=compounding(1., 32., 1.001)):
            nlp.update(
                batch,
                drop=dropout,
                sgd=optimizer,
                losses=losses,
            )
    print("Losses: ", losses)

    nlp.to_disk(output_dir)

In [None]:
# Train on the hand-annotated seed examples; train_ner_model also writes the
# resulting pipeline to disk.
train_ner_model(train_data_init)

In [26]:
# Smoke test: run the in-memory pipeline on one sample sentence and list the
# predicted (text, label) pairs.
doc = nlp("To evaluate a nutritional intervention for women newly diagnosed with breast cancer 9 weight control and physical activity program")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('To evaluate a nutritional intervention', 'Condition'), ('9 weight', 'Timepoint'), ('physical activity program', 'Variable')]


In [None]:
def resume_train_ner_model(train_data, model_dir="../training/ner_model", dropout=0.2, n_iter=None):
    """Continue training the saved pipeline on additional examples and save it back.

    Args:
        train_data: Iterable of ``(text, {"entities": [(start, end, label), ...]})`` pairs.
        model_dir: Directory the pipeline is loaded from and written back to.
        dropout: Dropout rate passed to ``nlp.update``. Defaults to 0.2.
        n_iter: Number of passes over the examples. When ``None`` (default) the
            original rule ``int(sqrt(number of examples))`` is used.

    Side effects:
        Loads a *local* pipeline from ``model_dir`` (the notebook-level ``nlp`` is
        not modified), prints the losses of the final epoch, and overwrites the
        pipeline stored in ``model_dir``. Reload the model from disk to use the result.
    """
    nlp = spacy.load(model_dir)

    examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in train_data
    ]
    if n_iter is None:
        n_iter = int(np.sqrt(len(examples)))

    optimizer_resume = nlp.resume_training()

    losses = {}
    for _epoch in range(n_iter):
        # nlp.update() adds to the dict it is given, so reset it per epoch to
        # report the last epoch's loss instead of a sum over all epochs.
        losses = {}
        random.shuffle(examples)
        for batch in minibatch(examples, size=compounding(1.0, 32.0, 1.001)):
            nlp.update(
                batch,
                drop=dropout,
                sgd=optimizer_resume,
                losses=losses,
            )
    print("Losses: ", losses)

    nlp.to_disk(model_dir)

In [None]:
def from_csv_to_train_data(datadf, label, colname):
    """Turn one DataFrame column into single-entity training examples.

    Every non-missing, non-blank value of ``datadf[colname]`` is converted to a
    string and annotated with one entity span covering the whole string.

    Args:
        datadf: DataFrame holding the source column. It is not modified.
        label: Entity label assigned to every generated span.
        colname: Name of the column to read.

    Returns:
        list of ``(text, {"entities": [(0, len(text), label)]})`` tuples, in row order.
    """
    output = []
    # Convert a derived Series rather than assigning back into the frame, so the
    # caller's column keeps its dtype; dropping NaN first keeps the literal
    # string "nan" out of the training data.
    texts = datadf[colname].dropna().astype(str)
    for text in texts:
        # An empty or whitespace-only value would yield a meaningless span.
        if not text.strip():
            continue
        output.append((text, {"entities": [(0, len(text), label)]}))
    return output
        

In [None]:
# Load one CSV of example phrases per entity label.
# NOTE(review): "varaible_data.csv" looks like a misspelling of "variable" -
# confirm the file name on disk before changing it here.
data_vars = pd.read_csv("../training/training_data/varaible_data.csv")
data_refs = pd.read_csv("../training/training_data/reference_data.csv")
data_timepoint_1 = pd.read_csv("../training/training_data/timepoint_data.csv")
data_change = pd.read_csv("../training/training_data/change_data.csv")
data_condition = pd.read_csv("../training/training_data/condition_data.csv")

In [None]:
# Drop every row that contains a missing value. Rebinding the names (instead of
# mutating with inplace=True) keeps the cell free of in-place side effects.
data_vars = data_vars.dropna()
data_refs = data_refs.dropna()
data_timepoint_1 = data_timepoint_1.dropna()
data_change = data_change.dropna()
data_condition = data_condition.dropna()

In [None]:
# Lemmatise one sample phrase with the packaged small English pipeline.
nlp_lemma = spacy.load("en_core_web_sm")

doc = nlp_lemma("Percentage while in a possible probability of recovery")
lema_list = [token.lemma_ for token in doc]

# Stop-word set of the same pipeline, used to filter the lemmas in a later cell.
all_stopwords = nlp_lemma.Defaults.stop_words


In [None]:
# Display the lemmas of the sample phrase.
lema_list

In [None]:
# Keep only the lemmas that are not in the pipeline's stop-word set.
tokens_without_sw = [lemma for lemma in lema_list if lemma not in all_stopwords]

print(tokens_without_sw)

In [None]:
# Load the pipeline saved on disk into its own object, separate from `nlp`.
nlp_test = spacy.load("../training/ner_model")

In [None]:
# Input table for the (currently commented-out) annotator cell, which reads its
# "PrimaryOutcomeMeasure" column.
data_filtered_raw= pd.read_csv("../backend/input_data_filtered.csv")

In [None]:


# annotator = spa.Annotator(labels = ["Condition", "Reference", "Change", "Timepoint", "Variable"], model = nlp_test)

# df_labels = annotator.annotate(df = data_filtered_raw[0:25], col_text = "PrimaryOutcomeMeasure", shuffle = True)

In [None]:
# list(df_labels["annotations"])

In [None]:
# Run the saved model on the stop-word-filtered lemmas, joined back into a
# single space-separated string, and list the predicted (text, label) pairs.
test = nlp_test(" ".join(tokens_without_sw))

print("Entities", [(ent.text, ent.label_) for ent in test.ents])

In [None]:
# Build single-span training examples from one column of each CSV; every cell
# value becomes a text whose full length is tagged with the given label.
data_var_training = from_csv_to_train_data(data_vars, "Variable", "units_analyzed")
data_ref_training = from_csv_to_train_data(data_refs, "Reference", "param_type")
data_timepoint_training = from_csv_to_train_data(data_timepoint_1, "Timepoint", "target_duration")
data_change_training = from_csv_to_train_data(data_change, "Change", "units")
data_condition_training = from_csv_to_train_data(data_condition, "Condition", "name")

In [None]:
# Combine the CSV-derived examples with the hand-annotated seed set. The
# Change, Variable and Condition sets are capped at their first 100 examples;
# the Reference and Timepoint sets are used in full.
training_data = (
    data_ref_training
    + data_change_training[:100]
    + data_timepoint_training
    + data_var_training[:100]
    + data_condition_training[:100]
    + train_data_init
)

In [None]:
# Shuffled copy of the combined examples (every element sampled exactly once).
data_to_train = random.sample(training_data, k=len(training_data))
# Only the first fifth of the shuffled examples is used for resume training.
portion_size = int(len(data_to_train) * 0.2)
data_to_train_portion = data_to_train[:portion_size]
print(len(data_to_train_portion))

In [None]:
# Continue training the model stored on disk with the sampled examples; the
# updated model is written back to the same location.
resume_train_ner_model(data_to_train_portion)

In [None]:
# resume_train_ner_model() trains a pipeline it loads from disk and saves it
# back; the notebook-level `nlp` is not touched by it. Reload the saved model
# first so this check reflects the resumed training instead of the stale
# in-memory pipeline.
nlp = spacy.load("../training/ner_model")
doc = nlp("To evaluate a nutritional intervention for women newly diagnosed with breast cancer 9 weight control and physical activity program")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Rebind the notebook-level `nlp` to the pipeline currently stored on disk.
nlp = spacy.load("../training/ner_model")

In [27]:
# Spot-check the model on the first 32 hand-annotated seed examples. The
# literal that used to be repeated here was a verbatim copy of those entries,
# so slice `train_data_init` instead of maintaining two copies.
# NOTE(review): these texts are also passed to train_ner_model(), so this is a
# sanity check of the fit, not a held-out evaluation.
test_data = train_data_init[:32]

for text, _annotations in test_data:
    target = nlp(text)
    # Render each text once, and only when at least one entity was predicted.
    # Rendering inside a loop over the entities repeated the same visualisation
    # once per entity and re-ran the pipeline on every repetition.
    if target.ents:
        displacy.render(target, style='ent')