In [1]:
import spacy, random
import spacy_annotator as spa
import numpy as np
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example
from spacy import displacy
import pandas as pd
from training_data.custom_training_data import train_data_init

In [2]:
# Init GPU-Usage, load blank model with basis of en_core_web_sm and
# enable only NER in the pipeline

spacy.prefer_gpu()
nlp = spacy.blank('en')
source_nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("ner", source=source_nlp)

# Init tracker for training loss with traing_data

current_loss = 0
training_data_init = train_data_init

<spacy.pipeline.ner.EntityRecognizer at 0x2220d511ca0>

In [4]:
'''
@author Ervin Joa

Init-Training of a custom NER-Model with droprate of 20%

@return current_loss
'''
def train_ner_model(trainig_data_init): 

    examples = []
    losses = {}
    for text, annots in trainig_data_init:
        examples.append(Example.from_dict(nlp.make_doc(text), annots))

        
    optimizer = nlp.initialize(lambda: examples)

    for i in range(30):
        random.shuffle(examples)
        for batch in minibatch(examples, size = compounding(1., 32., 1.001)):
            nlp.update(
                batch,
                drop=0.2,
                sgd=optimizer,
                losses = losses,
            )
    print("Losses: ", losses)
    current_loss = losses["ner"]

    nlp.to_disk("../training/ner_model")
    return current_loss

In [7]:
current_loss = train_ner_model(training_data_init)

Losses:  {'ner': 3411.014173047471}


In [9]:
'''
@author Ervin Joa

Method for the continuation training with SpaCy. After
each tme this function is called, the model will be retrained and
updated for the data-inputed.

@param train_data: Training Data based on the GOLD-Format
@return current_loss: Loss of the current training epoch
'''

def resume_train_ner_model(train_data):
    nlp = spacy.load("../training/ner_model")

    examples = []
    losses = {}
    for text, annots in train_data:
        examples.append(Example.from_dict(nlp.make_doc(text), annots))

    optimizer_resume = nlp.resume_training()

    for i in range(int(np.sqrt(len(train_data)))):
        random.shuffle(examples)
        for batch in minibatch(examples, size = compounding(1.0, 32.0, 1.001)):
            nlp.update(
                batch,
                drop=0.2,
                sgd=optimizer_resume,
                losses = losses,
            )
    current_loss = losses["ner"]

    nlp.to_disk("../training/ner_model")
    return current_loss

In [10]:
def from_csv_to_train_data(datadf, label, colname):
    output = []
    datadf[colname] = datadf[colname].astype(str)
    for index, row in datadf.iterrows():
        char_start = 0
        char_end = len(row[colname])
        output_row = (row[colname], {"entities":[(char_start, char_end, label)]})
        output.append(output_row)
    return output
        

In [13]:
def filter_dataframe_count(df, col_name):
    df['word_count'] = df[col_name].str.split().str.len()
    df_new = df[df['word_count'] <= 2].drop(columns=["word_count"], axis=0)

    return df_new.reset_index(drop=True)

In [11]:
data_vars = pd.read_csv("../training/training_data/varaible_data.csv")
data_refs = pd.read_csv("../training/training_data/reference_data.csv")
data_timepoint_1 = pd.read_csv("../training/training_data/timepoint_data.csv")
data_change = pd.read_csv("../training/training_data/change_data.csv")
data_condition = pd.read_csv("../training/training_data/condition_data.csv")

data_vars.dropna(inplace=True)
data_refs.dropna(inplace=True)
data_timepoint_1.dropna(inplace=True)
data_change.dropna(inplace=True)
data_condition.dropna(inplace=True)

# For all Files filter data-entries that are <= 4 words

data_vars = filter_dataframe_count(data_vars, data_vars.iloc[:, 0].name)
data_refs = filter_dataframe_count(data_refs, data_refs.iloc[:, 0].name)
data_timepoint_1 = filter_dataframe_count(data_timepoint_1, data_timepoint_1.iloc[:, 0].name)
data_change = filter_dataframe_count(data_change, data_change.iloc[:, 0].name)
data_condition = filter_dataframe_count(data_condition, data_condition.iloc[:, 0].name)

data_var_training = from_csv_to_train_data(data_vars, "Variable", "units_analyzed")
data_ref_training = from_csv_to_train_data(data_refs, "Reference", "param_type")
data_timepoint_training = from_csv_to_train_data(data_timepoint_1, "Timepoint", "target_duration")
data_change_training = from_csv_to_train_data(data_change, "Change", "units")
data_condition_training = from_csv_to_train_data(data_condition, "Condition", "name")

In [16]:
print(len(data_var_training))
print(len(data_ref_training))
print(len(data_timepoint_training))
print(len(data_change_training))
print(len(data_condition_training))

682
9
162
10251
35602


In [17]:
'''
@author Ervin Joa

Main training-loop:

    While the loss of training remains above 100:
        continue extracting random shuffled training data and retrain the model;
    else:
        stop;
'''

while current_loss > 100:
    data_var_training_random = random.sample(data_var_training, len(data_var_training))
    data_change_training_random = random.sample(data_change_training, len(data_change_training))
    data_condition_training_random = random.sample(data_condition_training, len(data_condition_training))

    training_data = data_ref_training + data_change_training_random[0:300] + data_timepoint_training + data_var_training_random[0:300] + data_condition_training_random[0:300] + train_data_init
    data_to_train = random.sample(training_data, len(training_data))
    
    data_to_train_portion = data_to_train[0:500]

    current_loss = resume_train_ner_model(data_to_train_portion)
    print(f"Done Training iteration with loss of {current_loss}")
    

[('Microhaemorrhage', {'entities': [(0, 16, 'Condition')]}), ('fallopian tubes', {'entities': [(0, 15, 'Variable')]}), ('Change in baseline insulin satisfaction at 12 months', {'entities': [(0, 6, 'Change'), (10, 18, 'Reference'), (19, 39, 'Variable'), (43, 52, 'Timepoint')]}), ('Change in HbA1c', {'entities': [(0, 15, 'Change')]}), ('Kilogram', {'entities': [(0, 8, 'Change')]}), ('Nasal Intubation', {'entities': [(0, 16, 'Condition')]}), ('Unit/kilogram (U/kg)', {'entities': [(0, 20, 'Change')]}), ('Pg/mL', {'entities': [(0, 5, 'Change')]}), ('8 Years', {'entities': [(0, 7, 'Timepoint')]}), ('collisions', {'entities': [(0, 10, 'Change')]})]
Done Training iteration with loss of 3379.1869744131377
[('Target Lesions', {'entities': [(0, 14, 'Variable')]}), ('pen needles', {'entities': [(0, 11, 'Variable')]}), ('Primary Glioblastoma', {'entities': [(0, 20, 'Condition')]}), ('Dialyzers', {'entities': [(0, 9, 'Variable')]}), ('Afterschool Programs', {'entities': [(0, 20, 'Variable')]}), ('Ba

In [None]:
# nlp_annot = spacy.load("../training/ner_model")
# data_filtered_raw= pd.read_csv("../backend/input_data_filtered.csv")
# annotator = spa.Annotator(labels = ["Condition", "Reference", "Change", "Timepoint", "Variable"], model = nlp_annot)
# df_labels = annotator.annotate(df = data_filtered_raw[0:25], col_text = "SecondaryOutcomeMeasure", shuffle = True)

In [18]:
nlp_test = spacy.load("../training/ner_model")

In [19]:
doc = nlp_test("To evaluate a nutritional intervention for women newly diagnosed with breast cancer 9 weight control and physical activity program")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('To evaluate a nutritional intervention', 'Change'), ('cancer 9 weight control', 'Variable'), ('physical activity program', 'Change')]


In [None]:
for text, _ in data_to_train_portion:

        target = nlp_test(text)
        for entity in target.ents:
                displacy.render(nlp_test(target.text), style='ent')

In [16]:

from spacy.scorer import Scorer

def evaluate( examples):
    ner_model = spacy.load("../training/ner_model")
    scorer = Scorer()


    examples = []
    for text, annots in examples:
        predicted=ner_model(text)
        example=Example.from_dict(predicted, annots)
        examples.append(example)
    scores = scorer.score(examples)
    return scores

examples = [
    ('The main outcome is the comparison of total volumetric bone mineral density (vBMD) at the tibia and distal radius',[(9,16, "Change"), (24, 34, "Reference"),(38,82, "Variable")]),
    ("Number of Participants with undiagnosed type 2 diabetes", [(0, 22,"Change"), (40, 55, "Condition")]),
    ("2 months", [(0,8,"Timepoint")])
]


results = evaluate(examples)
print(results)

{'token_acc': None, 'token_p': None, 'token_r': None, 'token_f': None, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': None, 'ents_r': None, 'ents_f': None, 'ents_per_type': None, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}
