In [1]:
import spacy, random
import spacy_annotator as spa
import numpy as np
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example
from spacy import displacy
import pandas as pd
from training_data.custom_training_data import train_data_init

In [2]:
# Init GPU-Usage, load blank model with basis of en_core_web_sm and
# enable only NER in the pipeline

spacy.prefer_gpu()
nlp = spacy.blank('en')
source_nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("ner", source=source_nlp)

# Init tracker for training loss with traing_data

current_loss = 0
training_data_init = train_data_init

In [3]:
'''
@author Ervin Joa

Init-Training of a custom NER-Model with droprate of 20%

@return current_loss
'''
def train_ner_model(trainig_data_init): 

    examples = []
    losses = {}
    for text, annots in trainig_data_init:
        examples.append(Example.from_dict(nlp.make_doc(text), annots))

        
    optimizer = nlp.initialize(lambda: examples)

    for i in range(40):
        random.shuffle(examples)
        for batch in minibatch(examples, size = compounding(1., 32., 1.001)):
            nlp.update(
                batch,
                drop=0.2,
                sgd=optimizer,
                losses = losses,
            )
    print("Losses: ", losses)
    current_loss = losses["ner"]

    nlp.to_disk("../training/ner_model")
    return current_loss

In [4]:
current_loss = train_ner_model(training_data_init)

Losses:  {'ner': 4598.098713291725}


In [5]:
'''
@author Ervin Joa

Method for the continuation training with SpaCy. After
each tme this function is called, the model will be retrained and
updated for the data-inputed.

@param train_data: Training Data based on the GOLD-Format
@return current_loss: Loss of the current training epoch
'''

def resume_train_ner_model(train_data):
    nlp = spacy.load("../training/ner_model")

    examples = []
    losses = {}
    for text, annots in train_data:
        examples.append(Example.from_dict(nlp.make_doc(text), annots))

    optimizer_resume = nlp.resume_training()

    for i in range(int(np.sqrt(len(train_data)))):
        random.shuffle(examples)
        for batch in minibatch(examples, size = compounding(1.0, 32.0, 1.001)):
            nlp.update(
                batch,
                drop=0.2,
                sgd=optimizer_resume,
                losses = losses,
            )
    current_loss = losses["ner"]

    nlp.to_disk("../training/ner_model")
    return current_loss

In [6]:
def from_csv_to_train_data(datadf, label, colname):
    output = []
    datadf[colname] = datadf[colname].astype(str)
    for index, row in datadf.iterrows():
        char_start = 0
        char_end = len(row[colname])
        output_row = (row[colname], {"entities":[(char_start, char_end, label)]})
        output.append(output_row)
    return output
        

In [7]:
def filter_dataframe_count(df, col_name):
    df['word_count'] = df[col_name].str.split().str.len()
    df_new = df[(df['word_count'] <= 4) & (df['word_count'] > 1)].drop(columns=["word_count"], axis=0)

    return df_new.reset_index(drop=True)

In [8]:
data_vars = pd.read_csv("../training/training_data/varaible_data.csv")
data_refs = pd.read_csv("../training/training_data/reference_data.csv")
data_timepoint_1 = pd.read_csv("../training/training_data/timepoint_data.csv")
data_change = pd.read_csv("../training/training_data/change_data.csv")
data_condition = pd.read_csv("../training/training_data/condition_data.csv")

data_vars.dropna(inplace=True)
data_refs.dropna(inplace=True)
data_timepoint_1.dropna(inplace=True)
data_change.dropna(inplace=True)
data_condition.dropna(inplace=True)

# For all Files filter data-entries that are <= 4 words

data_vars = filter_dataframe_count(data_vars, data_vars.iloc[:, 0].name)
data_timepoint_1 = filter_dataframe_count(data_timepoint_1, data_timepoint_1.iloc[:, 0].name)
data_change = filter_dataframe_count(data_change, data_change.iloc[:, 0].name)
data_condition = filter_dataframe_count(data_condition, data_condition.iloc[:, 0].name)

data_var_training = from_csv_to_train_data(data_vars, "Variable", "units_analyzed")
data_ref_training = from_csv_to_train_data(data_refs, "Reference", "param_type")
data_timepoint_training = from_csv_to_train_data(data_timepoint_1, "Timepoint", "target_duration")
data_change_training = from_csv_to_train_data(data_change, "Change", "units")
data_condition_training = from_csv_to_train_data(data_condition, "Condition", "name")

In [9]:
print(len(data_var_training))
print(len(data_ref_training))
print(len(data_timepoint_training))
print(len(data_change_training))
print(len(data_condition_training))
print(len(train_data_init))

663
13
162
17744
64226
128


In [10]:
'''
@author Ervin Joa

Main training-loop:

    While the loss of training remains above 100:
        continue extracting random shuffled training data and retrain the model;
    else:
        stop;
'''

while current_loss > 100:
    # Due to huge amount of data, randomize and select first 200 entries
    data_var_training_random = random.sample(data_var_training, len(data_var_training))
    data_change_training_random = random.sample(data_change_training, len(data_change_training))
    data_condition_training_random = random.sample(data_condition_training, len(data_condition_training))

    # Creation of the training dataset for training 
    training_data = data_ref_training + data_change_training_random[0:200] + data_timepoint_training + \
    data_var_training_random[0:200] + data_condition_training_random[0:200] + train_data_init

    # Shuffeling and selecting the first 500 entries for training
    data_to_train = random.sample(training_data, len(training_data))
    data_to_train_portion = data_to_train[0:500]

    # Training and display of the traing-loss for each session
    current_loss = resume_train_ner_model(data_to_train_portion)
    print(f"Done Training iteration with loss of {current_loss}")
    

Done Training iteration with loss of 3780.66520294482
Done Training iteration with loss of 2996.661206510561


KeyboardInterrupt: 

In [None]:
#nlp_annot = spacy.load("../training/ner_model")
#data_filtered_raw= pd.read_csv("../backend/input_data_filtered.csv")
#annotator = spa.Annotator(labels = ["Condition", "Reference", "Change", "Timepoint", "Variable"], model = nlp_annot)
#df_labels = annotator.annotate(df = data_filtered_raw[0:50], col_text = "PrimaryOutcomeMeasure", shuffle = True)


HTML(value='-1 examples annotated, 51 examples left')

Text(value='', description='Condition', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three'…

Text(value='', description='Reference', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three'…

Text(value='', description='Change', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three')

Text(value='', description='Timepoint', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three'…

Text(value='', description='Variable', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three')

HBox(children=(Button(button_style='success', description='submit', style=ButtonStyle()), Button(button_style=…

Output()

In [None]:
print(list(df_labels["annotations"]))

['', '', ('Number of patients who died for fournier gangrene', {'entities': [(32, 49, 'Condition'), (0, 18, 'Change')]}), ('Cheeks appearance', {'entities': [(0, 17, 'Reference')]}), ('Number of patients without intubation', {'entities': [(0, 18, 'Change'), (27, 37, 'Variable')]}), '', '', ('Percentage of Participants With Serious Adverse Events (SAEs) and Serious Adverse Drug Reactions (SADRs)|Percentage of Participants With Unexpected Adverse Events (AEs) and Adverse Drug Reactions (ADRs) Not Mentioned in Precautions|Percentage of Participants With Expected/Already Known ADRs at Week 13|Percentage of Participants With Expected/Already Known ADRs at Week 26|Percentage of Participants With Expected/Already Known ADRs at Week 39|Percentage of Participants With Expected/Already Known ADRs at Week 52|Percentage of Participants With Expected/Already Known ADRs at Week 153|Percentage of Participants With Non-serious ADRs|Percentage of Participants With Abnormal Laboratory Findings Reported 

In [None]:
nlp_test = spacy.load("../training/ner_model")

In [None]:
for text, _ in data_to_train[0:50]:

        target = nlp_test(text)
        for entity in target.ents:
                displacy.render(nlp_test(target.text), style='ent')

In [None]:

from spacy.scorer import Scorer

def evaluate( examples):
    ner_model = spacy.load("../training/ner_model")
    scorer = Scorer()


    examples = []
    for text, annots in examples:
        predicted=ner_model(text)
        example=Example.from_dict(predicted, annots)
        examples.append(example)
    scores = scorer.score(examples)
    return scores

examples = [
    ('The main outcome is the comparison of total volumetric bone mineral density (vBMD) at the tibia and distal radius',[(9,16, "Change"), (24, 34, "Reference"),(38,82, "Variable")]),
    ("Number of Participants with undiagnosed type 2 diabetes", [(0, 22,"Change"), (40, 55, "Condition")]),
    ("2 months", [(0,8,"Timepoint")])
]


results = evaluate(examples)
print(results)

{'token_acc': None, 'token_p': None, 'token_r': None, 'token_f': None, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': None, 'ents_r': None, 'ents_f': None, 'ents_per_type': None, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}
