In [1]:
import spacy
import random
import spacy_annotator as spa
import numpy as np
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example
from spacy import displacy
import pandas as pd
from training_data.custom_training_data import train_data_init


In [2]:
# Ask spaCy to use the GPU when one is available (falls back to CPU otherwise).
spacy.prefer_gpu()

# Build a blank English pipeline whose only component is the "ner" pipe
# sourced from the pretrained en_core_web_sm pipeline.
source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)

# Alias for the manually labelled seed data and a tracker for the most
# recent training loss.
training_data_init = train_data_init
current_loss = 0


In [3]:
def train_ner_model(trainig_data_init, n_iter=50, drop=0.2,
                    output_dir="../training/ner_model"):
    '''
    @author Ervin Joa

    Initial training of the custom NER model. Trains the module-level
    `nlp` pipeline on the given data and writes it to `output_dir`.

    @param trainig_data_init: iterable of (text, annotations) pairs; every
           pair is turned into a spaCy Example via Example.from_dict
    @param n_iter: number of passes over the shuffled examples (default 50)
    @param drop: dropout rate handed to nlp.update (default 0.2, i.e. 20%)
    @param output_dir: directory the trained pipeline is saved to

    @return the "ner" entry of the losses dict. The same dict is passed to
            every nlp.update call, so the value is accumulated over ALL
            passes, not just the last one.

    @raises ValueError: if no training data is supplied
    '''
    # Materialise once so generators work and emptiness can be checked
    # before the pipeline is touched.
    trainig_data_init = list(trainig_data_init)
    if not trainig_data_init:
        raise ValueError("train_ner_model needs at least one training example")

    examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in trainig_data_init
    ]
    losses = {}

    optimizer = nlp.initialize(lambda: examples)

    # `_` instead of `iter`, which would shadow the builtin.
    for _ in range(n_iter):
        random.shuffle(examples)
        # A fresh compounding schedule per pass: batch size restarts at 1.
        for batch in minibatch(examples, size=compounding(1., 32., 1.001)):
            nlp.update(
                batch,
                drop=drop,
                sgd=optimizer,
                losses=losses,
            )
    print("Losses: ", losses)
    # .get() only matters for n_iter == 0, where no update ever ran.
    current_loss = losses.get("ner", 0.0)

    nlp.to_disk(output_dir)
    return current_loss


In [4]:
current_loss = train_ner_model(training_data_init)

Losses:  {'ner': 8928.782519897488}


In [5]:
def resume_train_ner_model(train_data, n_iter=50, drop=0.2,
                           model_dir="../training/ner_model"):
    '''
    @author Ervin Joa

    Continuation training with spaCy. Each call loads the pipeline stored
    in `model_dir`, trains it further on the given data and saves it back
    to the same directory.

    @param train_data: iterable of (text, annotations) pairs; every pair is
           turned into a spaCy Example via Example.from_dict
    @param n_iter: number of passes over the shuffled examples (default 50)
    @param drop: dropout rate handed to nlp.update (default 0.2)
    @param model_dir: directory the pipeline is loaded from and saved to

    @return the "ner" entry of the losses dict. The same dict is passed to
            every nlp.update call, so the value is accumulated over ALL
            passes of this call.

    @raises ValueError: if no training data is supplied
    '''
    # Validate before loading the model so an empty call fails fast and
    # with a clear message instead of a KeyError after disk access.
    train_data = list(train_data)
    if not train_data:
        raise ValueError(
            "resume_train_ner_model needs at least one training example")

    nlp = spacy.load(model_dir)

    examples = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in train_data
    ]
    losses = {}

    optimizer_resume = nlp.resume_training()

    # `_` instead of `iter`, which would shadow the builtin.
    for _ in range(n_iter):
        random.shuffle(examples)
        # A fresh compounding schedule per pass: batch size restarts at 1.
        for batch in minibatch(examples, size=compounding(1., 32., 1.001)):
            nlp.update(
                batch,
                drop=drop,
                sgd=optimizer_resume,
                losses=losses,
            )
    # .get() only matters for n_iter == 0, where no update ever ran.
    current_loss = losses.get("ner", 0.0)

    nlp.to_disk(model_dir)
    return current_loss


In [6]:
def from_csv_to_train_data(datadf, label, colname):
    '''
    @author Ervin Joa

    Transforms one feature column into a list of training items in the
    (text, {"entities": [(start, end, label)]}) format used for spaCy
    training. Each cell value is cast to str and annotated as ONE entity
    spanning the whole text (character 0 to len(text)).

    The input dataframe is not modified.

    @param datadf: input dataframe with the feature
    @param label: entity label to be given to every entry
    @param colname: column name of the feature in the input dataframe

    @return list of (text, {"entities": [(0, len(text), label)]}) tuples,
            one per row, in row order
    '''
    # Cast on a temporary Series instead of writing the cast back into the
    # caller's dataframe.
    texts = datadf[colname].astype(str)
    return [
        (text, {"entities": [(0, len(text), label)]})
        for text in texts
    ]


In [7]:
def filter_dataframe_count(df, col_name, min_words=2, max_words=4):
    '''
    @author Ervin Joa

    Keeps only the rows whose text in `col_name` has between `min_words`
    and `max_words` whitespace-separated words (both inclusive).
    The goal is to avoid training on long sentences, which can overfit the
    model, and to train only on keywords or entities made up of several
    words, e.g. "Type 2 Diabetes".

    The input dataframe is not modified (no helper column is left behind).

    @param df: dataframe containing the feature
    @param col_name: column name of the feature in the input dataframe
    @param min_words: smallest accepted word count (default 2)
    @param max_words: largest accepted word count (default 4)

    @return new filtered dataframe with a fresh 0..n-1 index
    '''
    # Computed as a standalone Series; missing values give NaN counts,
    # which `between` treats as not matching.
    word_count = df[col_name].str.split().str.len()
    keep = word_count.between(min_words, max_words)

    return df.loc[keep].reset_index(drop=True)


In [8]:
# Load the raw feature tables that were extracted from ACCT.
data_vars = pd.read_csv("../training/training_data/varaible_data.csv")
data_refs = pd.read_csv("../training/training_data/reference_data.csv")
data_timepoint_1 = pd.read_csv("../training/training_data/timepoint_data.csv")
data_change = pd.read_csv("../training/training_data/change_data.csv")
data_condition = pd.read_csv("../training/training_data/condition_data.csv")

# Remove rows with missing values from every table (in place).
for _frame in (data_vars, data_refs, data_timepoint_1, data_change,
               data_condition):
    _frame.dropna(inplace=True)

# Keep only the 2-4 word entries, judged on the FIRST column of each table.
# Note: data_refs is not passed through this filter.
data_vars = filter_dataframe_count(data_vars, data_vars.columns[0])
data_timepoint_1 = filter_dataframe_count(
    data_timepoint_1, data_timepoint_1.columns[0])
data_change = filter_dataframe_count(data_change, data_change.columns[0])
data_condition = filter_dataframe_count(
    data_condition, data_condition.columns[0])

# Convert each table into the (text, {"entities": [...]}) training format,
# one entity label per table.
data_var_training = from_csv_to_train_data(
    datadf=data_vars, label="Variable", colname="units_analyzed")
data_ref_training = from_csv_to_train_data(
    datadf=data_refs, label="Reference", colname="param_type")
data_timepoint_training = from_csv_to_train_data(
    datadf=data_timepoint_1, label="Timepoint", colname="target_duration")
data_change_training = from_csv_to_train_data(
    datadf=data_change, label="Change", colname="units")
data_condition_training = from_csv_to_train_data(
    datadf=data_condition, label="Condition", colname="name")


In [9]:
# @author Ervin Joa
#
# Main training loop:
#
#     For 20 rounds: draw a freshly shuffled mix of training data and
#     continue training the stored model on 200 entries of it.
num_training_iter = 1

# 20 rounds, since (per the original author) NER losses typically plateau
# after this many.
while num_training_iter <= 20:
    # The large sources are shuffled in full; only their first 200 entries
    # are used below.
    data_var_training_random = random.sample(
        data_var_training, k=len(data_var_training))
    data_change_training_random = random.sample(
        data_change_training, k=len(data_change_training))
    data_condition_training_random = random.sample(
        data_condition_training, k=len(data_condition_training))

    # Pool for this round: all reference, timepoint and seed items plus
    # 200 items from each of the shuffled sources.
    training_data = [
        *data_ref_training,
        *data_change_training_random[:200],
        *data_timepoint_training,
        *data_var_training_random[:200],
        *data_condition_training_random[:200],
        *train_data_init,
    ]

    # Shuffle the pool and train on its first 200 entries.
    data_to_train = random.sample(training_data, k=len(training_data))
    data_to_train_portion = data_to_train[:200]

    # Continue training and report the loss of this round.
    current_loss = resume_train_ner_model(data_to_train_portion)
    print(
        f"Done Training iteration epoch Nr. {num_training_iter} with loss of {current_loss}")
    num_training_iter += 1


Done Training iteration epoch Nr. 1 with loss of 2053.201431748315
Done Training iteration epoch Nr. 2 with loss of 1416.739391536687
Done Training iteration epoch Nr. 3 with loss of 1567.2378504293465
Done Training iteration epoch Nr. 4 with loss of 1450.479927350355
Done Training iteration epoch Nr. 5 with loss of 1045.8071162253398
Done Training iteration epoch Nr. 6 with loss of 2061.7157966570703
Done Training iteration epoch Nr. 7 with loss of 942.0956782751825
Done Training iteration epoch Nr. 8 with loss of 1024.360110468681
Done Training iteration epoch Nr. 9 with loss of 929.5616843582495
Done Training iteration epoch Nr. 10 with loss of 783.3708781466682
Done Training iteration epoch Nr. 11 with loss of 493.1286495073215
Done Training iteration epoch Nr. 12 with loss of 819.1708668245018
Done Training iteration epoch Nr. 13 with loss of 1254.6247862063992
Done Training iteration epoch Nr. 14 with loss of 1031.9637345997496
Done Training iteration epoch Nr. 15 with loss of 80

In [10]:
# For the usage of the SpaCy annotator

#nlp_annot = spacy.load("../training/ner_model")
#data_filtered_raw= pd.read_csv("../backend/input_data_filtered.csv")
#annotator = spa.Annotator(labels = ["Condition", "Reference", "Change", "Timepoint", "Variable"], model = nlp_annot)
#df_labels = annotator.annotate(df = data_filtered_raw[0:100], col_text = "PrimaryOutcomeMeasure", shuffle = True)


In [11]:
#print(list(df_labels["annotations"]))