# Install prerequisites

In [None]:
# install simpletransformers
# NOTE(review): the version is pinned; presumably because the TopKClassificationModel
# cell further down re-implements evaluate()/predict() from this release — confirm
# before upgrading.
!pip install simpletransformers==0.40.2

# check installed version
!pip freeze | grep simpletransformers

# Configuration

In [None]:
import os
import numpy as np

In [None]:
# --- locations -------------------------------------------------------------
# Every folder constant below ends with a separator, so file names can be
# appended directly; LOAD_MODEL_DIR and BEST_MODEL_DIR are the exceptions.
DIR_SEPARATOR = os.path.sep
BASE_PATH = '..' + DIR_SEPARATOR
# base work dir (output) and base data/models/results dir are the same here
BASE_WORK_PATH = BASE_PATH

# --- model selection -------------------------------------------------------
# Keys handled explicitly below: "multilingual", "slavic", "clinical".
# Any other value is used verbatim as the model name / path.
BERT_MODEL = "multilingual"

LOAD_MODEL_DIR = f'{BASE_PATH}output{DIR_SEPARATOR}{BERT_MODEL}{DIR_SEPARATOR}best'

# --- run switches ----------------------------------------------------------
USE_ORIGINAL_MODEL = True
USE_COLAB = False
USE_CUDA = True
HYPERPARAMSEARCH = False

DATA_FOLDER = f'{BASE_PATH}data{DIR_SEPARATOR}'
SAVED_MODELS_SUBFOLDER = f'{BASE_PATH}models{DIR_SEPARATOR}'

OUTPUT_DIR = f'{BASE_WORK_PATH}output{DIR_SEPARATOR}{BERT_MODEL}{DIR_SEPARATOR}'
TENSORBOARD_DIR = f'{BASE_PATH}runs{DIR_SEPARATOR}'
RESULTS_DIR = f'{BASE_PATH}results{DIR_SEPARATOR}'
BEST_MODEL_DIR = f'{OUTPUT_DIR}best'

# NUMBER_SIGNS is part of every data file name below
NUMBER_SIGNS = 4
# read by get_alt_labels(); flipped to True only for the test-set evaluation
USING_TEST_DF = False

# BERT_MODEL key -> (model id to load, name used for saved archives)
_KNOWN_MODELS = {
  "multilingual": ("bert-base-multilingual-cased", "bert-base-multilingual-cased"),
  "slavic": ("DeepPavlov/bert-base-bg-cs-pl-ru-cased", "bert-base-bg-cs-pl-ru-cased"),
  "clinical": ("emilyalsentzer/Bio_ClinicalBERT", "Bio_ClinicalBERT"),
}
ORIGINAL_MODEL, ORIGINAL_MODEL_FILE_NAME = _KNOWN_MODELS.get(BERT_MODEL, (BERT_MODEL, BERT_MODEL))

vocab_file = f'{DATA_FOLDER}classes-collapsed-{NUMBER_SIGNS}.csv'
dataset = f'{DATA_FOLDER}dataset-augmented-{NUMBER_SIGNS}.csv'

In [None]:
# Only relevant on Google Colab: mounts Drive at /content/drive.
# The import is kept inside the branch so the notebook still runs where
# google.colab is not installed.
if USE_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

# Load Data

In [None]:
# Column names used when reading / writing the dataset CSV files.
TEXT_COLUMN = 'Text'        # input text
LABEL_COLUMN = 'ICD10'      # raw label; replaced by PRED_CLASS in split_dataset()
PRED_CLASS = 'pred_class'   # integer position of the label inside class_list

In [None]:
from os import path

In [None]:
import pandas as pd

# The vocabulary file has no header row; its first column lists the classes.
df_vocab = pd.read_csv(vocab_file, header=None)

# Position in this list is the integer class id used everywhere below.
class_list = df_vocab[0].values.tolist()

In [None]:
len(class_list)

In [None]:
from sklearn.model_selection import train_test_split

def split_dataset(dataset):
  """
  Reads the full dataset CSV, encodes labels and writes a stratified train/dev/test split.

  Args:
      dataset: path of a comma-separated file with a header row that contains at
               least the TEXT_COLUMN and LABEL_COLUMN columns.

  Returns:
      (train_df, dev_df, test_df). LABEL_COLUMN is dropped from all three and
      replaced by PRED_CLASS, the position of the label inside class_list.

  Raises:
      ValueError: if a label in the file does not occur in class_list.

  Side effects:
      Writes train-<N>.csv, new-test-<N>.csv and new-dev-<N>.csv into DATA_FOLDER
      (N = NUMBER_SIGNS).
  """
  df = pd.read_csv(dataset, sep=',', header=0)

  # One dict lookup per row instead of a linear class_list.index() scan per row.
  # setdefault keeps the first position of a duplicated class, like list.index.
  class_to_index = {}
  for position, code in enumerate(class_list):
    class_to_index.setdefault(code, position)

  encoded = df[LABEL_COLUMN].map(class_to_index)
  unknown = df.loc[encoded.isna(), LABEL_COLUMN].unique()
  if len(unknown) > 0:
    # list.index raised ValueError for unknown labels; keep that contract
    raise ValueError(
      '{0} distinct label(s) in {1} are not in the class vocabulary, e.g. {2!r}'.format(
        len(unknown), dataset, unknown[0]))
  df[PRED_CLASS] = encoded.astype(int)
  df[TEXT_COLUMN] = df[TEXT_COLUMN].apply(str)

  df = df.drop(columns=[LABEL_COLUMN])

  # 10% of all rows become the test set ...
  train_dev_df, test_df = train_test_split(df,
                                stratify=df[PRED_CLASS],
                                random_state=42,
                                shuffle=True,
                                test_size=0.1)
  # ... and 11% of the remaining 90% (about 10% of all rows) become the dev set.
  train_df, dev_df = train_test_split(train_dev_df,
                                stratify=train_dev_df[PRED_CLASS],
                                random_state=42,
                                shuffle=True,
                                test_size=0.11)

  # Cache the split; the loading cell reuses these files when train-<N>.csv exists.
  train_df.to_csv('{0}train-{1}.csv'.format(DATA_FOLDER, NUMBER_SIGNS), sep=',', index=False, header=True)
  test_df.to_csv('{0}new-test-{1}.csv'.format(DATA_FOLDER, NUMBER_SIGNS), sep=',', index=False, header=True)
  dev_df.to_csv('{0}new-dev-{1}.csv'.format(DATA_FOLDER, NUMBER_SIGNS), sep=',', index=False, header=True)

  return train_df, dev_df, test_df

In [None]:
train_file = '{0}train-{1}.csv'.format(DATA_FOLDER, NUMBER_SIGNS)

if not path.exists(train_file):
  # No cached split on disk yet: create it (this also writes the three CSV files).
  train_df, dev_df, test_df = split_dataset(dataset)
else:
  # Reuse the cached split written by an earlier run of split_dataset().
  dev_file = '{0}new-dev-{1}.csv'.format(DATA_FOLDER, NUMBER_SIGNS)
  test_file = '{0}new-test-{1}.csv'.format(DATA_FOLDER, NUMBER_SIGNS)

  train_df = pd.read_csv(train_file, sep=',', header=0)
  dev_df = pd.read_csv(dev_file, sep=',', header=0)
  test_df = pd.read_csv(test_file, sep=',', header=0)

  # Same normalisation for all three frames: text as str, class id as int.
  for cached_df in (train_df, dev_df, test_df):
    cached_df[TEXT_COLUMN] = cached_df[TEXT_COLUMN].apply(str)
    cached_df[PRED_CLASS] = cached_df[PRED_CLASS].astype(int)

# Rename to the "text"/"labels" columns the model code reads and flatten newlines.
# dev/test additionally keep OLD_TEXT, which get_alt_labels() looks up.
train_df = pd.DataFrame({
  "text": train_df[TEXT_COLUMN].replace(r"\n", " ", regex=True),
  "labels": train_df[PRED_CLASS],
})
dev_df = pd.DataFrame({
  "text": dev_df[TEXT_COLUMN].replace(r"\n", " ", regex=True),
  "labels": dev_df[PRED_CLASS],
  'OLD_TEXT': dev_df['OLD_TEXT'],
})
test_df = pd.DataFrame({
  "text": test_df[TEXT_COLUMN].replace(r"\n", " ", regex=True),
  "labels": test_df[PRED_CLASS],
  'OLD_TEXT': test_df['OLD_TEXT'],
})

In [None]:
print(train_df.shape)
train_df.head()

In [None]:
print(dev_df.shape)
dev_df.head()

In [None]:
print(test_df.shape)
test_df.head()

In [None]:
# Lookup table: text -> array of every label that text carries in train + dev.
df_train_dev = pd.concat([train_df, dev_df[['text','labels']]])
label_groups = df_train_dev.groupby('text')
alt_labels_df = (
  label_groups['labels']
  .apply(lambda group: group.values)
  .reset_index()
  .set_index('text')
)

In [None]:
alt_labels_df.head()

In [None]:
def get_alt_labels(text):
  """
  Returns the labels recorded for `text` in alt_labels_df (train + dev).

  Only active while the global USING_TEST_DF is True; otherwise [] is returned.
  When `text` is not in the table, the OLD_TEXT of the matching test_df row is
  tried instead; if test_df has no such row the text is printed.

  Returns:
      The stored label array, or [] when nothing is found.
  """
  if not USING_TEST_DF:
    return []

  if text not in alt_labels_df.index:
    # lookup old text column
    # todo: support other dfs
    old_texts = test_df[test_df['text']==text]['OLD_TEXT'].values
    if len(old_texts) > 0:
      text = old_texts[0]
    else:
      print(text)

  if text in alt_labels_df.index:
    return alt_labels_df.at[text, 'labels']
  return []

In [None]:
get_alt_labels('#ефлукзус')

# Top K Classification Model

In [None]:
TOP_K = 5

In [None]:
def top_k(array):
  """Returns the indices of the TOP_K largest entries of `array`, best first."""
  descending_order = np.argsort(-array)
  return descending_order[:TOP_K]

# Simple Transformers - https://github.com/ThilinaRajapakse/simpletransformers
Copyright ThilinaRajapakse/simpletransformers

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0

In [None]:
from __future__ import absolute_import, division, print_function
from simpletransformers.classification import ClassificationModel

import json
import math
import os
import random
import warnings
from multiprocessing import cpu_count
import logging

import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import (
    confusion_matrix,
    label_ranking_average_precision_score,
    matthews_corrcoef,
    mean_squared_error,
)
from tqdm.auto import tqdm, trange

import torch
from simpletransformers.experimental.classification.classification_utils import (
    InputExample,
    convert_examples_to_features,
)
from tensorboardX import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

In [None]:
logging.basicConfig(filename='training.log',level=logging.DEBUG)

In [None]:
class TopKClassificationModel(ClassificationModel):
    """
    ClassificationModel whose predictions are a ranked list of class ids per input.

    With topK == 1 the prediction is the arg-max class id (1-D array). For any
    other value each row of scores is ranked with the notebook-level top_k() helper.
    """

    def __init__(
        self, model_type, model_name, topK, num_labels=None, weight=None, args=None, use_cuda=True, cuda_device=-1, **kwargs,
    ):
        """
        Builds the underlying ClassificationModel and stores the ranking mode.

        Args:
            topK: 1 selects arg-max predictions, anything else ranked predictions.
            All other arguments are forwarded unchanged to ClassificationModel.__init__.
        """
        super().__init__(model_type, model_name, num_labels, weight, args, use_cuda, cuda_device, **kwargs)
        self.topK = topK

    def _rank_predictions(self, logits):
        """Converts a 2-D score array (one row per input) into predictions according to self.topK."""
        if self.topK == 1:
            return np.argmax(logits, axis=1)
        # NOTE(review): top_k() truncates to the notebook-level TOP_K rather than self.topK;
        # every model in this notebook is built with topK=TOP_K, so the two agree.
        return np.apply_along_axis(top_k, axis=1, arr=logits)

    def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", verbose=True, silent=False, **kwargs):
        """
        Evaluates the model on eval_df.

        Utility function to be used by the eval_model() method. Not intended to be used directly.

        Args:
            eval_df: DataFrame with "text"/"labels" (or "text_a"/"text_b"/"labels") columns,
                     or a str that is handed to LazyClassificationDataset.
            output_dir: directory that receives eval_results.txt (created if missing).
            **kwargs: extra metric callables, forwarded to compute_metrics().

        Returns:
            results: dict of metrics including "eval_loss".
            model_outputs: raw model scores, one row per example.
            wrong: examples whose first-ranked prediction differs from the label.
        """

        model = self.model
        args = self.args
        eval_output_dir = output_dir

        results = {}
        if isinstance(eval_df, str):
            # The notebook never imports this name, so this branch used to fail with NameError.
            # NOTE(review): import path assumed for the pinned simpletransformers release — confirm.
            from simpletransformers.classification.classification_utils import LazyClassificationDataset

            eval_dataset = LazyClassificationDataset(eval_df, self.tokenizer, self.args)
            eval_examples = None
        else:
            if "text" in eval_df.columns and "labels" in eval_df.columns:
                eval_examples = [
                    InputExample(i, text, None, label)
                    for i, (text, label) in enumerate(zip(eval_df["text"].astype(str), eval_df["labels"]))
                ]
            elif "text_a" in eval_df.columns and "text_b" in eval_df.columns:
                eval_examples = [
                    InputExample(i, text_a, text_b, label)
                    for i, (text_a, text_b, label) in enumerate(
                        zip(eval_df["text_a"].astype(str), eval_df["text_b"].astype(str), eval_df["labels"])
                    )
                ]
            else:
                warnings.warn(
                    "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
                )
                eval_examples = [
                    InputExample(i, text, None, label)
                    for i, (text, label) in enumerate(zip(eval_df.iloc[:, 0], eval_df.iloc[:, 1]))
                ]

            eval_dataset = self.load_and_cache_examples(
                eval_examples, evaluate=True, verbose=verbose, silent=silent
            )
        os.makedirs(eval_output_dir, exist_ok=True)

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        # Per-batch arrays are collected and joined once at the end instead of
        # re-copying the growing array with np.append on every batch.
        logit_batches = []
        label_batches = []
        model.eval()

        for batch in tqdm(eval_dataloader, disable=args.silent or silent):
            with torch.no_grad():
                inputs = self._get_inputs_dict(batch)

                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()

            nb_eval_steps += 1

            logit_batches.append(logits.detach().cpu().numpy())
            label_batches.append(inputs["labels"].detach().cpu().numpy())

        eval_loss = eval_loss / nb_eval_steps

        model_outputs = np.concatenate(logit_batches, axis=0)
        out_label_ids = np.concatenate(label_batches, axis=0)

        preds = self._rank_predictions(model_outputs)

        result, wrong = self.compute_metrics(preds, out_label_ids, eval_examples, **kwargs)
        result["eval_loss"] = eval_loss
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))

        return results, model_outputs, wrong

    def compute_metrics(self, preds, labels, eval_examples=None, multi_label=False, **kwargs):
        """
        Computes the evaluation metrics for the model predictions.

        Args:
            preds: Model predictions; either 1-D class ids (topK == 1) or a 2-D array of ranked class ids.
            labels: Ground truth labels
            eval_examples: List of examples on which evaluation was performed
            **kwargs: Additional metrics (name of metric: function to use). Each callable is
                        invoked as func(labels, preds, eval_examples); non-callable values are ignored.

        Returns:
            result: Dictionary with "mcc" (Matthews correlation coefficient of the first-ranked
                    prediction) plus one entry per extra metric.
            wrong: List of InputExample objects whose first-ranked prediction is incorrect,
                   or ["NA"] when no examples were supplied.
        """  # noqa: ignore flake8"

        assert len(preds) == len(labels)

        extra_metrics = {}
        for metric, func in kwargs.items():
            if callable(func):
                extra_metrics[metric] = func(labels, preds, eval_examples)

        # preds is 1-D when topK == 1 (arg-max) and 2-D otherwise; indexing [:, 0]
        # unconditionally raised IndexError in the 1-D case.
        top1 = np.asarray(preds)
        if top1.ndim > 1:
            top1 = top1[:, 0]

        mismatched = labels != top1

        if eval_examples:
            wrong = [example for (example, is_wrong) in zip(eval_examples, mismatched) if is_wrong.any()]
        else:
            wrong = ["NA"]

        mcc = matthews_corrcoef(labels, top1)

        return {**{"mcc": mcc}, **extra_metrics}, wrong

    def predict(self, to_predict, multi_label=False):
        """
        Performs predictions on a list of text.

        Args:
            to_predict: A python list of text (str) to be sent to the model for prediction.
                        Items may also be [text_a, text_b] lists for sentence pairs.

        Returns:
            preds: Class ids per text; 1-D when topK == 1, otherwise one ranked row per text.
            model_outputs: The raw model outputs for each text.
            all_embedding_outputs, all_layer_hidden_states: returned in addition, only when
                self.config.output_hidden_states is set.
        """

        model = self.model
        args = self.args

        self._move_model_to_device()

        if isinstance(to_predict[0], list):
            eval_examples = [InputExample(i, text[0], text[1], 0) for i, text in enumerate(to_predict)]
        else:
            eval_examples = [InputExample(i, text, None, 0) for i, text in enumerate(to_predict)]

        eval_dataset = self.load_and_cache_examples(
            eval_examples, evaluate=True, multi_label=multi_label, no_cache=True
        )

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        want_hidden_states = self.config.output_hidden_states

        logit_batches = []
        embedding_batches = []
        hidden_state_batches = []

        for batch in tqdm(eval_dataloader, disable=args.silent):
            model.eval()

            with torch.no_grad():
                inputs = self._get_inputs_dict(batch)
                outputs = model(**inputs)
                logits = outputs[1]  # outputs[0] is the loss, which predict() does not report

                logit_batches.append(logits.detach().cpu().numpy())
                if want_hidden_states:
                    embedding_outputs, layer_hidden_states = outputs[2][0], outputs[2][1:]
                    embedding_batches.append(embedding_outputs.detach().cpu().numpy())
                    hidden_state_batches.append(
                        np.array([state.detach().cpu().numpy() for state in layer_hidden_states])
                    )

        model_outputs = np.concatenate(logit_batches, axis=0)
        preds = self._rank_predictions(model_outputs)

        if want_hidden_states:
            # hidden states are stacked per layer, so batches are joined along axis 1
            all_layer_hidden_states = np.concatenate(hidden_state_batches, axis=1)
            all_embedding_outputs = np.concatenate(embedding_batches, axis=0)
            return preds, model_outputs, all_embedding_outputs, all_layer_hidden_states
        else:
            return preds, model_outputs

# Custom Metrics

In [None]:
def acc_topK_multiclass_multiple_correct_row(label_list, preds):
    """
    Returns the 0-based rank of the first of the TOP_K predictions that is in
    label_list, or -1 when none of them is.
    """
    hits = (rank for rank in range(TOP_K) if preds[rank] in label_list)
    return next(hits, -1)

In [None]:
def acc_topK_multiple_correct(labels, preds, eval_examples):
  """
  Per-rank hit rates of the ranked predictions.

  Args:
      labels: true class id per row.
      preds: ranked predicted class ids per row (at least TOP_K entries each).
      eval_examples: examples aligned with the rows; example.text_a is used to
                     look up alternative labels via get_alt_labels().

  Returns:
      (acc_topK, alt_acc_topK): two dicts keyed by rank 0..TOP_K-1. The value at
      rank i is the fraction of rows whose first correct prediction sits at
      exactly rank i (not cumulative). acc_topK counts only the true label;
      alt_acc_topK also accepts the alternative labels. All values are 0 when
      there are no rows.
  """
  acc_accum = {rank: 0 for rank in range(TOP_K)}
  alt_acc_accum = {rank: 0 for rank in range(TOP_K)}

  number_rows = len(preds)
  for preds_row, label, example in zip(preds, labels, eval_examples):
    alt_label_list = get_alt_labels(example.text_a)
    if len(alt_label_list) > 0:
      # np.append returns a new array; the result used to be discarded, so the
      # true label was never part of the accepted set.
      alt_label_list = np.append(alt_label_list, label)
    else:
      alt_label_list = [label]

    acc = acc_topK_multiclass_multiple_correct_row([label], preds_row)
    alt_acc = acc_topK_multiclass_multiple_correct_row(alt_label_list, preds_row)

    if acc > -1:
      acc_accum[acc] += 1
    if alt_acc > -1:
      alt_acc_accum[alt_acc] += 1

  if number_rows == 0:
    # nothing to average over; report zeros instead of dividing by zero
    return acc_accum, alt_acc_accum

  acc_topK = {rank: acc_accum[rank] / number_rows for rank in range(TOP_K)}
  alt_acc_topK = {rank: alt_acc_accum[rank] / number_rows for rank in range(TOP_K)}
  return acc_topK, alt_acc_topK

In [None]:
def acc_mrr_multiclass_multiple_correct_row(label_list, preds):
    """
    Reciprocal rank of the first of the TOP_K predictions that is in label_list
    (1 for the first position, 1/2 for the second, ...), or 0 when none is.
    """
    reciprocal_ranks = (1/(rank+1) for rank in range(TOP_K) if preds[rank] in label_list)
    return next(reciprocal_ranks, 0)

In [None]:
def acc_mrr_multiclass_multiple_correct(labels, preds, eval_examples):
  """
  Mean reciprocal rank of the ranked predictions, accepting alternative labels.

  Args:
      labels: true class id per row.
      preds: ranked predicted class ids per row (at least TOP_K entries each).
      eval_examples: examples aligned with the rows; example.text_a is used to
                     look up alternative labels via get_alt_labels().

  Returns:
      The mean over all rows of the reciprocal rank of the first prediction that
      is either the true label or one of its alternative labels; 0.0 when there
      are no rows.
  """
  number_rows = len(preds)
  if number_rows == 0:
    return 0.0

  alt_mrr_accum = 0
  for preds_row, label, example in zip(preds, labels, eval_examples):
    alt_label_list = get_alt_labels(example.text_a)
    if len(alt_label_list) > 0:
      # np.append returns a new array; the result used to be discarded, so the
      # true label was never part of the accepted set.
      alt_label_list = np.append(alt_label_list, label)
    else:
      alt_label_list = [label]

    alt_mrr_accum += acc_mrr_multiclass_multiple_correct_row(alt_label_list, preds_row)

  return alt_mrr_accum / number_rows

# File Helpers

In [None]:
import os
import tarfile
import shutil
from os import path

def save_model(model_path='',file_name=''):
  """
  Packs the files directly inside model_path into <file_name>.tar.gz.

  Sub-directories of model_path are not included. Members are stored under the
  path they were added with (model_path + separator + file name).

  Args:
      model_path: directory that holds the model files.
      file_name: archive path without the '.tar.gz' suffix.

  Raises:
      FileNotFoundError: if model_path is not an existing directory.
  """
  if not os.path.isdir(model_path):
    raise FileNotFoundError('Model directory not found: {0}'.format(model_path))

  # next() reads only the top level instead of materialising the whole recursive walk
  _, _, top_level_files = next(os.walk(model_path), (model_path, [], []))

  with tarfile.open(file_name + '.tar.gz', 'w:gz') as archive:
    for file in top_level_files:
      archive.add(f'{model_path}{os.sep}{file}')

In [None]:
def save_copy_model(model_path, file_name):
  """
  Archives model_path as <file_name>.tar.gz and places the archive in
  SAVED_MODELS_SUBFOLDER (created when missing). The temporary archive written
  next to the notebook is removed afterwards.
  """
  archive_name = file_name+'.tar.gz'
  save_model(model_path, file_name)

  if not path.exists(SAVED_MODELS_SUBFOLDER):
    os.makedirs(SAVED_MODELS_SUBFOLDER)

  destination = '{0}{1}'.format(SAVED_MODELS_SUBFOLDER, archive_name)
  shutil.copy(archive_name, destination)
  os.remove(archive_name)

In [None]:
def cleanup_training_checkpoints(model_path, leave_best_model = False):
  """
  Deletes everything directly inside model_path; the folder itself is kept.

  Args:
      model_path: training output directory. Nothing happens when it does not
                  exist or is not a directory.
      leave_best_model: when True, the sub-directory named 'best' is kept.
  """
  if not os.path.isdir(model_path):
    return

  # only the top level is needed; rmtree takes care of nested content
  _, subdirs, files = next(os.walk(model_path), (model_path, [], []))

  for subdir in subdirs:
    if leave_best_model and subdir == 'best':
      continue
    shutil.rmtree(os.path.join(model_path, subdir))

  for file in files:
    os.remove(os.path.join(model_path, file))

In [None]:
def unpack_model(model_name=''):
  """
  Extracts <model_name>.tar.gz into the current working directory.

  Args:
      model_name: archive path without the '.tar.gz' suffix.
  """
  # The context manager closes the archive even when extraction fails.
  # NOTE(review): extractall() trusts the member paths stored in the archive;
  # only unpack archives produced by this notebook.
  with tarfile.open(f"{model_name}.tar.gz", "r:gz") as tar:
    tar.extractall()

# Hyperparameter Selection

In [None]:
# Grid explored by the search cell below: every batch size is combined with
# every learning rate, each combination trained for MAX_NUM_TRAIN_EPOCHS epochs.
BATCH_SIZES = [8, 16, 32, 64, 128]
MAX_SEQ_LENGTH = 64
MAX_NUM_TRAIN_EPOCHS = 1
LEARNING_RATES = [3e-4, 1e-4, 5e-5, 3e-5] 
# The search evaluates on the dev set, so get_alt_labels() stays disabled.
USING_TEST_DF = False

In [None]:
# Grid search over BATCH_SIZES x LEARNING_RATES, ranked by the dev-set 'accmrr' metric.
# `results` collects every run; `best_result` ends up holding the best one.
results = []
best_result = { }

if HYPERPARAMSEARCH:

    # The per-run progress CSV is copied here inside the loop, i.e. before any
    # archive has been written, so the folder has to exist up front.
    os.makedirs(SAVED_MODELS_SUBFOLDER, exist_ok=True)

    for BATCH_SIZE in BATCH_SIZES:
        for LEARNING_RATE in LEARNING_RATES:
            CURRENT_OUTPUT_DIR = '{0}{1}_{2}'.format(OUTPUT_DIR, str(BATCH_SIZE), str(LEARNING_RATE))
            print(CURRENT_OUTPUT_DIR)
            # define hyperparameter
            train_args ={"output_dir": CURRENT_OUTPUT_DIR,
                        "reprocess_input_data": True,
                        "overwrite_output_dir": True,
                        "use_cached_eval_features": True,
                        "fp16":False,
                        "max_seq_length": MAX_SEQ_LENGTH,
                        "train_batch_size": BATCH_SIZE,
                        "eval_batch_size": BATCH_SIZE,
                        "learning_rate":  LEARNING_RATE,
                        "num_train_epochs": MAX_NUM_TRAIN_EPOCHS,
                        "tensorboard_dir": TENSORBOARD_DIR,
                        "save_model_every_epoch": True,
                        "save_steps": 0,
                        "evaluate_during_training": True,
                        "evaluate_during_training_verbose": True,
                        "evaluate_during_training_silent": False,
                        "early_stopping_metric_minimize": False,
                        "early_stopping_metric": 'accmrr',
                        "early_stopping_patience": 3,
                        "early_stopping_delta": 0.005,
                        "use_early_stopping": True,
                        "evaluate_during_training_steps": 0,
                        "early_stopping_consider_epochs": True,
                        "best_model_dir": BEST_MODEL_DIR,
                        "sliding_window": False,
                        "use_multiprocessing": False,
                        "labels_list": None,
                        "labels_map": None}

            # Create a ClassificationModel
            model = TopKClassificationModel(
                "bert", ORIGINAL_MODEL if USE_ORIGINAL_MODEL else LOAD_MODEL_DIR,
                topK=TOP_K,
                num_labels=len(class_list),
                args=train_args,
                use_cuda = USE_CUDA
            )

            # train
            print("Start training for batch size {batch}, learning rate {lr}".format(batch = BATCH_SIZE, lr = LEARNING_RATE))
            model.train_model(train_df, eval_df = dev_df, verbose=True, accmrr=acc_mrr_multiclass_multiple_correct, acctopK=acc_topK_multiple_correct)

            # evaluate on dev set
            print("Start evaluation for batch size {batch}, learning rate {lr}".format(batch = BATCH_SIZE, lr = LEARNING_RATE))
            result, model_outputs, wrong_predictions = model.eval_model(dev_df, accmrr=acc_mrr_multiclass_multiple_correct, acctopK=acc_topK_multiple_correct)

            # record eval result
            full_result = {
                'params': {
                    'batch_size': BATCH_SIZE,
                    'learning_rate': LEARNING_RATE,
                    'epochs': 'best' # best out of MAX_NUM_TRAIN_EPOCHS
                },
                'result': result
            }

            results.append(full_result)
            print("Evaluation result for batch size {batch}, learning rate {lr}".format(batch = BATCH_SIZE, lr = LEARNING_RATE))
            print(full_result)

            # if best result - save the model
            if 'result' not in best_result or best_result['result']['accmrr'] < result['accmrr']:
                best_result = full_result
                # Every run is configured with the same best_model_dir, so the
                # winning model has to be archived now, under its own
                # hyperparameters, before a later run can replace it.
                # Archives of earlier, since-beaten runs are left in place.
                save_model_file = '{0}_{1}_{2}_best'.format(ORIGINAL_MODEL_FILE_NAME, str(BATCH_SIZE), str(LEARNING_RATE))
                save_copy_model(BEST_MODEL_DIR, save_model_file)

            training_progress_file = '{0}{1}training_progress_scores.csv'.format(CURRENT_OUTPUT_DIR, DIR_SEPARATOR)
            training_progress_file_target = '{0}training_progress_scores_{1}_{2}.csv'.format(SAVED_MODELS_SUBFOLDER, str(BATCH_SIZE), str(LEARNING_RATE))

            if path.exists(training_progress_file):
                shutil.copy(training_progress_file, training_progress_file_target)

            cleanup_training_checkpoints(CURRENT_OUTPUT_DIR)

    # Remove leftovers but keep the 'best' folder.
    # NOTE(review): that folder presumably holds the model of the last run, not
    # necessarily the overall winner; the overall winner is the archive whose
    # name matches best_result['params'].
    cleanup_training_checkpoints(OUTPUT_DIR, True)

In [None]:
import time
import json

# One timestamped JSON file per execution of this cell.
RESULTS_FILE_PATH = '{0}results_hyperparameters_{1}.txt'.format(RESULTS_DIR, str(time.time()))

# Create the folder instead of silently discarding the results when it is missing.
os.makedirs(RESULTS_DIR, exist_ok=True)
with open(RESULTS_FILE_PATH, 'w') as outfile:
    json.dump(results, outfile)

print('Full results:')
print(results)
print('Best result:')
print(best_result)

# Load BERT Model

In [None]:
# FIRST_TIME_TRAINING decides below whether training starts from the base model
# or continues from BEST_MODEL_DIR.
FIRST_TIME_TRAINING = True
MAX_SEQ_LENGTH = 64
NUM_TRAINING_EPOCHS = 10

# Fixed per-model hyperparameters. To reuse the outcome of the search cell instead:
#   BATCH_SIZE = best_result['params']['batch_size']
#   LEARNING_RATE = best_result['params']['learning_rate']
# "slavic" and every other non-multilingual model share the same values.
BATCH_SIZE, LEARNING_RATE = (64, 0.0001) if BERT_MODEL.startswith("multilingual") else (16, 5e-5)

In [None]:
# Which weights to start from:
#   continuing a previous training -> the saved best model
#   first training                 -> the published base model, or a local copy of it
if not FIRST_TIME_TRAINING:
  BERT_LOAD_MODEL = BEST_MODEL_DIR
elif USE_ORIGINAL_MODEL:
  BERT_LOAD_MODEL = ORIGINAL_MODEL
else:
  BERT_LOAD_MODEL = LOAD_MODEL_DIR
print(BERT_LOAD_MODEL)

In [None]:
# define hyperparameter
# Settings for the final training run. The same dictionary layout is used by the
# hyperparameter search cell; only output_dir, the batch size, the learning rate
# and the epoch count differ.
train_args ={"output_dir": OUTPUT_DIR,
             "reprocess_input_data": True,
             "overwrite_output_dir": True,
             "use_cached_eval_features": True,
             "fp16":False,
             "max_seq_length": MAX_SEQ_LENGTH,
             "train_batch_size": BATCH_SIZE,
             "eval_batch_size": BATCH_SIZE,
             "learning_rate":  LEARNING_RATE,
             "num_train_epochs": NUM_TRAINING_EPOCHS,
             "tensorboard_dir": TENSORBOARD_DIR,
             "save_model_every_epoch": True,
             "save_steps": 0,
             "evaluate_during_training": True, 
             "evaluate_during_training_verbose": True,
             "evaluate_during_training_silent": False,
             # Early stopping follows the custom 'accmrr' metric that is handed to
             # train_model() as a keyword argument; it is not minimized, i.e.
             # higher is better.
             "early_stopping_metric_minimize": False,
             "early_stopping_metric": 'accmrr',
             "early_stopping_patience": 3,
             "early_stopping_delta": 0.005,
             "use_early_stopping": True, 
             "evaluate_during_training_steps": 0,
             "early_stopping_consider_epochs": True, 
             "best_model_dir": BEST_MODEL_DIR,
             "sliding_window": False,
             "use_multiprocessing": False,
             "labels_list": None,
             "labels_map": None}

# Create a ClassificationModel
# One output class per entry of class_list; predictions are ranked top-TOP_K lists.
model = TopKClassificationModel("bert", BERT_LOAD_MODEL, topK=TOP_K, num_labels=len(class_list), args=train_args, use_cuda = USE_CUDA)

# Train & Evaluate

In [None]:
# Train the model
# USING_TEST_DF is read by get_alt_labels(); while it is False that helper
# returns [], so the metrics only accept the row's own label.
USING_TEST_DF = False
model.train_model(train_df, eval_df = dev_df, verbose=True, accmrr=acc_mrr_multiclass_multiple_correct, acctopK=acc_topK_multiple_correct)

In [None]:
# Dev-set evaluation: alternative labels stay disabled (see get_alt_labels()).
USING_TEST_DF = False
result, model_outputs, wrong_predictions = model.eval_model(dev_df, accmrr=acc_mrr_multiclass_multiple_correct, acctopK=acc_topK_multiple_correct)

print("Evalution results on dev set for final epoch:")
result

In [None]:
# Test-set evaluation: enables get_alt_labels(), so labels seen for the same
# text in train/dev also count as correct.
# NOTE: result / model_outputs / wrong_predictions from the dev evaluation are
# overwritten here; the "Result Analysis" cells rely on these test-set values
# and on USING_TEST_DF still being True.
USING_TEST_DF = True
result, model_outputs, wrong_predictions = model.eval_model(test_df, accmrr=acc_mrr_multiclass_multiple_correct, acctopK=acc_topK_multiple_correct)
print("Evalution results on test set for final epoch:")
result

In [None]:
# save the trained model
# Archives the content of BEST_MODEL_DIR into SAVED_MODELS_SUBFOLDER; the archive
# name records the base model, batch size and learning rate.
save_model_file = '{0}_{1}_{2}_best'.format(ORIGINAL_MODEL_FILE_NAME, str(BATCH_SIZE), str(LEARNING_RATE))
save_copy_model(BEST_MODEL_DIR, save_model_file)

In [None]:
cleanup_training_checkpoints(OUTPUT_DIR)

# Result Analysis

In [40]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score

In [41]:
# Rank the raw scores of the most recent eval_model() call (the test set, when
# the cells are run in order) into TOP_K class ids per row.
preds = np.apply_along_axis(top_k, axis=1, arr=model_outputs)

In [42]:
df_preds = pd.DataFrame(preds)
df_preds.head()

Unnamed: 0,0,1,2,3,4
0,1613,4256,1067,2781,6238
1,1067,1613,6238,4256,2781
2,1613,6238,2781,1067,3665
3,3665,6238,1613,4256,696
4,2536,3320,4295,696,277


In [43]:
y_true = np.array(test_df['labels'])
y_pred = np.array(preds[:,0])

In [44]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb_ns = MultiLabelBinarizer(classes=range(len(class_list)), sparse_output=False)

In [45]:
# Build "lenient" targets for the F1 scores below: when the top-1 prediction is
# any accepted label of a row, the target is set to that prediction.
# get_alt_labels() only returns alternatives while USING_TEST_DF is True (set in
# the test-set evaluation cell above).
label_lists = []
pred_lists = []
for i in range(len(test_df)):
  pred_value = preds[i,0]
  pred_lists.append([pred_value])
  y_pred[i] = pred_value
  label_list = get_alt_labels(test_df['text'].values[i])  
  # the row's own label is appended last, so label_list[0] is the first
  # alternative label when there is one and the row's own label otherwise
  label_list = np.append(label_list, test_df['labels'].values[i])
  if pred_value in label_list:
    label_lists.append([pred_value])
    y_true[i] = pred_value
  else:
    label_lists.append([label_list[0]])
    y_true[i] = label_list[0]  

In [46]:
# One-hot encodings over the full class range.
# NOTE(review): neither variable is used by the cells visible below.
y_true_one_hot_vectors = mlb_ns.fit_transform(label_lists)
y_pred_one_hot_vectors = mlb_ns.fit_transform(pred_lists)

In [None]:
from sklearn.metrics import f1_score
print(f1_score(y_true, y_pred, average='micro'))
print(f1_score(y_true, y_pred, average='macro'))
print(f1_score(y_true, y_pred, average='weighted'))

# Predict

In [None]:
# Example inputs (Bulgarian diagnosis texts) for a quick manual check.
pred_sentences = [
  "Акроцефалополисиндактилия",
  "захарен диабет тип 2",
  "хипотироидизъм",
  "oстра дихателна недостатъчност",
  "конвулсивни припадъци"
]

# One predict() call per sentence; predictions[0] holds the ranked class indices
# for that sentence, which are mapped back to class names through class_list.
for sentence in pred_sentences:
  predictions, raw_outputs = model.predict([sentence])
  for prediction in predictions[0]:
    print(class_list[prediction])