# Install prerequisites

In [1]:
# install simpletransformers
!pip install simpletransformers

# check installed version
!pip freeze | grep simpletransformers



'grep' is not recognized as an internal or external command,
operable program or batch file.


# Configuration

In [0]:
# Mount Google Drive at /content/drive (Colab only); BASE_PATH below points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Project root on the mounted Drive and the separator used to assemble every path below.
BASE_PATH = '/content/drive/My Drive/DiagnosisToCode/'
DIR_SEPARATOR = '/'
BERT_MODEL = "multilingual" # "slavic"

# Sub-folders of BASE_PATH; each one already ends with the separator.
DATA_SUBFOLDER = f'data{DIR_SEPARATOR}'
SAVED_MODELS_SUBFOLDER = f'models{DIR_SEPARATOR}'

# Working directory for checkpoints, TensorBoard log folder and the folder of the best checkpoint.
OUTPUT_DIR = DIR_SEPARATOR.join(['', 'content', BERT_MODEL + '_bert', ''])
TENSORBOARD_DIR = f'{BASE_PATH}runs{DIR_SEPARATOR}'
BEST_MODEL_DIR = f'{OUTPUT_DIR}best'

# Pretrained checkpoint to start from and the stem used when naming saved archives.
if BERT_MODEL != "multilingual":
  ORIGINAL_MODEL = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"
  ORIGINAL_MODEL_FILE_NAME = "bert-base-bg-cs-pl-ru-cased"
else:
  ORIGINAL_MODEL = "bert-base-multilingual-cased"
  ORIGINAL_MODEL_FILE_NAME = ORIGINAL_MODEL

# Load Data

In [0]:
# CSV column holding the input text and CSV column holding the target code.
TEXT_COLUMN = 'disease'
LABEL_COLUMN = 'code'

In [0]:
import pandas as pd

# vocab.csv is read without a header row; its first column lists the target codes.
vocab_file = BASE_PATH + DATA_SUBFOLDER + 'vocab.csv'
df_vocab = pd.read_csv(vocab_file, header=None)

# The position of a code in class_list is the integer class id used for training and prediction.
class_list = df_vocab[0].values.tolist()

In [5]:
train_df = pd.read_csv(BASE_PATH + DATA_SUBFOLDER + 'train.csv', sep=',', header=0)

# Build the code -> class id lookup once instead of scanning class_list for
# every row. setdefault keeps the first position of a repeated code, which is
# what list.index returned.
class_index = {}
for position, label in enumerate(class_list):
  class_index.setdefault(label, position)

# A code that is missing from the vocabulary raises KeyError naming that code.
train_df['pred_class'] = [class_index[label] for label in train_df[LABEL_COLUMN]]

# Force every text to str (missing values become the string 'nan').
train_df[TEXT_COLUMN] = train_df[TEXT_COLUMN].apply(str)
# Keep only text and class id, in the column order the model falls back to.
train_df = train_df[[TEXT_COLUMN, 'pred_class']]

print(train_df.shape)
train_df.head()

(61274, 2)


Unnamed: 0,disease,pred_class
0,холер,0
1,тиф паратиф,1
2,салмонелоз инфекци,2
3,шигелоз,3
4,бактериал чревн инфекци,4


In [6]:
dev_df = pd.read_csv(BASE_PATH + DATA_SUBFOLDER + 'dev.csv', sep=',', header=0)

# Build the code -> class id lookup once instead of scanning class_list for
# every row. setdefault keeps the first position of a repeated code, which is
# what list.index returned.
class_index = {}
for position, label in enumerate(class_list):
  class_index.setdefault(label, position)

# A code that is missing from the vocabulary raises KeyError naming that code.
dev_df['pred_class'] = [class_index[label] for label in dev_df[LABEL_COLUMN]]

# Force every text to str (missing values become the string 'nan').
dev_df[TEXT_COLUMN] = dev_df[TEXT_COLUMN].apply(str)
# Keep only text and class id, in the column order the model falls back to.
dev_df = dev_df[[TEXT_COLUMN, 'pred_class']]

print(dev_df.shape)
dev_df.head()

(6712, 2)


Unnamed: 0,disease,pred_class
0,célera vibrio cholerae,0
1,manifestationes et morbis infectiosis,0
2,инфекци холер vibrio,0
3,cholera,0
4,morbis infectiosis,0


In [7]:
test_df = pd.read_csv(BASE_PATH + DATA_SUBFOLDER + 'test.csv', sep=',', header=0)

# Build the code -> class id lookup once instead of scanning class_list for
# every row. setdefault keeps the first position of a repeated code, which is
# what list.index returned.
class_index = {}
for position, label in enumerate(class_list):
  class_index.setdefault(label, position)

# A code that is missing from the vocabulary raises KeyError naming that code.
test_df['pred_class'] = [class_index[label] for label in test_df[LABEL_COLUMN]]

# Force every text to str (missing values become the string 'nan').
test_df[TEXT_COLUMN] = test_df[TEXT_COLUMN].apply(str)
# Keep only text and class id, in the column order the model falls back to.
test_df = test_df[[TEXT_COLUMN, 'pred_class']]

print(test_df.shape)
test_df.head()

(7486, 2)


Unnamed: 0,disease,pred_class
0,cã²lera,0
1,morbi,0
2,plasmodium orci,0
3,intestinorum morbis infectivis morbus,0
4,холер предизвика холер вибрион 01 биовар eltor,0


# Top K Classification Model

In [0]:
# Number of highest-scoring classes kept per prediction and scored by the top-K metrics.
TOP_K = 5

In [0]:
def top_k(array, k=None):
  """Return the indices of the k largest values of a score vector, best first.

  Args:
    array: NumPy array of scores; intended for a single 1-D row, as passed by
      np.apply_along_axis.
    k: number of indices to return. Defaults to the notebook-wide TOP_K, so
      existing one-argument calls behave as before.

  Returns:
    Integer NumPy array holding at most k indices, ordered by descending score.
  """
  if k is None:
    k = TOP_K
  # Negating the scores turns the ascending argsort into a descending ranking.
  return (-array).argsort()[:k]

# Simple Transformers - https://github.com/ThilinaRajapakse/simpletransformers
Copyright ThilinaRajapakse/simpletransformers

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0

In [0]:
from simpletransformers.classification import ClassificationModel

from __future__ import absolute_import, division, print_function

import json
import math
import os
import random
import warnings
from multiprocessing import cpu_count
import logging

import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import (
    confusion_matrix,
    label_ranking_average_precision_score,
    matthews_corrcoef,
    mean_squared_error,
)
from tqdm.auto import tqdm, trange

import torch
from simpletransformers.experimental.classification.classification_utils import (
    InputExample,
    convert_examples_to_features,
)
from tensorboardX import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

In [0]:
# Write log records of level DEBUG and above to the file training.log.
logging.basicConfig(filename='training.log',level=logging.DEBUG)

In [0]:
class TopKClassificationModel(ClassificationModel):
    """ClassificationModel that reports the ``topK`` highest-scoring classes per input.

    ``evaluate``, ``compute_metrics`` and ``predict`` are overridden so that
    predictions are class indices ranked by descending score instead of a
    single argmax. With ``topK == 1`` the predictions stay a 1-D argmax array.
    """

    def __init__(
        self, model_type, model_name, topK, num_labels=None, weight=None, args=None, use_cuda=True, cuda_device=-1,
    ):
        """
        Creates the model.

        Args:
            topK: Number of ranked class indices to produce per input.
            model_type, model_name, num_labels, weight, args, use_cuda, cuda_device:
                Forwarded unchanged to ClassificationModel.__init__.
        """
        super().__init__(model_type, model_name, num_labels, weight, args, use_cuda, cuda_device)
        self.topK = topK

    def _rank_predictions(self, logits):
        """
        Turns raw scores of shape (n_samples, n_classes) into predictions.

        Returns a 1-D array of argmax indices when topK == 1, otherwise a
        (n_samples, topK) array of class indices ordered by descending score.
        """
        if self.topK == 1:
            return np.argmax(logits, axis=1)
        # Negating the scores turns the ascending argsort into a descending ranking.
        return np.argsort(-logits, axis=1)[:, : self.topK]

    def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", verbose=True, silent=False, **kwargs):
        """
        Evaluates the model on eval_df.

        Utility function to be used by the eval_model() method. Not intended to be used directly.

        Args:
            eval_df: DataFrame with the evaluation data, or a str that is handed to LazyClassificationDataset.
            output_dir: Directory in which eval_results.txt is written (created if missing).
            multi_label: Accepted for signature compatibility; not used here.
            prefix: Accepted for signature compatibility; not used here.
            verbose: Forwarded to load_and_cache_examples.
            silent: Disables the progress bar and is forwarded to load_and_cache_examples.
            **kwargs: Extra metrics, forwarded to compute_metrics.

        Returns:
            results: Dictionary with the computed metrics plus "eval_loss".
            model_outputs: Raw model scores for every evaluated example.
            wrong: Examples whose best-ranked prediction differs from the label.
        """

        model = self.model
        args = self.args
        eval_output_dir = output_dir

        results = {}
        if isinstance(eval_df, str):
            # LazyClassificationDataset is not imported at the top of the notebook,
            # so this branch used to fail with NameError.
            # NOTE(review): import path taken from the simpletransformers
            # classification package - confirm against the installed version.
            from simpletransformers.classification.classification_utils import LazyClassificationDataset

            eval_dataset = LazyClassificationDataset(eval_df, self.tokenizer, self.args)
            eval_examples = None
        else:
            if "text" in eval_df.columns and "labels" in eval_df.columns:
                eval_examples = [
                    InputExample(i, text, None, label)
                    for i, (text, label) in enumerate(zip(eval_df["text"].astype(str), eval_df["labels"]))
                ]
            elif "text_a" in eval_df.columns and "text_b" in eval_df.columns:
                eval_examples = [
                    InputExample(i, text_a, text_b, label)
                    for i, (text_a, text_b, label) in enumerate(
                        zip(eval_df["text_a"].astype(str), eval_df["text_b"].astype(str), eval_df["labels"])
                    )
                ]
            else:
                warnings.warn(
                    "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
                )
                eval_examples = [
                    InputExample(i, text, None, label)
                    for i, (text, label) in enumerate(zip(eval_df.iloc[:, 0], eval_df.iloc[:, 1]))
                ]

            eval_dataset = self.load_and_cache_examples(
                eval_examples, evaluate=True, verbose=verbose, silent=silent
            )
        os.makedirs(eval_output_dir, exist_ok=True)

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"])

        eval_loss = 0.0
        nb_eval_steps = 0
        # Per-batch arrays are collected and concatenated once after the loop;
        # growing an array with np.append copies all previous batches every step.
        logit_batches = []
        label_batches = []
        model.eval()

        for batch in tqdm(eval_dataloader, disable=args["silent"] or silent):
            with torch.no_grad():
                inputs = self._get_inputs_dict(batch)

                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()

            nb_eval_steps += 1
            logit_batches.append(logits.detach().cpu().numpy())
            label_batches.append(inputs["labels"].detach().cpu().numpy())

        eval_loss = eval_loss / nb_eval_steps

        model_outputs = np.concatenate(logit_batches, axis=0)
        out_label_ids = np.concatenate(label_batches, axis=0)

        preds = self._rank_predictions(model_outputs)

        result, wrong = self.compute_metrics(preds, out_label_ids, eval_examples, **kwargs)
        result["eval_loss"] = eval_loss
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))

        return results, model_outputs, wrong

    def compute_metrics(self, preds, labels, eval_examples=None, multi_label=False, **kwargs):
        """
        Computes the evaluation metrics for the model predictions.

        Args:
            preds: Model predictions: a (n_samples, topK) array of ranked class indices, or a 1-D array of class indices.
            labels: Ground truth labels
            eval_examples: List of examples on which evaluation was performed
            multi_label: Accepted for signature compatibility; not used here.
            **kwargs: Additional metrics that should be used. Pass in the metrics as keyword arguments (name of metric: function to use). E.g. f1=sklearn.metrics.f1_score.
                        A metric function should take in two parameters. The first parameter will be the true labels, and the second parameter will be the predictions (passed exactly as received here).

        Returns:
            result: Dictionary containing "mcc" (Matthews correlation coefficient of the best-ranked prediction) and one entry per additional metric.
            wrong: List of InputExample objects whose best-ranked prediction is incorrect, or ["NA"] when no examples are given.
        """  # noqa: ignore flake8"

        assert len(preds) == len(labels)

        extra_metrics = {}
        for metric, func in kwargs.items():
            extra_metrics[metric] = func(labels, preds)

        # Predictions are 1-D when topK == 1; otherwise the first column is the best-ranked class.
        top1 = np.asarray(preds)
        if top1.ndim > 1:
            top1 = top1[:, 0]

        mismatched = labels != top1

        if eval_examples:
            wrong = [i for (i, v) in zip(eval_examples, mismatched) if v.any()]
        else:
            wrong = ["NA"]

        mcc = matthews_corrcoef(labels, top1)

        return {**{"mcc": mcc}, **extra_metrics}, wrong

    def predict(self, to_predict, multi_label=False):
        """
        Performs predictions on a list of text.

        Args:
            to_predict: A python list of text (str) to be sent to the model for prediction. An element may also be a list of two texts (sentence pair).
            multi_label: Forwarded to load_and_cache_examples.

        Returns:
            preds: Ranked class indices per text with shape (n_texts, topK), best first; a 1-D array of class indices when topK == 1.
            model_outputs: The raw model outputs for each text.
            all_embedding_outputs, all_layer_hidden_states: Returned in addition only when config.output_hidden_states is set.
        """

        model = self.model
        args = self.args

        self._move_model_to_device()

        # Labels are unknown at prediction time, so every example gets the placeholder label 0.
        if isinstance(to_predict[0], list):
            eval_examples = [InputExample(i, text[0], text[1], 0) for i, text in enumerate(to_predict)]
        else:
            eval_examples = [InputExample(i, text, None, 0) for i, text in enumerate(to_predict)]

        eval_dataset = self.load_and_cache_examples(
            eval_examples, evaluate=True, multi_label=multi_label, no_cache=True
        )

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"])

        return_hidden_states = self.config.output_hidden_states

        # Per-batch arrays are concatenated once after the loop instead of
        # growing arrays with np.append on every step.
        logit_batches = []
        embedding_batches = []
        hidden_state_batches = []

        model.eval()
        for batch in tqdm(eval_dataloader, disable=args["silent"]):
            with torch.no_grad():
                inputs = self._get_inputs_dict(batch)
                outputs = model(**inputs)

            # The first output is the loss against the placeholder labels; it carries no information.
            _, logits = outputs[:2]
            logit_batches.append(logits.detach().cpu().numpy())

            if return_hidden_states:
                embedding_outputs, layer_hidden_states = outputs[2][0], outputs[2][1:]
                embedding_batches.append(embedding_outputs.detach().cpu().numpy())
                hidden_state_batches.append(
                    np.array([state.detach().cpu().numpy() for state in layer_hidden_states])
                )

        model_outputs = np.concatenate(logit_batches, axis=0)
        preds = self._rank_predictions(model_outputs)

        if return_hidden_states:
            all_embedding_outputs = np.concatenate(embedding_batches, axis=0)
            # Axis 0 of each stacked array indexes the layer, so batches are joined along axis 1.
            all_layer_hidden_states = np.concatenate(hidden_state_batches, axis=1)
            return preds, model_outputs, all_embedding_outputs, all_layer_hidden_states
        else:
            return preds, model_outputs

In [0]:
from sklearn.metrics import f1_score, accuracy_score

def acc_mrr_multiclass(labels, preds):
    """Mean reciprocal rank of the true label within the TOP_K ranked predictions.

    The accuracy of the predictions found at each rank position is divided by
    the one-based rank and the quotients are added up.

    Args:
        labels: true class ids, one per sample.
        preds: 2-D array of ranked class ids with at least TOP_K columns.

    Returns:
        The accumulated score (0 when TOP_K is 0).
    """
    reciprocal_rank_total = 0
    for position in range(TOP_K):
        hit_rate = accuracy_score(labels, preds[:, position])
        reciprocal_rank_total += hit_rate / (position + 1)

    return reciprocal_rank_total

In [0]:
def acc_topK(labels, preds):
    """Accuracy of the predictions at each of the TOP_K rank positions.

    Args:
        labels: true class ids, one per sample.
        preds: 2-D array of ranked class ids with at least TOP_K columns.

    Returns:
        Dict mapping the zero-based rank position to the accuracy of the
        predictions stored in that column.
    """
    return {position: accuracy_score(labels, preds[:, position]) for position in range(TOP_K)}

# File Helpers

In [0]:
import os
import tarfile
import shutil

def save_model(model_path='',file_name=''):
  """Pack the files directly inside model_path into '<file_name>.tar.gz'.

  Sub-directories of model_path are not included. Each file is added under
  the path obtained by joining model_path and the file name.

  Args:
    model_path: directory holding the model files.
    file_name: archive path without the '.tar.gz' extension.

  Raises:
    FileNotFoundError: if model_path cannot be listed as a directory.
  """
  # os.walk yields the top directory first; only that entry is needed, so the
  # rest of the tree is never traversed.
  top_level = next(os.walk(model_path), None)
  if top_level is None:
    raise FileNotFoundError(f'Model directory not found or not readable: {model_path!r}')
  _, _, file_names = top_level

  with tarfile.open(file_name + '.tar.gz', 'w:gz') as archive:
    for name in file_names:
      archive.add(os.path.join(model_path, name))

In [0]:
def save_copy_model(model_path, file_name):
  """Archive model_path and place the archive in the saved-models folder under BASE_PATH.

  save_model writes '<file_name>.tar.gz'; that file is copied to
  BASE_PATH + SAVED_MODELS_SUBFOLDER and the original archive is then removed.

  Args:
    model_path: directory holding the model files.
    file_name: archive name without the '.tar.gz' extension.
  """
  save_model(model_path, file_name)

  archive_name = file_name + '.tar.gz'
  destination = ''.join([BASE_PATH, SAVED_MODELS_SUBFOLDER, archive_name])

  shutil.copy(archive_name, destination)
  os.remove(archive_name)

In [0]:
def cleanup_training_checkpoints(model_path, leave_best_model = False):
  """Delete everything inside model_path, optionally keeping its 'best' sub-directory.

  model_path itself is left in place. Files directly inside model_path are
  always removed. A model_path that does not exist is treated as already clean.

  Args:
    model_path: directory whose contents are removed.
    leave_best_model: when True, the sub-directory named 'best' is preserved.
  """
  # os.walk yields the top directory first; only that entry is needed. The
  # default covers a missing directory, for which os.walk yields nothing.
  root, dir_names, file_names = next(os.walk(model_path), (model_path, [], []))

  for dir_name in dir_names:
    if leave_best_model and dir_name == 'best':
      continue
    shutil.rmtree(os.path.join(root, dir_name))

  for file_name in file_names:
    os.remove(os.path.join(root, file_name))

In [0]:
def unpack_model(model_name=''):
  """Extract '<model_name>.tar.gz' into the current working directory.

  Args:
    model_name: archive path without the '.tar.gz' extension.
  """
  # The context manager closes the archive even when extraction fails.
  with tarfile.open(f"{model_name}.tar.gz", "r:gz") as tar:
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only unpack archives produced by this notebook.
    tar.extractall()

# Hyperparameter Selection

In [0]:
# Search grid: every batch size is combined with every learning rate.
BATCH_SIZES = [8, 16, 32, 64, 128]
# Passed to the model as "max_seq_length".
MAX_SEQ_LENGTH = 64
# Upper bound on epochs per run; early stopping is enabled in the search and may end a run sooner.
MAX_NUM_TRAIN_EPOCHS = 10
LEARNING_RATES = [3e-4, 1e-4, 5e-5, 3e-5] 

In [34]:
results = []
best_result = { }

# Every run writes its best checkpoint to the shared BEST_MODEL_DIR, so a later
# run overwrites the checkpoint of an earlier, better one. The checkpoint of
# the best-scoring run so far is therefore parked in this separate folder.
OVERALL_BEST_DIR = OUTPUT_DIR + 'overall_best'

for BATCH_SIZE in BATCH_SIZES:
  for LEARNING_RATE in LEARNING_RATES:
    CURRENT_OUTPUT_DIR = OUTPUT_DIR + str(BATCH_SIZE) + '_' + str(LEARNING_RATE)
    print(CURRENT_OUTPUT_DIR)
    # define hyperparameter
    train_args ={"output_dir": CURRENT_OUTPUT_DIR,
                "reprocess_input_data": True,
                "overwrite_output_dir": True,
                "fp16":False,
                "max_seq_length": MAX_SEQ_LENGTH,
                "train_batch_size": BATCH_SIZE,
                "eval_batch_size": BATCH_SIZE,
                "learning_rate":  LEARNING_RATE,
                "num_train_epochs": MAX_NUM_TRAIN_EPOCHS,
                "tensorboard_dir": TENSORBOARD_DIR,
                "save_model_every_epoch": True,
                "save_steps": 0,
                "evaluate_during_training": True,
                "evaluate_during_training_verbose": True,
                "evaluate_during_training_silent": False,
                "early_stopping_metric_minimize": False,
                "early_stopping_metric": 'accmrr',
                "early_stopping_patience": 3,
                "early_stopping_delta": 0.005,
                "use_early_stopping": True,
                "evaluate_during_training_steps": 0,
                "early_stopping_consider_epochs": True,
                "best_model_dir": BEST_MODEL_DIR}

    # Create a ClassificationModel
    model = TopKClassificationModel(
        "bert", ORIGINAL_MODEL,
        topK=TOP_K,
        num_labels=len(class_list),
        args=train_args,
        use_cuda = True
    )

    # train
    print("Start training for batch size {batch}, learning rate {lr}".format(batch = BATCH_SIZE, lr = LEARNING_RATE))
    model.train_model(train_df, eval_df = dev_df, verbose=True, acctopK=acc_topK, accmrr=acc_mrr_multiclass)

    # evaluate on dev set
    # NOTE(review): this scores the model as it is in memory after training,
    # which presumably is the last-epoch state rather than the checkpoint
    # stored in BEST_MODEL_DIR - confirm this is the intended comparison.
    print("Start evaluation for batch size {batch}, learning rate {lr}".format(batch = BATCH_SIZE, lr = LEARNING_RATE))
    result, model_outputs, wrong_predictions = model.eval_model(dev_df, acctopK=acc_topK, accmrr=acc_mrr_multiclass)

    # record eval result
    full_result = {
                    'params': {
                        'batch_size': BATCH_SIZE,
                        'learning_rate': LEARNING_RATE,
                        'epochs': "best" #(i+1)
                        },
                    'result': result
                    }

    results.append(full_result)
    print("Evaluation result for batch size {batch}, learning rate {lr}".format(batch = BATCH_SIZE, lr = LEARNING_RATE))
    print(full_result)

    # if best result - keep this run's best checkpoint before the next run overwrites it
    if not('result' in best_result.keys()) or (best_result['result']['accmrr'] < result['accmrr']):
      best_result = full_result
      shutil.rmtree(OVERALL_BEST_DIR, ignore_errors=True)
      shutil.copytree(BEST_MODEL_DIR, OVERALL_BEST_DIR)

    shutil.copy(CURRENT_OUTPUT_DIR + DIR_SEPARATOR + 'training_progress_scores.csv', BASE_PATH + SAVED_MODELS_SUBFOLDER + 'training_progress_scores_' + str(BATCH_SIZE) + '_' + str(LEARNING_RATE) + '.csv')
    cleanup_training_checkpoints(CURRENT_OUTPUT_DIR)

if best_result:
  # Put the winning checkpoint back into BEST_MODEL_DIR and name the archive
  # after the winning parameters, not after the last grid point of the loop.
  shutil.rmtree(BEST_MODEL_DIR, ignore_errors=True)
  shutil.move(OVERALL_BEST_DIR, BEST_MODEL_DIR)
  best_params = best_result['params']
  save_copy_model(BEST_MODEL_DIR, ORIGINAL_MODEL_FILE_NAME + '-' + str(best_params['batch_size']) + '_' + str(best_params['learning_rate']) + '_best')

# leave the best model
cleanup_training_checkpoints(OUTPUT_DIR, True)

\content\multilingual_bert\16_5e-05


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…


Start training for batch size 16, learning rate 5e-05


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=61274.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 7.641952



Running loss: 4.821805




HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 1.059154


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 1.506028


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.627128


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.227945


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.261877


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.499431


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.181845


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.047893


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.004604


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))



Start evaluation for batch size 16, learning rate 5e-05


HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=420.0), HTML(value='')))


Evaluation result for batch size 16, learning rate 5e-05
{'params': {'batch_size': 16, 'learning_rate': 5e-05, 'epochs': 'best'}, 'result': {'mcc': 0.7955128862422453, 'acctopK': {0: 0.7967818831942789, 1: 0.08596543504171633, 2: 0.018623361144219308, 3: 0.010131108462455305, 4: 0.00432061978545888}, 'accmrr': 0.8493692888359158, 'eval_loss': 1.0520231640055067}}


In [35]:
import json
import time

# The current timestamp is part of the file name, so each run gets its own results file.
RESULTS_FILE_PATH = '{}results_hyperparameters_{}.txt'.format(BASE_PATH, str(time.time()))

# Persist the per-configuration results of the search as JSON.
with open(RESULTS_FILE_PATH, 'w') as results_file:
    json.dump(results, results_file)

# Echo the complete list first, then the best-scoring entry.
for heading, payload in (('Full results:', results), ('Best result:', best_result)):
    print(heading)
    print(payload)

Full results:
[{'params': {'batch_size': 16, 'learning_rate': 5e-05, 'epochs': 'best'}, 'result': {'mcc': 0.7955128862422453, 'acctopK': {0: 0.7967818831942789, 1: 0.08596543504171633, 2: 0.018623361144219308, 3: 0.010131108462455305, 4: 0.00432061978545888}, 'accmrr': 0.8493692888359158, 'eval_loss': 1.0520231640055067}}]
Best result:
{'params': {'batch_size': 16, 'learning_rate': 5e-05, 'epochs': 'best'}, 'result': {'mcc': 0.7955128862422453, 'acctopK': {0: 0.7967818831942789, 1: 0.08596543504171633, 2: 0.018623361144219308, 3: 0.010131108462455305, 4: 0.00432061978545888}, 'accmrr': 0.8493692888359158, 'eval_loss': 1.0520231640055067}}


# Load BERT Model

In [0]:
# When False, the model is created from the checkpoint in BEST_MODEL_DIR instead of ORIGINAL_MODEL.
FIRST_TIME_TRAINING = False
BATCH_SIZE = 16
MAX_SEQ_LENGTH = 64

# Learning rate and number of epochs, set per BERT variant.
LEARNING_RATE, NUM_TRAINING_EPOCHS = (5e-5, 7) if BERT_MODEL == "multilingual" else (0.0001, 8)

In [0]:
# define hyperparameter
# Same option set as in the hyperparameter search, but with the fixed batch
# size, learning rate and epoch count chosen above and OUTPUT_DIR as output folder.
train_args ={"output_dir": OUTPUT_DIR,
             "reprocess_input_data": True,
             "overwrite_output_dir": True,
             "fp16":False,
             "max_seq_length": MAX_SEQ_LENGTH,
             "train_batch_size": BATCH_SIZE,
             "eval_batch_size": BATCH_SIZE,
             "learning_rate":  LEARNING_RATE,
             "num_train_epochs": NUM_TRAINING_EPOCHS,
             "tensorboard_dir": TENSORBOARD_DIR,
             "save_model_every_epoch": True,
             "save_steps": 0,
             "evaluate_during_training": True,
             "evaluate_during_training_verbose": True,
             "evaluate_during_training_silent": False,
             # Early stopping watches 'accmrr', the name under which
             # acc_mrr_multiclass is passed to train_model; higher is better.
             "early_stopping_metric_minimize": False,
             "early_stopping_metric": 'accmrr',
             "early_stopping_patience": 3,
             "early_stopping_delta": 0.005,
             "use_early_stopping": True,
             "evaluate_during_training_steps": 0,
             "early_stopping_consider_epochs": True,
             "best_model_dir": BEST_MODEL_DIR}

# Create a ClassificationModel
# Starts from the pretrained ORIGINAL_MODEL on a first run, otherwise from the
# checkpoint stored in BEST_MODEL_DIR.
model = TopKClassificationModel(
    "bert", ORIGINAL_MODEL if FIRST_TIME_TRAINING else BEST_MODEL_DIR,
    topK=TOP_K,
    num_labels=len(class_list),
    args=train_args,
    use_cuda = True
)

# Train & Evaluate

In [0]:
# Train the model on train_df with dev_df as evaluation set. The keyword
# arguments pass the extra metrics under the names 'acctopK' and 'accmrr';
# 'accmrr' is the metric named in train_args for early stopping.
model.train_model(train_df, eval_df = dev_df, verbose=True, acctopK=acc_topK, accmrr=acc_mrr_multiclass)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=61274.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 7.584064



Running loss: 5.253979



Running loss: 5.034223


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 2.596781


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 3.824692


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 1.152680


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.823020


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 1.110886


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.119487


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.736435


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.515341


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3830.0, style=ProgressStyle(descr…

Running loss: 0.429131



In [0]:
# Score the in-memory model on the dev set with the top-K accuracy and MRR metrics.
result, model_outputs, wrong_predictions = model.eval_model(dev_df, acctopK=acc_topK, accmrr=acc_mrr_multiclass)

print("Evaluation results on dev set for final epoch:")
result



HBox(children=(FloatProgress(value=0.0, max=6712.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=839.0), HTML(value='')))


Evalution results on dev set for final epoch:


{'acc': {'acc_topK': {0: 0.7750297973778307,
   1: 0.08820023837902265,
   2: 0.021901072705601907,
   3: 0.009535160905840286,
   4: 0.008492252681764005},
  'mrr': 0.8305125148986888},
 'eval_loss': 1.1447045614807483,
 'mcc': 0.7736057929503211}

In [38]:
# Score the in-memory model on the held-out test set with the top-K accuracy and MRR metrics.
result, model_outputs, wrong_predictions = model.eval_model(test_df, acctopK=acc_topK, accmrr=acc_mrr_multiclass)
print("Evaluation results on test set for final epoch:")
result



HBox(children=(FloatProgress(value=0.0, max=7486.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=468.0), HTML(value='')))


Evalution results on test set for final epoch:


{'accmrr': 0.8417802119511978,
 'acctopK': {0: 0.7902751803366284,
  1: 0.08161902217472615,
  2: 0.02070531659096981,
  3: 0.010686615014694095,
  4: 0.0056104728827144},
 'eval_loss': 1.0953120623643582,
 'mcc': 0.7889498072914133}

In [0]:
# save the trained model: archive the contents of BEST_MODEL_DIR into the saved-models folder under BASE_PATH
save_copy_model(BEST_MODEL_DIR, ORIGINAL_MODEL_FILE_NAME + '-' + str(BATCH_SIZE) + '_' + str(LEARNING_RATE) + '_epoch-best')
# NOTE(review): called without leave_best_model, so the 'best' folder under
# OUTPUT_DIR is deleted as well; only the archive saved above remains.
cleanup_training_checkpoints(OUTPUT_DIR)

# Predict

In [43]:
pred_sentences = [
  "Акроцефалополисиндактилия",
  "захарен диабет тип 2",
  "хипотироидизъм",
  "oстра дихателна недостатъчност",
  "конвулсивни припадъци"
]

# One predict call for the whole list: the dataset and data loader are built
# once instead of once per sentence. Rows of `predictions` follow the order of
# pred_sentences.
predictions, raw_outputs = model.predict(pred_sentences)

# Print the ranked codes of each sentence, best first.
for ranked_classes in predictions:
  for prediction in ranked_classes:
    print(class_list[prediction])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Q87
Q18
Q74
Q75
F71


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


E11
E10
E29
E13
H36


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


E03
G31
E00
N25
E07


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


J96
F09
F43
F42
P28


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


G40
F63
N89
F80
R47
