In [None]:
# install simpletransformers
!pip install simpletransformers 

# check installed version
!pip freeze | grep simpletransformers

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas import DataFrame
import numpy as np

In [None]:
# Directory (on the mounted Google Drive) holding the diseases_*.csv files.
# The trailing slash matters: file names are appended to it directly.
BASE_PATH = "/content/drive/My Drive/DiagnosisToCode/data/"

# Set to False when the notebook is not running on Google Colab; this skips
# the Drive mount step.
USE_COLAB = True

In [None]:
# Mount Google Drive at /content/drive, but only when the Colab flag is set.
if USE_COLAB:
    # Imported inside the guard so the notebook does not need the
    # google.colab package when USE_COLAB is False.
    from google.colab import drive
    # force_remount=True requests a fresh mount even if one already exists.
    drive.mount('/content/drive', force_remount=True)

In [None]:
# Full paths of the three data splits, built by appending each file name to
# BASE_PATH (which already ends with a slash).
train_file = f'{BASE_PATH}diseases_train.csv'
test_file = f'{BASE_PATH}diseases_test.csv'
dev_file = f'{BASE_PATH}diseases_cross_validation.csv'

In [None]:
# Load the training split for inspection. header=None makes pandas treat the
# first line as data and assign integer column labels.
train_df = pd.read_csv(train_file, header=None)
# Preview the first rows as the cell output.
train_df.head()

In [None]:
# (rows, columns) of the training split.
train_df.shape

In [None]:
# Load the test split for inspection. header=None makes pandas treat the
# first line as data and assign integer column labels.
test_df = pd.read_csv(test_file, header=None)
# Preview the first rows as the cell output.
test_df.head()

In [None]:
# (rows, columns) of the test split.
test_df.shape

In [None]:
# Load the cross-validation (dev) split for inspection.
# header=None matches how the train and test splits are read in this notebook;
# without it pandas would consume the first data row as column names, so the
# preview and the row count would each be off by one record.
# NOTE(review): assumes the dev CSV is headerless like its siblings — confirm.
dev_df = pd.read_csv(dev_file, header=None)
# Preview the first rows as the cell output.
dev_df.head()

In [None]:
# (rows, columns) of the cross-validation (dev) split.
dev_df.shape

In [None]:
import logging

from simpletransformers.language_modeling import (
    LanguageModelingModel
)

# Keep the "transformers" logger at WARNING and above, while the root logger
# is configured for INFO-level messages.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

# Google Drive locations for this run: the root folder of the SlavicBert
# experiment, the checkpoint output folder, the folder for the best checkpoint
# and the feature cache.
slavic_bert_dir = "/content/drive/My Drive/DiagnosisToCode/SlavicBert"
output_dir = slavic_bert_dir + "/output"
best_model_dir = output_dir + "/best_model"
cache_dir = slavic_bert_dir + "/cache"

# Options passed to LanguageModelingModel via `args=`.
model_args = {
    # --- where results are written -------------------------------------
    "output_dir": output_dir,
    "overwrite_output_dir": True,  # was listed twice; one entry is enough
    "best_model_dir": best_model_dir,
    "cache_dir": cache_dir,
    "reprocess_input_data": True,
    "show_running_loss": True,
    # --- data handling / hardware --------------------------------------
    "sliding_window": True,
    "fp16": False,
    "use_cuda": True,
    "dataset_type": "simple",
    # simpletransformers has separate options for the training and the
    # evaluation batch size; a plain "batch_size" key is not one of its
    # options, so the intended value of 8 is spelled out for both.
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 8,
    # --- checkpointing -------------------------------------------------
    "save_model_every_epoch": True,
    "save_steps": 0,
    # --- evaluation during training and early stopping ------------------
    "evaluate_during_training": True,
    "evaluate_during_training_verbose": True,
    "evaluate_during_training_silent": False,
    "early_stopping_metric_minimize": True,  # lower perplexity is better
    "early_stopping_metric": 'perplexity',
    "early_stopping_patience": 3,
    "early_stopping_delta": 0.01,
    "use_early_stopping": True,
    "evaluate_during_training_steps": 0,
    "early_stopping_consider_epochs": True
}

# Build a BERT-type language model, loading its weights from best_model_dir
# and applying the options in model_args.
# NOTE(review): best_model_dir is also where model_args tells training to
# store the best checkpoint, so this presumably continues from an earlier
# run — confirm the directory is already populated before a first run.
model = LanguageModelingModel(
    "bert", best_model_dir, args=model_args, use_cuda=True
)

# Train on the training file, using the cross-validation file as the
# evaluation set during training.
model.train_model(train_file=train_file, eval_file=dev_file, show_running_loss=True, verbose=True)

In [None]:
# Evaluate the trained model on the test file and keep what eval_model
# returns in `result`.
result = model.eval_model(eval_file=test_file)

In [None]:
# Display the evaluation result as the cell output.
result