In [None]:
import numpy as np
import time
import pandas as pd
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax
from simpletransformers.classification import ClassificationModel
import sklearn
from sklearn.metrics import log_loss, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
import re
import random
import torch
# Let pandas show up to 200 characters per column value when displaying frames
pd.set_option('display.max_colwidth', 200)

# Single seed value reused by every random number generator in this notebook
SEED = 2020

# Ask cuDNN to use deterministic algorithms
torch.backends.cudnn.deterministic = True

# Seed PyTorch
torch.manual_seed(SEED)

# Use the same seed everywhere so that the model is reproducible

def seed_all(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch. When
    CUDA is available, the CUDA generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking turned off.

    Parameters
    ----------
    seed_value : int
        Seed passed unchanged to each generator.
    """
    # CPU-side generators: Python, NumPy, PyTorch (in that order)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    # Nothing GPU-related to configure without CUDA
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Apply the notebook-wide seed to all generators handled by seed_all
seed_all(SEED)

In [None]:
# Path of the results file that later cells open in append mode and write
# comma-separated metric rows to. Left empty here; set it before running.
output_path = ''

In [None]:
# NOTE(review): read_csv is called without a path in this copy of the notebook;
# supply the training CSV location before running this cell.
train = pd.read_csv()
print(train.head())

# Hold out 20% of the rows for validation. The seed itself is passed as
# random_state: `random.seed(SEED)` returns None, which would leave the split
# dependent on NumPy's global RNG state instead of being pinned to SEED.
train_data, valid_data = train_test_split(train, test_size=0.2, random_state=SEED)

In [None]:
# Write the CSV header for the results file. Append mode keeps rows from
# earlier runs; the position check writes the header only when the file is
# empty, so re-running this cell does not insert a second header line.
# The `with` block closes the file even if the write fails.
with open(output_path, 'a') as f:
    if f.tell() == 0:
        f.write('Model,Loss,Accuracy,F1,Precision,Recall,Time\n')

More models: https://simpletransformers.ai/docs/classification-specifics/

In [None]:
def _evaluate_split(model, data):
    """Evaluate ``model`` on ``data`` and return the metrics for one results row.

    Parameters
    ----------
    model : ClassificationModel
        Trained two-label classifier (``num_labels=2``).
    data : pandas.DataFrame
        Frame passed to ``model.eval_model``; its ``labels`` column is used as
        the ground truth (assumed to hold 0/1 class ids -- confirm against the
        training CSV).

    Returns
    -------
    list
        ``[loss, balanced_accuracy, macro_f1, macro_precision, macro_recall]``,
        in the column order of the results file header.
    """
    raw_outputs = model.eval_model(data)[1]
    # Softmax in float64 so each row of probabilities sums to 1 as closely as possible
    probabilities = softmax(np.asarray(raw_outputs, dtype=np.float64), axis=1)
    real = data['labels']
    # argmax resolves ties to class 0, the same as testing `p[1] > p[0]`
    max_predict = probabilities.argmax(axis=1)

    return [
        # Log loss is defined on predicted probabilities, not on hard 0/1 labels
        log_loss(real, probabilities, labels=[0, 1]),
        # scikit-learn metrics take (y_true, y_pred); balanced accuracy is not symmetric
        balanced_accuracy_score(real, max_predict),
        f1_score(real, max_predict, average="macro"),
        precision_score(real, max_predict, average="macro"),
        recall_score(real, max_predict, average="macro"),
    ]


def _append_result_row(path, row_name, metrics, elapsed):
    """Append one ``row_name,<metrics...>,elapsed`` line to the results file at ``path``."""
    fields = [row_name] + [str(value) for value in list(metrics) + [elapsed]]
    # Open and close per row so a failure later in the loop cannot lose this line
    with open(path, 'a') as f:
        f.write(','.join(fields) + '\n')


for (model_name, tokenizer_name) in [('roberta', 'roberta-base'), ('xlmroberta', 'xlm-roberta-base'), ('bert', 'bert-base-cased')]:
    # Drop the reference to the previous iteration's model before building the
    # next one, so its GPU memory can be reclaimed instead of both coexisting.
    model = None
    gc.collect()
    torch.cuda.empty_cache()

    start = time.time()

    # NOTE(review): the early-stopping settings below are presumably inert unless
    # 'evaluate_during_training' is enabled and an eval_df is passed to
    # train_model -- confirm against the simpletransformers documentation.
    model = ClassificationModel(model_name,
                                tokenizer_name,
                                use_cuda=True,
                                num_labels=2,
                                args={
                                    'reprocess_input_data': True,
                                    'train_batch_size': 64,
                                    'use_early_stopping': True,
                                    'early_stopping_delta': 0.01,
                                    'early_stopping_metric': "eval_loss",
                                    'early_stopping_metric_minimize': True,
                                    'early_stopping_patience': 5,
                                    'overwrite_output_dir': True,
                                    'fp16': False,
                                    'do_lower_case': False,
                                    'num_train_epochs': 50,
                                    'regression': False,
                                    'manual_seed': SEED,
                                    "learning_rate": 2e-5,
                                    'weight_decay': 0,
                                    "save_eval_checkpoints": False,
                                    "save_model_every_epoch": False,
                                    "silent": False
                                })
    model.train_model(train_data)
    model.save_model(output_dir=model_name)

    # Held-out split: logged as the 'Test_' row
    valid_metrics = _evaluate_split(model, valid_data)

    # Elapsed time covers training plus the validation evaluation; the same
    # value is reported on both rows for this model.
    end = time.time()
    elapsed = end - start
    _append_result_row(output_path, 'Test_' + str(model_name), valid_metrics, elapsed)

    # Training split: logged as the 'Train_' row, for comparison with the held-out scores
    train_metrics = _evaluate_split(model, train_data)
    _append_result_row(output_path, 'Train_' + str(model_name), train_metrics, elapsed)