In [1]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from load_data import initialize_data
from reading_datasets import read_task
from labels_to_ids import labels_to_ids_mal
import time
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
def train(epoch, train_loader, model, optimizer, device, grad_step = 1, max_grad_norm = 10):
    """Run one training epoch over ``train_loader`` and return the model.

    Args:
        epoch: Index of the current epoch. Not used by the function itself;
            kept so existing callers keep working.
        train_loader: Iterable of batches. Each batch is indexed with the keys
            'input_ids', 'attention_mask' and 'labels', and each value must
            support ``.to(device, dtype=torch.long)``.
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes the loss as ``output['loss']`` and the logits
            as ``output[1]``; must also provide ``num_labels``.
        optimizer: Optimizer that owns ``model``'s parameters.
        device: Device the batch tensors are moved to.
        grad_step: Number of batches whose gradients are accumulated before
            each optimizer step.
        max_grad_norm: Maximum total gradient norm applied before each step.

    Returns:
        The same ``model`` object, updated in place.
    """
    tr_loss, tr_accuracy = 0.0, 0.0
    nb_tr_steps = 0
    pending_batches = 0  # backward passes accumulated since the last optimizer step

    def _clip_and_step():
        # Clipping must see the freshly accumulated gradients, so it runs after
        # backward() and immediately before the optimizer consumes them.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )
        optimizer.step()
        optimizer.zero_grad()

    # put model in training mode
    model.train()
    optimizer.zero_grad()

    for idx, batch in enumerate(train_loader):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        if (idx + 1) % 20 == 0:
            print('FINSIHED BATCH:', idx, 'of', len(train_loader))

        output = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = output['loss']

        # .item() detaches the value; summing the tensor itself would keep every
        # batch's autograd graph alive until the end of the epoch.
        tr_loss += loss.item()
        nb_tr_steps += 1

        # Running accuracy on positions whose label is not the ignore index -100.
        with torch.no_grad():
            flattened_targets = labels.view(-1)
            active_logits = output[1].view(-1, model.num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)
            active_positions = flattened_targets != -100
            n_active = int(active_positions.sum())
            if n_active:
                n_correct = (flattened_predictions[active_positions]
                             == flattened_targets[active_positions]).sum().item()
                tr_accuracy += n_correct / n_active

        # backward pass; dividing by grad_step makes the accumulated gradient the
        # mean over the accumulated batches (a no-op for grad_step == 1).
        (loss / grad_step).backward()
        pending_batches += 1

        if (idx + 1) % grad_step == 0:
            _clip_and_step()
            pending_batches = 0

    # Apply gradients left over when the number of batches is not a multiple of
    # grad_step; otherwise the next epoch's zero_grad() would discard them.
    if pending_batches:
        _clip_and_step()

    if nb_tr_steps:
        epoch_loss = tr_loss / nb_tr_steps
        tr_accuracy = tr_accuracy / nb_tr_steps
        #print(f"Training loss epoch: {epoch_loss}")
        #print(f"Training accuracy epoch: {tr_accuracy}")

    return model

In [3]:
def testing(model, testing_loader, labels_to_ids, device):
    """Evaluate ``model`` on ``testing_loader`` without updating its weights.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes the loss as ``output['loss']`` and the logits
            as ``output[1]``; must also provide ``num_labels``.
        testing_loader: Iterable of batches indexed with the keys 'input_ids',
            'attention_mask' and 'labels'.
        labels_to_ids: Mapping from label name to integer id; it is inverted
            here to turn ids back into label names.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(labels, predictions, accuracy, precision, recall, f1)`` where
        ``labels`` and ``predictions`` are lists of label names over all
        evaluated positions, and the four scores are the mean of the
        per-batch scores (precision/recall/F1 are macro-averaged within each
        batch). NOTE: a mean of per-batch macro scores is generally not equal
        to the score computed once over the whole set and depends on how the
        examples are split into batches.

    Raises:
        ZeroDivisionError: If ``testing_loader`` yields no batches.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    eval_precision, eval_recall = 0, 0
    eval_f1score = 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    ids_to_labels = {label_id: label for label, label_id in labels_to_ids.items()}

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            output = model(input_ids=ids, attention_mask=mask, labels=labels)

            eval_loss += output['loss'].item()
            nb_eval_steps += 1

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            flattened_targets = labels.view(-1)  # 1-D view of all label positions
            active_logits = output[1].view(-1, model.num_labels)  # (positions, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)

            # only score positions whose label is not the ignore index -100
            active_positions = flattened_targets != -100  # 1-D boolean mask

            # Move each batch to the CPU exactly once and reuse the arrays for
            # every metric, instead of converting the same tensors repeatedly.
            batch_labels = torch.masked_select(flattened_targets, active_positions).cpu().numpy()
            batch_preds = torch.masked_select(flattened_predictions, active_positions).cpu().numpy()

            # Keep plain Python ints rather than one 0-d device tensor per
            # example, which would need a separate .item() call each later on.
            eval_labels.extend(batch_labels.tolist())
            eval_preds.extend(batch_preds.tolist())

            eval_accuracy += accuracy_score(batch_labels, batch_preds)
            eval_precision += precision_score(batch_labels, batch_preds, average = 'macro', zero_division=0)
            eval_recall += recall_score(batch_labels, batch_preds, average = 'macro', zero_division=0)
            eval_f1score += f1_score(batch_labels, batch_preds, average='macro', zero_division=0)

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    eval_precision = eval_precision / nb_eval_steps
    eval_recall = eval_recall / nb_eval_steps
    eval_f1score = eval_f1score / nb_eval_steps
    #print(f"Validation Loss: {eval_loss}")
    #print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions, eval_accuracy, eval_precision, eval_recall, eval_f1score

In [4]:
def main(n_epochs, model_name, model_save_flag, model_save_location, model_load_flag, model_load_location,
         max_len = 256, batch_size = 32, grad_step = 1, learning_rate = 1e-05,
         dataset_location = '../datasets/task_a/'):
    """Fine-tune (or load) a sequence classifier and evaluate it on the dev split each epoch.

    Args:
        n_epochs: Number of train/evaluate rounds.
        model_name: Pretrained checkpoint name used when ``model_load_flag`` is falsy.
        model_save_flag: If truthy, save tokenizer and model to
            ``model_save_location`` whenever the dev F1 improves.
        model_save_location: Directory for the saved checkpoint (created if missing).
        model_load_flag: If truthy, load tokenizer and model from
            ``model_load_location`` and skip training entirely.
        model_load_location: Directory to load from when ``model_load_flag`` is truthy.
        max_len: First element of the tuple handed to ``initialize_data``.
        batch_size: Second element of the tuple handed to ``initialize_data``.
        grad_step: Gradient-accumulation steps forwarded to ``train``.
        learning_rate: Learning rate of the Adam optimizer.
        dataset_location: Directory passed to ``read_task``.

    Returns:
        A 17-tuple, in this order: best_dev_acc, best_test_acc, best_epoch,
        best_dev_precision, best_test_precision, best_dev_recall,
        best_test_recall, best_dev_f1score, best_test_f1score, list_dev_acc,
        list_test_acc, list_dev_precision, list_test_precision,
        list_dev_recall, list_test_recall, list_dev_f1score, list_test_f1score.

        No test-set evaluation is performed here, so every ``best_test_*``
        value is 0 and every ``list_test_*`` list is empty; they are returned
        only to keep the tuple layout stable. Each ``best_dev_*`` value is
        the maximum of that metric on its own and may come from a different
        epoch than the others. ``best_epoch`` is the 0-based epoch with the
        best dev F1 (-1 if the F1 never rose above 0).
    """
    initialization_input = (max_len, batch_size)

    #Reading datasets and initializing data loaders
    train_data = read_task(dataset_location , split = 'mal_sentiment_train')
    dev_data = read_task(dataset_location , split = 'mal_sentiment_dev')
    labels_to_ids = labels_to_ids_mal

    #Define tokenizer, model and optimizer
    device = 'cuda' if cuda.is_available() else 'cpu'
    if model_load_flag:
        tokenizer = AutoTokenizer.from_pretrained(model_load_location)
        model = AutoModelForSequenceClassification.from_pretrained(model_load_location)
    else:
        tokenizer =  AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels_to_ids))
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    model.to(device)

    #Get dataloaders
    train_loader = initialize_data(tokenizer, initialization_input, train_data, labels_to_ids, shuffle = True)
    # The dev loader is NOT shuffled: testing() averages per-batch macro scores,
    # so reshuffling would change the reported dev metrics (and therefore the
    # checkpoint selection) from call to call for identical model weights.
    dev_loader = initialize_data(tokenizer, initialization_input, dev_data, labels_to_ids, shuffle = False)

    best_dev_acc = 0
    best_test_acc = 0
    best_dev_precision = 0
    best_test_precision = 0
    best_dev_recall = 0
    best_test_recall = 0
    best_dev_f1score = 0
    best_test_f1score = 0
    best_epoch = -1

    list_dev_acc = []
    list_test_acc = []
    list_dev_precision = []
    list_test_precision  = []
    list_dev_recall = []
    list_test_recall = []
    list_dev_f1score = []
    list_test_f1score = []

    for epoch in range(n_epochs):
        start = time.time()
        print(f"Training epoch: {epoch + 1}")

        #train model
        # NOTE: with model_load_flag set, no training happens and the same
        # loaded weights are re-evaluated on every iteration.
        if not model_load_flag:
            model = train(epoch, train_loader, model, optimizer, device, grad_step)

        #testing and logging
        labels_dev, predictions_dev, dev_accuracy, dev_precision, dev_recall, dev_f1score = testing(model, dev_loader, labels_to_ids, device)
        print('DEV ACC:', dev_accuracy)
        print('DEV Precision:' , dev_precision)
        print('DEV Recall:' , dev_recall)
        print('DEV F1Score:' , dev_f1score)

        list_dev_acc.append(dev_accuracy)
        list_dev_precision.append(dev_precision)
        list_dev_recall.append(dev_recall)
        list_dev_f1score.append(dev_f1score)

        # Track the running maximum of each dev metric independently.
        if dev_accuracy > best_dev_acc:
            best_dev_acc = dev_accuracy
        if dev_precision > best_dev_precision:
            best_dev_precision = dev_precision
        if dev_recall > best_dev_recall:
            best_dev_recall = dev_recall
        if dev_f1score > best_dev_f1score:
            best_dev_f1score = dev_f1score
            best_epoch = epoch

            # Dev F1 is the checkpoint-selection criterion: save only on a new best F1.
            if model_save_flag:
                os.makedirs(model_save_location, exist_ok=True)
                tokenizer.save_pretrained(model_save_location)
                model.save_pretrained(model_save_location)

        now = time.time()
        print('BEST ACCURACY --> ', 'DEV:', round(best_dev_acc, 5))
        print('BEST PRECISION --> ', 'DEV:', round(best_dev_precision, 5))
        print('BEST RECALL --> ', 'DEV:', round(best_dev_recall, 5))
        print('BEST F1SCORE --> ', 'DEV:', round(best_dev_f1score, 5))
        print('TIME PER EPOCH:', (now-start)/60 )
        print()

    return best_dev_acc, best_test_acc, best_epoch, best_dev_precision, best_test_precision, best_dev_recall, best_test_recall, best_dev_f1score, best_test_f1score, list_dev_acc, list_test_acc, list_dev_precision, list_test_precision, list_dev_recall, list_test_recall, list_dev_f1score, list_test_f1score

In [5]:
if __name__ == '__main__':
    # Experiment driver: five independent fine-tuning runs per model name,
    # collecting the per-epoch dev/test metric histories of every run.
    n_epochs = 10
    models = ['xlm-roberta-base']

    # Save the best checkpoint of each run; never load an existing one.
    model_save_flag = True
    model_load_flag = False

    # One inner list per run is appended to each of these.
    overall_list_dev_acc, overall_list_test_acc = [], []
    overall_list_dev_precision, overall_list_test_precision = [], []
    overall_list_dev_recall, overall_list_test_recall = [], []
    overall_list_dev_f1score, overall_list_test_f1score = [], []

    for i in range(5):

        for model_name in models:

            # Each run gets its own checkpoint directory, suffixed with the run index.
            model_save_location = f'saved_models/{model_name}Malayalam{i}'
            model_load_location = None

            (
                best_dev_acc, best_test_acc, best_epoch,
                best_dev_precision, best_test_precision,
                best_dev_recall, best_test_recall,
                best_dev_f1score, best_test_f1score,
                list_dev_acc, list_test_acc,
                list_dev_precision, list_test_precision,
                list_dev_recall, list_test_recall,
                list_dev_f1score, list_test_f1score,
            ) = main(
                n_epochs,
                model_name,
                model_save_flag,
                model_save_location,
                model_load_flag,
                model_load_location,
            )

            # Dev histories of this run.
            overall_list_dev_acc.append(list_dev_acc)
            overall_list_dev_precision.append(list_dev_precision)
            overall_list_dev_recall.append(list_dev_recall)
            overall_list_dev_f1score.append(list_dev_f1score)

            # Test histories of this run.
            overall_list_test_acc.append(list_test_acc)
            overall_list_test_precision.append(list_test_precision)
            overall_list_test_recall.append(list_test_recall)
            overall_list_test_f1score.append(list_test_f1score)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Training epoch: 1
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 59 of 497
FINSIHED BATCH: 79 of 497
FINSIHED BATCH: 99 of 497
FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 259 of 497
FINSIHED BATCH: 279 of 497
FINSIHED BATCH: 299 of 497
FINSIHED BATCH: 319 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 0.8729907274246216
DEV ACC: 0.6417410714285714
DEV Precision: 0.47390393328767527
DEV Recall: 0.49645170098004837
DEV F1Score: 0.4622464704346024
BEST ACCURACY -->  DEV: 0.64174
BEST PRECISION -->  DEV: 0.4739
BEST RECALL -->  DEV: 0.49645
BEST F1SCORE -->  DEV: 0.46225
TIME PER EPOCH: 11.9590780

FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 1.4481534957885742
DEV ACC: 0.7351190476190476
DEV Precision: 0.6684189314834631
DEV Recall: 0.6853898077526311
DEV F1Score: 0.658097685935347
BEST ACCURACY -->  DEV: 0.73828
BEST PRECISION -->  DEV: 0.69161
BEST RECALL -->  DEV: 0.68639
BEST F1SCORE -->  DEV: 0.66506
TIME PER EPOCH: 12.05347146987915

Training epoch: 10
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 259 of 497
FINSIHED BATCH: 279 of 497
FINSIHED BATCH: 299 of 497
FINSIHED BATCH: 319 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Training epoch: 1
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 59 of 497
FINSIHED BATCH: 79 of 497
FINSIHED BATCH: 99 of 497
FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 259 of 497
FINSIHED BATCH: 279 of 497
FINSIHED BATCH: 299 of 497
FINSIHED BATCH: 319 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 0.8178290128707886
DEV ACC: 0.6261160714285714
DEV Precision: 0.4850468813690976
DEV Recall: 0.5411985073021051
DEV F1Score: 0.49154401919446566
BEST ACCURACY -->  DEV: 0.62612
BEST PRECISION -->  DEV: 0.48505
BEST RECALL -->  DEV: 0.5412
BEST F1SCORE -->  DEV: 0.49154
TIME PER EPOCH: 11.88620485

FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 259 of 497
FINSIHED BATCH: 279 of 497
FINSIHED BATCH: 299 of 497
FINSIHED BATCH: 319 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 1.4059439897537231
DEV ACC: 0.7388392857142857
DEV Precision: 0.678286983748996
DEV Recall: 0.6672454627362231
DEV F1Score: 0.6541816627885166
BEST ACCURACY -->  DEV: 0.73884
BEST PRECISION -->  DEV: 0.68087
BEST RECALL -->  DEV: 0.68292
BEST F1SCORE -->  DEV: 0.65647
TIME PER EPOCH: 12.108386981487275

Training epoch: 10
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 59 of 497
FINSIHED BATCH: 79 of 497
FINSIHED BATCH: 9

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Training epoch: 1
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 59 of 497
FINSIHED BATCH: 79 of 497
FINSIHED BATCH: 99 of 497
FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 259 of 497
FINSIHED BATCH: 279 of 497
FINSIHED BATCH: 299 of 497
FINSIHED BATCH: 319 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 0.8488255143165588
DEV ACC: 0.6389508928571429
DEV Precision: 0.5170747833943256
DEV Recall: 0.49952207449121805
DEV F1Score: 0.47783561751372605
BEST ACCURACY -->  DEV: 0.63895
BEST PRECISION -->  DEV: 0.51707
BEST RECALL -->  DEV: 0.49952
BEST F1SCORE -->  DEV: 0.47784
TIME PER EPOCH: 11.776398

FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 0.7394698262214661
DEV ACC: 0.7420014880952381
DEV Precision: 0.6479621805092421
DEV Recall: 0.6722580626610232
DEV F1Score: 0.6413105274747947
BEST ACCURACY -->  DEV: 0.74833
BEST PRECISION -->  DEV: 0.67854
BEST RECALL -->  DEV: 0.67522
BEST F1SCORE -->  DEV: 0.65587
TIME PER EPOCH: 12.050318789482116

Training epoch: 10
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 59 of 497
FINSIHED BATCH: 79 of 497
FINSIHED BATCH: 99 of 497
FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 1

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Training epoch: 1
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 59 of 497
FINSIHED BATCH: 79 of 497
FINSIHED BATCH: 99 of 497
FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 259 of 497
FINSIHED BATCH: 279 of 497
FINSIHED BATCH: 299 of 497
FINSIHED BATCH: 319 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 1.0190374851226807
DEV ACC: 0.6482514880952381
DEV Precision: 0.5096759544819033
DEV Recall: 0.5432160419529116
DEV F1Score: 0.5092819898097434
BEST ACCURACY -->  DEV: 0.64825
BEST PRECISION -->  DEV: 0.50968
BEST RECALL -->  DEV: 0.54322
BEST F1SCORE -->  DEV: 0.50928
TIME PER EPOCH: 11.94056435

FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 259 of 497
FINSIHED BATCH: 279 of 497
FINSIHED BATCH: 299 of 497
FINSIHED BATCH: 319 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 0.6128011345863342
DEV ACC: 0.7328869047619048
DEV Precision: 0.6747301471042348
DEV Recall: 0.663542942235169
DEV F1Score: 0.6463083312137151
BEST ACCURACY -->  DEV: 0.74628
BEST PRECISION -->  DEV: 0.68298
BEST RECALL -->  DEV: 0.6692
BEST F1SCORE -->  DEV: 0.65056
TIME PER EPOCH: 12.140555894374847

Training epoch: 10
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 59 of 497
FINSIHED BATCH: 79 of 497
FINSIHED BATCH: 99

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Training epoch: 1
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 59 of 497
FINSIHED BATCH: 79 of 497
FINSIHED BATCH: 99 of 497
FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 259 of 497
FINSIHED BATCH: 279 of 497
FINSIHED BATCH: 299 of 497
FINSIHED BATCH: 319 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 1.0592859983444214
DEV ACC: 0.5872395833333333
DEV Precision: 0.3924894354630671
DEV Recall: 0.4667595556157379
DEV F1Score: 0.4067118864768016
BEST ACCURACY -->  DEV: 0.58724
BEST PRECISION -->  DEV: 0.39249
BEST RECALL -->  DEV: 0.46676
BEST F1SCORE -->  DEV: 0.40671
TIME PER EPOCH: 12.07463461

FINSIHED BATCH: 119 of 497
FINSIHED BATCH: 139 of 497
FINSIHED BATCH: 159 of 497
FINSIHED BATCH: 179 of 497
FINSIHED BATCH: 199 of 497
FINSIHED BATCH: 219 of 497
FINSIHED BATCH: 239 of 497
FINSIHED BATCH: 259 of 497
FINSIHED BATCH: 279 of 497
FINSIHED BATCH: 299 of 497
FINSIHED BATCH: 319 of 497
FINSIHED BATCH: 339 of 497
FINSIHED BATCH: 359 of 497
FINSIHED BATCH: 379 of 497
FINSIHED BATCH: 399 of 497
FINSIHED BATCH: 419 of 497
FINSIHED BATCH: 439 of 497
FINSIHED BATCH: 459 of 497
FINSIHED BATCH: 479 of 497
Validation loss per 100 evaluation steps: 0.6193797588348389
DEV ACC: 0.7416294642857143
DEV Precision: 0.6733242361898446
DEV Recall: 0.6494783861357696
DEV F1Score: 0.6368007395525709
BEST ACCURACY -->  DEV: 0.74163
BEST PRECISION -->  DEV: 0.67332
BEST RECALL -->  DEV: 0.66661
BEST F1SCORE -->  DEV: 0.64133
TIME PER EPOCH: 11.629579552014668

Training epoch: 10
FINSIHED BATCH: 19 of 497
FINSIHED BATCH: 39 of 497
FINSIHED BATCH: 59 of 497
FINSIHED BATCH: 79 of 497
FINSIHED BATCH: 

In [6]:
# Best dev accuracy of the last run only (i = 4): the run loop reassigns this name on every iteration.
print(best_dev_acc)

0.7416294642857143


In [7]:
# 0-based epoch index at which the dev F1 peaked in the last run (main() sets it inside the F1-improvement branch).
print(best_epoch)

9


In [8]:
# Best dev precision of the last run only; it is tracked independently and may come from a different epoch than best_epoch.
print(best_dev_precision)

0.6733242361898446


In [9]:
# Best dev recall of the last run only; it is tracked independently and may come from a different epoch than best_epoch.
print(best_dev_recall)

0.6819214080783216


In [10]:
# Best dev F1 of the last run only; this is the value reached at best_epoch.
print(best_dev_f1score)

0.6555969222773047


In [11]:
# One inner list per run (5 runs), each holding the dev accuracy after every epoch.
print(overall_list_dev_acc)

[[0.6417410714285714, 0.6527157738095238, 0.7062872023809524, 0.7061011904761905, 0.7336309523809524, 0.7369791666666667, 0.73828125, 0.7239583333333333, 0.7351190476190476, 0.7438616071428571], [0.6261160714285714, 0.6765252976190476, 0.671875, 0.7181919642857143, 0.7375372023809524, 0.7341889880952381, 0.7297247023809524, 0.7347470238095238, 0.7388392857142857, 0.7444196428571429], [0.6389508928571429, 0.6819196428571429, 0.697172619047619, 0.7254464285714286, 0.7483258928571429, 0.736235119047619, 0.7332589285714286, 0.7349330357142857, 0.7420014880952381, 0.7410714285714286], [0.6482514880952381, 0.6555059523809524, 0.7062872023809524, 0.716889880952381, 0.7327008928571429, 0.7462797619047619, 0.7386532738095238, 0.7250744047619048, 0.7328869047619048, 0.7328869047619048], [0.5872395833333333, 0.6716889880952381, 0.7007068452380952, 0.7271205357142857, 0.7267485119047619, 0.7276785714285714, 0.7271205357142857, 0.7386532738095238, 0.7416294642857143, 0.7408854166666667]]


In [12]:
# One inner list per run (5 runs), each holding the dev precision after every epoch.
print(overall_list_dev_precision)

[[0.47390393328767527, 0.532160602459454, 0.6368235691559165, 0.6285560126700661, 0.6716092706213573, 0.6916060586530649, 0.6604518998895104, 0.6742867952670392, 0.6684189314834631, 0.6913082283735478], [0.4850468813690976, 0.5445623253211849, 0.6012883198151056, 0.6539759253251969, 0.6808705909251505, 0.6637186887161318, 0.6451390117041379, 0.6778838847832105, 0.678286983748996, 0.6763696374720601], [0.5170747833943256, 0.5935569559133028, 0.5945191550772255, 0.6683298973957145, 0.6719625265108803, 0.6610693282266283, 0.6754965976928744, 0.6785422624377315, 0.6479621805092421, 0.6632307292242745], [0.5096759544819033, 0.5353651704141716, 0.592164484529243, 0.6407020030477227, 0.6416445428619245, 0.6694661561870484, 0.6829809100517013, 0.6333755305160037, 0.6747301471042348, 0.6632597963427797], [0.3924894354630671, 0.5343931650649155, 0.5848724187373086, 0.6246520761481318, 0.6606437928463927, 0.6298016649458213, 0.6596556480907294, 0.6558496537455307, 0.6733242361898446, 0.6712152658

In [13]:
# One inner list per run (5 runs), each holding the dev recall after every epoch.
print(overall_list_dev_recall)

[[0.49645170098004837, 0.5757048340169401, 0.6206403283195562, 0.6492825822360598, 0.6496756844568585, 0.6863877558599196, 0.6484420107832729, 0.6679435566092465, 0.6853898077526311, 0.6749190594069793], [0.5411985073021051, 0.5436280964968494, 0.6309053450312158, 0.6177198164376474, 0.6705013900227923, 0.6609485933758242, 0.6675741599240604, 0.6829201298050809, 0.6672454627362231, 0.6705202501040902], [0.49952207449121805, 0.5563726914133953, 0.5989964303220673, 0.6673578137702144, 0.6752168326538074, 0.6654966250240741, 0.6699333571569653, 0.6622710262718139, 0.6722580626610232, 0.6634167878674596], [0.5432160419529116, 0.5295074442218611, 0.6032608097143287, 0.6366944790008079, 0.6482573096369882, 0.6538897791051153, 0.6691976220849647, 0.6488633323529386, 0.663542942235169, 0.6805566519294198], [0.4667595556157379, 0.5388176988682848, 0.5925474104268746, 0.6313011596070461, 0.6604141222339751, 0.6416325898496743, 0.6362785218407385, 0.66661227115831, 0.6494783861357696, 0.681921408

In [14]:
# One inner list per run (5 runs), each holding the dev F1 after every epoch.
print(overall_list_dev_f1score)

[[0.4622464704346024, 0.5294293328641828, 0.6025745558531928, 0.6157691574550028, 0.6344774173542603, 0.6650560734717512, 0.6337847263258539, 0.647105011114274, 0.658097685935347, 0.6592546708737933], [0.49154401919446566, 0.5227336774260519, 0.5941295011758733, 0.6138116795062777, 0.650543147029189, 0.6433351288633399, 0.637286001534221, 0.6564654815442095, 0.6541816627885166, 0.6509445812169533], [0.47783561751372605, 0.5496243136200788, 0.5796731061061793, 0.6425432146274224, 0.6558704825512229, 0.6431168440097897, 0.6483214922478986, 0.6495269540738786, 0.6413105274747947, 0.6425112974539919], [0.5092819898097434, 0.5074808544994215, 0.5794872198601139, 0.6174166489520897, 0.6242074719350471, 0.6403059467666162, 0.6505641345575375, 0.6211258174806573, 0.6463083312137151, 0.6491969484512271], [0.4067118864768016, 0.5102913313489785, 0.5693019431820526, 0.6091966616945207, 0.6407729871728041, 0.6137969540140569, 0.6201961407698918, 0.6413288524554954, 0.6368007395525709, 0.6555969222

In [15]:
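#The best model is 3
# NOTE(review): presumably this means the third run (loop index i = 2, saved as
# 'saved_models/xlm-roberta-baseMalayalam2'), which reached the highest dev accuracy (0.7483).
# By dev F1 -- the criterion used for saving checkpoints -- the first run (i = 0, 0.6651) is best.
# Confirm which run and which metric are intended.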