In [1]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from load_data import initialize_data
from reading_datasets import read_task
from labels_to_ids import labels_to_ids_kan
import time
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
def train(epoch, train_loader, model, optimizer, device, grad_step = 1, max_grad_norm = 10):
    """Run one pass over ``train_loader`` and return the updated ``model``.

    Gradients are accumulated over ``grad_step`` consecutive batches; the
    accumulated gradient is then clipped to ``max_grad_norm`` and applied with
    a single ``optimizer.step()``.

    Changes compared with the previous implementation:

    * Clipping used to run *before* ``backward()``, i.e. on the already-zeroed
      gradients of the previous step, so the gradients that were actually
      applied were never clipped. Clipping now happens after ``backward()``
      and right before ``optimizer.step()``.
    * Gradients left over from a trailing, incomplete accumulation window
      (number of batches not divisible by ``grad_step``) are now applied
      instead of being dropped.
    * The per-batch loss/accuracy/precision/recall/F1 bookkeeping was removed:
      none of it was printed or returned, while it kept the autograd-attached
      loss tensor and per-batch label/prediction tensors alive for the whole
      epoch and cost several host round-trips per batch.

    Args:
        epoch: Index of the current epoch. Not used inside the function; kept
            so existing callers keep working.
        train_loader: Iterable of batches. Each batch is indexed with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` and every
            value must support ``.to(device, dtype=torch.long)``. ``len()`` is
            only needed for the progress message printed every 20 batches.
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments whose result carries the loss at
            index 0 (true for HuggingFace outputs when labels are passed, and
            for plain ``(loss, logits)`` tuples).
        optimizer: Optimizer bound to ``model.parameters()``.
        device: Device the batch tensors are moved to.
        grad_step: Number of batches to accumulate before each optimizer step.
        max_grad_norm: Maximum total gradient norm used for clipping.

    Returns:
        The same ``model`` object, after training.
    """

    def _apply_update():
        # Clip what has been accumulated since the last step, then apply it.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )
        optimizer.step()
        optimizer.zero_grad()

    # put model in training mode
    model.train()
    optimizer.zero_grad()

    pending_batches = 0  # batches back-propagated since the last optimizer step
    for idx, batch in enumerate(train_loader):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        if (idx + 1) % 20 == 0:
            print('FINSIHED BATCH:', idx, 'of', len(train_loader))

        output = model(input_ids=ids, attention_mask=mask, labels=labels)

        # backward pass (gradients add up across the accumulation window)
        output[0].backward()
        pending_batches += 1

        if (idx + 1) % grad_step == 0:
            _apply_update()
            pending_batches = 0

    # Flush a trailing, incomplete accumulation window.
    if pending_batches:
        _apply_update()

    return model

In [3]:
def testing(model, testing_loader, labels_to_ids, device):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    eval_precision, eval_recall = 0, 0
    eval_f1score = 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
     
    
    ids_to_labels = dict((v,k) for k,v in labels_to_ids.items())

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            #loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
            output = model(input_ids=ids, attention_mask=mask, labels=labels)

            eval_loss += output['loss'].item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = output[1].view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
            
            # Compute Precision
            tmp_eval_precision = precision_score(labels.cpu().numpy(), predictions.cpu().numpy(), average = 'macro', zero_division=0)
            eval_precision += tmp_eval_precision
            
            # Compute Recall
            tmp_eval_recall = recall_score(labels.cpu().numpy(), predictions.cpu().numpy(), average = 'macro', zero_division=0)
            eval_recall += tmp_eval_recall
            
            # Compute f1score
            tmp_eval_f1score = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='macro', zero_division=0)
            eval_f1score += tmp_eval_f1score

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    eval_precision = eval_precision / nb_eval_steps
    eval_recall = eval_recall / nb_eval_steps
    eval_f1score = eval_f1score / nb_eval_steps
    #print(f"Validation Loss: {eval_loss}")
    #print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions, eval_accuracy, eval_precision, eval_recall, eval_f1score

In [4]:
def main(n_epochs, model_name, model_save_flag, model_save_location, model_load_flag, model_load_location):
    """Fine-tune (or only evaluate) one classifier and track its dev metrics.

    The Kannada transliterated train/dev splits are read from
    ``'../datasets/task_a/transliterated/'``. When ``model_load_flag`` is
    falsy a fresh model is created from ``model_name`` and trained for one
    pass per epoch; otherwise tokenizer and model are loaded from
    ``model_load_location`` and no training happens (the dev set is still
    evaluated once per epoch). Whenever the dev F1 improves, ``best_epoch`` is
    updated and, if ``model_save_flag`` is truthy, tokenizer and model are
    written to ``model_save_location``.

    No test split is evaluated here, so every ``best_test_*`` value returned
    is ``0`` and every ``list_test_*`` value is an empty list.

    Returns:
        A 17-tuple: ``best_dev_acc, best_test_acc, best_epoch,
        best_dev_precision, best_test_precision, best_dev_recall,
        best_test_recall, best_dev_f1score, best_test_f1score, list_dev_acc,
        list_test_acc, list_dev_precision, list_test_precision,
        list_dev_recall, list_test_recall, list_dev_f1score,
        list_test_f1score``. ``best_epoch`` is the 0-based epoch index of the
        best dev F1 (``-1`` if it never exceeded 0).
    """
    # Training hyper-parameters
    max_len, batch_size = 256, 32
    grad_step = 1
    learning_rate = 1e-05
    initialization_input = (max_len, batch_size)

    # Datasets
    dataset_location = '../datasets/task_a/transliterated/'
    train_data = read_task(dataset_location, split='kan_train_trans')
    dev_data = read_task(dataset_location, split='kan_dev_trans')
    labels_to_ids = labels_to_ids_kan

    # Tokenizer, model and optimizer
    device = 'cpu'
    if cuda.is_available():
        device = 'cuda'

    if not model_load_flag:
        tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=len(labels_to_ids)
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_load_location)
        model = AutoModelForSequenceClassification.from_pretrained(model_load_location)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    model.to(device)

    # Data loaders
    train_loader = initialize_data(tokenizer, initialization_input, train_data, labels_to_ids, shuffle = True)
    dev_loader = initialize_data(tokenizer, initialization_input, dev_data, labels_to_ids, shuffle = True)

    # Per-metric state, keyed by a short metric name.
    metric_names = ('acc', 'precision', 'recall', 'f1score')
    best_dev = {name: 0 for name in metric_names}
    best_test = {name: 0 for name in metric_names}      # never updated: no test split
    dev_history = {name: [] for name in metric_names}
    test_history = {name: [] for name in metric_names}  # stays empty: no test split
    best_epoch = -1

    dev_titles = {
        'acc': 'DEV ACC:',
        'precision': 'DEV Precision:',
        'recall': 'DEV Recall:',
        'f1score': 'DEV F1Score:',
    }
    best_titles = {
        'acc': 'BEST ACCURACY --> ',
        'precision': 'BEST PRECISION --> ',
        'recall': 'BEST RECALL --> ',
        'f1score': 'BEST F1SCORE --> ',
    }

    for epoch in range(n_epochs):
        start = time.time()
        print(f"Training epoch: {epoch + 1}")

        # A loaded model is only evaluated, never trained.
        if not model_load_flag:
            model = train(epoch, train_loader, model, optimizer, device, grad_step)

        (labels_dev, predictions_dev,
         dev_accuracy, dev_precision, dev_recall, dev_f1score) = testing(model, dev_loader, labels_to_ids, device)
        dev_scores = {
            'acc': dev_accuracy,
            'precision': dev_precision,
            'recall': dev_recall,
            'f1score': dev_f1score,
        }

        for name in metric_names:
            print(dev_titles[name], dev_scores[name])
        for name in metric_names:
            dev_history[name].append(dev_scores[name])

        # Each metric keeps its own running maximum, independently of the others.
        for name in ('acc', 'precision', 'recall'):
            if dev_scores[name] > best_dev[name]:
                best_dev[name] = dev_scores[name]

        # Dev F1 additionally decides the best epoch and when to checkpoint.
        if dev_scores['f1score'] > best_dev['f1score']:
            best_dev['f1score'] = dev_scores['f1score']
            best_epoch = epoch

            if model_save_flag:
                os.makedirs(model_save_location, exist_ok=True)
                tokenizer.save_pretrained(model_save_location)
                model.save_pretrained(model_save_location)

        now = time.time()
        for name in metric_names:
            print(best_titles[name], 'DEV:', round(best_dev[name], 5))
        print('TIME PER EPOCH:', (now-start)/60 )
        print()

    return (
        best_dev['acc'], best_test['acc'], best_epoch,
        best_dev['precision'], best_test['precision'],
        best_dev['recall'], best_test['recall'],
        best_dev['f1score'], best_test['f1score'],
        dev_history['acc'], test_history['acc'],
        dev_history['precision'], test_history['precision'],
        dev_history['recall'], test_history['recall'],
        dev_history['f1score'], test_history['f1score'],
    )

In [5]:
if __name__ == '__main__':
    # Driver: repeat the full fine-tuning run five times for every model in
    # `models`, saving each run under its own directory, and collect the
    # per-epoch metric histories of every run in the `overall_list_*` lists.
    n_epochs, models = 10, ['ai4bharat/indic-bert']

    # model saving parameters: always train from scratch and checkpoint
    model_save_flag, model_load_flag = True, False

    # One entry (a per-epoch list) is appended per main() call.
    overall_list_dev_acc, overall_list_test_acc = [], []
    overall_list_dev_precision, overall_list_test_precision = [], []
    overall_list_dev_recall, overall_list_test_recall = [], []
    overall_list_dev_f1score, overall_list_test_f1score = [], []

    for i in range(5):

        for model_name in models:

            # The run index is appended so repeated runs do not overwrite each other.
            model_save_location = f'saved_models/{model_name}Kannadatransliterated{i}'
            model_load_location = None

            # These names are rebound on every call, so after the loops they
            # hold the values of the final run only.
            (
                best_dev_acc, best_test_acc, best_epoch,
                best_dev_precision, best_test_precision,
                best_dev_recall, best_test_recall,
                best_dev_f1score, best_test_f1score,
                list_dev_acc, list_test_acc,
                list_dev_precision, list_test_precision,
                list_dev_recall, list_test_recall,
                list_dev_f1score, list_test_f1score,
            ) = main(
                n_epochs=n_epochs,
                model_name=model_name,
                model_save_flag=model_save_flag,
                model_save_location=model_save_location,
                model_load_flag=model_load_flag,
                model_load_location=model_load_location,
            )

            # dev histories
            overall_list_dev_acc.append(list_dev_acc)
            overall_list_dev_precision.append(list_dev_precision)
            overall_list_dev_recall.append(list_dev_recall)
            overall_list_dev_f1score.append(list_dev_f1score)

            # test histories
            overall_list_test_acc.append(list_test_acc)
            overall_list_test_precision.append(list_test_precision)
            overall_list_test_recall.append(list_test_recall)
            overall_list_test_f1score.append(list_test_f1score)

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.weight', 'sop_classifier.classifier.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.decoder.bias', 'sop_classifier.classifier.weight', 'predictions.dense.bias', 'predictions.LayerNorm.bias', 'predictions.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indi

Training epoch: 1
FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 1.2887059450149536
DEV ACC: 0.46082535885167464
DEV Precision: 0.09536109449760764
DEV Recall: 0.2068181818181819
DEV F1Score: 0.12973988583225207
BEST ACCURACY -->  DEV: 0.46083
BEST PRECISION -->  DEV: 0.09536
BEST RECALL -->  DEV: 0.20682
BEST F1SCORE -->  DEV: 0.12974
TIME PER EPOCH: 3.947287102540334

Training epoch: 2
FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 1.1868736743927002
DEV ACC: 0.5195873205741627
DEV Precision: 0.2883934964361229
DEV Recall: 0.2

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.weight', 'sop_classifier.classifier.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.decoder.bias', 'sop_classifier.classifier.weight', 'predictions.dense.bias', 'predictions.LayerNorm.bias', 'predictions.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indi

Training epoch: 1
FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 1.2090815305709839
DEV ACC: 0.46845095693779903
DEV Precision: 0.13603241993067494
DEV Recall: 0.21935064935064938
DEV F1Score: 0.1486466444853959
BEST ACCURACY -->  DEV: 0.46845
BEST PRECISION -->  DEV: 0.13603
BEST RECALL -->  DEV: 0.21935
BEST F1SCORE -->  DEV: 0.14865
TIME PER EPOCH: 4.18689440091451

Training epoch: 2
FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 1.1771514415740967
DEV ACC: 0.5911333732057416
DEV Precision: 0.4255159175937207
DEV Recall: 0.42

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.weight', 'sop_classifier.classifier.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.decoder.bias', 'sop_classifier.classifier.weight', 'predictions.dense.bias', 'predictions.LayerNorm.bias', 'predictions.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indi

Training epoch: 1
FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 1.2643383741378784
DEV ACC: 0.464712918660287
DEV Precision: 0.09542837918660287
DEV Recall: 0.2045454545454546
DEV F1Score: 0.12849843948477735
BEST ACCURACY -->  DEV: 0.46471
BEST PRECISION -->  DEV: 0.09543
BEST RECALL -->  DEV: 0.20455
BEST F1SCORE -->  DEV: 0.1285
TIME PER EPOCH: 4.282869958877564

Training epoch: 2
FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 1.3601171970367432
DEV ACC: 0.5030651913875598
DEV Precision: 0.26617275702092164
DEV Recall: 0.257

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.weight', 'sop_classifier.classifier.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.decoder.bias', 'sop_classifier.classifier.weight', 'predictions.dense.bias', 'predictions.LayerNorm.bias', 'predictions.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indi

Training epoch: 1
FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 1.2140412330627441
DEV ACC: 0.46276913875598086
DEV Precision: 0.09369019138755981
DEV Recall: 0.20227272727272733
DEV F1Score: 0.12725345891298107
BEST ACCURACY -->  DEV: 0.46277
BEST PRECISION -->  DEV: 0.09369
BEST RECALL -->  DEV: 0.20227
BEST F1SCORE -->  DEV: 0.12725
TIME PER EPOCH: 4.2692061026891075

Training epoch: 2
FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 1.2654861211776733
DEV ACC: 0.4917763157894737
DEV Precision: 0.23327724284039048
DEV Recall: 

FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 0.9834244251251221
DEV ACC: 0.6100478468899522
DEV Precision: 0.47446303262831546
DEV Recall: 0.487383333627001
DEV F1Score: 0.45738095837334813
BEST ACCURACY -->  DEV: 0.61715
BEST PRECISION -->  DEV: 0.55045
BEST RECALL -->  DEV: 0.51566
BEST F1SCORE -->  DEV: 0.50517
TIME PER EPOCH: 4.064034748077392

Training epoch: 8
FINSIHED BATCH: 19 of 195
FINSIHED BATCH: 39 of 195
FINSIHED BATCH: 59 of 195
FINSIHED BATCH: 79 of 195
FINSIHED BATCH: 99 of 195
FINSIHED BATCH: 119 of 195
FINSIHED BATCH: 139 of 195
FINSIHED BATCH: 159 of 195
FINSIHED BATCH: 179 of 195
Validation loss per 100 evaluation steps: 1.0633466243743896
DEV ACC: 0.5741626794258373
DEV Precision: 0.4954662928426297
DEV Recall: 0.5003078096165804
DEV 

In [6]:
# best_dev_acc is rebound by every main() call in the training cell, so this
# shows the best dev accuracy of the last run only, not the best across runs.
print(best_dev_acc)

0.6171501196172249


In [7]:
# 0-based epoch index (main() stores the range(n_epochs) loop variable) at
# which the last run reached its best dev F1.
print(best_epoch)

5


In [8]:
# Best dev precision of the last run only (rebound on every main() call).
print(best_dev_precision)

0.5504541191678821


In [9]:
# Best dev recall of the last run only (rebound on every main() call).
print(best_dev_recall)

0.5156615555067567


In [10]:
# Best dev F1 of the last run only (rebound on every main() call).
print(best_dev_f1score)

0.5051700641888434


In [11]:
# One inner list per main() call (5 runs x 1 model), each holding the dev
# accuracy of every epoch in that run.
print(overall_list_dev_acc)

[[0.46082535885167464, 0.5195873205741627, 0.5770035885167464, 0.5860496411483254, 0.61251495215311, 0.6120663875598086, 0.605936004784689, 0.5864234449760766, 0.6021232057416268, 0.5807416267942584], [0.46845095693779903, 0.5911333732057416, 0.5717703349282297, 0.6143839712918661, 0.6200657894736842, 0.5978618421052632, 0.6134868421052632, 0.6087021531100478, 0.5698265550239234, 0.5755831339712919], [0.464712918660287, 0.5030651913875598, 0.5992075358851675, 0.6229066985645934, 0.6191686602870813, 0.6310556220095693, 0.6220095693779905, 0.6015251196172249, 0.6021232057416268, 0.5645933014354066], [0.46276913875598086, 0.4917763157894737, 0.5703498803827751, 0.6134868421052632, 0.6233552631578948, 0.6163277511961722, 0.6153558612440192, 0.5995813397129187, 0.591282894736842, 0.5628738038277512], [0.46762858851674644, 0.5632476076555024, 0.5798444976076556, 0.5797697368421052, 0.6049641148325359, 0.6171501196172249, 0.6100478468899522, 0.5741626794258373, 0.5855263157894737, 0.564144736

In [12]:
# One inner list per main() call, each holding the per-epoch dev precision.
print(overall_list_dev_precision)

[[0.09536109449760764, 0.2883934964361229, 0.3382896773889535, 0.3414364857317995, 0.42661125490670954, 0.4776714063780451, 0.47422008090105466, 0.43637933352259267, 0.47447494057682643, 0.45560180558878843], [0.13603241993067494, 0.4255159175937207, 0.4278541526803559, 0.45854261833328813, 0.4786160511349268, 0.4695716798056371, 0.49440293749469094, 0.49742445107664646, 0.4882948479968604, 0.48329622097192215], [0.09542837918660287, 0.26617275702092164, 0.40722852905350243, 0.4851987248228086, 0.443481177912996, 0.47336599733412293, 0.4764765376010427, 0.4887887057824434, 0.4965833104803694, 0.4838365349820459], [0.09369019138755981, 0.23327724284039048, 0.3192137515493429, 0.3661463891493016, 0.3948061883936496, 0.42758343569524265, 0.4637969015596985, 0.49133888326859054, 0.505056500243666, 0.489515247082092], [0.09709180622009571, 0.3796141953449606, 0.34904719583040716, 0.4125266109958348, 0.45529018805989274, 0.5504541191678821, 0.47446303262831546, 0.4954662928426297, 0.49832178

In [13]:
# One inner list per main() call, each holding the per-epoch dev recall.
print(overall_list_dev_recall)

[[0.2068181818181819, 0.2969416672157314, 0.376084793337819, 0.41468606260050117, 0.465263208154075, 0.47609573229058527, 0.4979412924451869, 0.4571018702034744, 0.4999471799739179, 0.44267161537215016], [0.21935064935064938, 0.4273705442851235, 0.4765658990562241, 0.5222643161947975, 0.5185779275685694, 0.4691104483751543, 0.47879233585784386, 0.47135988550180774, 0.5014735438905155, 0.44986477945702547], [0.2045454545454546, 0.2571389558154264, 0.4404137050560847, 0.4601533155839375, 0.5063344167091001, 0.48231974798819716, 0.4969213759451763, 0.4921201854560987, 0.49273225140872207, 0.47663059756642634], [0.20227272727272733, 0.23896740514387577, 0.38153459582603966, 0.4238703460829129, 0.45126243487100515, 0.4736219937222612, 0.4172168328747276, 0.5096897517218374, 0.505297276995138, 0.47052713329451307], [0.2068181818181819, 0.35862947923952704, 0.38599658792594344, 0.3946431377282067, 0.43332933707159715, 0.5156615555067567, 0.487383333627001, 0.5003078096165804, 0.49019713008947

In [14]:
# One inner list per main() call, each holding the per-epoch dev F1.
print(overall_list_dev_f1score)

[[0.12973988583225207, 0.2602093049368648, 0.3387591510584993, 0.3611354023508775, 0.4266943803871509, 0.4571146098367078, 0.460274628967351, 0.42718691733077613, 0.46645518966239036, 0.4278696783448337], [0.1486466444853959, 0.40413177870121975, 0.435667285282334, 0.46738914136287896, 0.47600718192208225, 0.4500536075664835, 0.46541247381442674, 0.4665432922282306, 0.4697362145932612, 0.43158133993164205], [0.12849843948477735, 0.2121569777840861, 0.399155889810422, 0.43742505738600596, 0.4522747473461349, 0.4574872622500857, 0.46414450578356165, 0.4707966460080273, 0.4702745777607999, 0.4472069872462191], [0.12725345891298107, 0.18471679433183208, 0.30861816468223024, 0.3855306881967567, 0.3973241521451847, 0.42872555853244343, 0.409687669774233, 0.47731673708631406, 0.4770168307783576, 0.44998613309298785], [0.13071709250089444, 0.3300208412812365, 0.3489208288247653, 0.3640422795662795, 0.41676302524313685, 0.5051700641888434, 0.45738095837334813, 0.47377548006841863, 0.47088820434

In [15]:
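# The best model is 0 (presumably run index i = 0, i.e. the checkpoint saved with suffix "0").
# NOTE(review): the dev metrics printed above do not obviously favour run 0 - confirm which criterion was used to pick it.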