In [1]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from load_data import initialize_data
from reading_datasets import read_task5
from labels_to_ids import task5_labels_to_ids
import time
import os
# Expose only GPU index 0 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
def train(epoch, train_loader, model, optimizer, device, grad_step = 1, max_grad_norm = 10):
    """Run one training epoch over ``train_loader`` and return the updated model.

    Gradients are accumulated over ``grad_step`` consecutive batches. Each
    batch loss is divided by ``grad_step`` so the accumulated gradient is the
    mean (not the sum) over the group. Immediately before every optimizer
    step the accumulated gradient is clipped to ``max_grad_norm``.

    No training-set metrics are computed here: this function only returns the
    model, so evaluation is left to ``testing``.

    Args:
        epoch: Index of the current epoch. Unused; kept so existing callers
            keep working.
        train_loader: Iterable of batches. Each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
            It must support ``len()`` for the progress message.
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output holds the loss at index 0.
        optimizer: Optimizer built over ``model.parameters()``.
        device: Device the batch tensors are moved to.
        grad_step: Number of batches to accumulate before one optimizer step.
        max_grad_norm: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The same ``model`` object, updated in place.

    Raises:
        ValueError: If ``grad_step`` is smaller than 1.
    """
    if grad_step < 1:
        raise ValueError("grad_step must be a positive integer")

    # put model in training mode
    model.train()
    optimizer.zero_grad()

    def _apply_update():
        # Clip AFTER backward so the clipping acts on the gradients that the
        # optimizer is about to apply, then step and reset.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )
        optimizer.step()
        optimizer.zero_grad()

    has_pending_grads = False
    for idx, batch in enumerate(train_loader):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        if (idx + 1) % 20 == 0:
            print('FINSIHED BATCH:', idx, 'of', len(train_loader))

        output = model(input_ids=ids, attention_mask=mask, labels=labels)

        # Index 0 is the loss both for tuple outputs and for transformers'
        # ModelOutput objects when labels are supplied.
        loss = output[0]

        # backward pass (scaled so accumulated gradients average over the group)
        (loss / grad_step).backward()
        has_pending_grads = True

        if (idx + 1) % grad_step == 0:
            _apply_update()
            has_pending_grads = False

    # Apply gradients of a trailing, incomplete accumulation group instead of
    # silently discarding them.
    if has_pending_grads:
        _apply_update()

    return model

In [3]:
def testing(model, testing_loader, labels_to_ids, device):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    eval_precision, eval_recall = 0, 0
    eval_f1score = 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
     
    
    ids_to_labels = dict((v,k) for k,v in labels_to_ids.items())

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            #loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
            output = model(input_ids=ids, attention_mask=mask, labels=labels)

            eval_loss += output['loss'].item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = output[1].view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
            
            # Compute Precision
            tmp_eval_precision = precision_score(labels.cpu().numpy(), predictions.cpu().numpy(), labels=[0,1,2], average = None, zero_division=0)[2]
            eval_precision += tmp_eval_precision
            
            # Compute Recall
            tmp_eval_recall = recall_score(labels.cpu().numpy(), predictions.cpu().numpy(), labels=[0,1,2], average = None, zero_division=0)[2]
            eval_recall += tmp_eval_recall
            
            # Compute f1score
            tmp_eval_f1score = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(),labels=[0,1,2], average=None, zero_division=0)[2]
            eval_f1score += tmp_eval_f1score

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    eval_precision = eval_precision / nb_eval_steps
    eval_recall = eval_recall / nb_eval_steps
    eval_f1score = eval_f1score / nb_eval_steps
    #print(f"Validation Loss: {eval_loss}")
    #print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions, eval_accuracy, eval_precision, eval_recall, eval_f1score

In [4]:
def main(n_epochs, model_name, model_save_flag, model_save_location, model_load_flag, model_load_location):
    """Train (or only evaluate) a sequence classifier on task 5 and track dev metrics.

    For each of ``n_epochs`` epochs the model is trained for one pass (skipped
    when ``model_load_flag`` is true) and then evaluated on the dev split.
    The tokenizer and model are saved to ``model_save_location`` whenever the
    dev F1 improves and ``model_save_flag`` is true.

    Args:
        n_epochs: Number of train/evaluate rounds.
        model_name: Checkpoint name used when ``model_load_flag`` is false.
        model_save_flag: Whether to save the model on dev-F1 improvement.
        model_save_location: Directory the tokenizer and model are saved to.
        model_load_flag: If true, load from ``model_load_location`` and do
            not train.
        model_load_location: Checkpoint location used when ``model_load_flag``
            is true.

    Returns:
        A 17-tuple, in this order: ``best_dev_acc, best_test_acc, best_epoch,
        best_dev_precision, best_test_precision, best_dev_recall,
        best_test_recall, best_dev_f1score, best_test_f1score, list_dev_acc,
        list_test_acc, list_dev_precision, list_test_precision,
        list_dev_recall, list_test_recall, list_dev_f1score,
        list_test_f1score``. No test split is evaluated, so every ``*_test_*``
        value is its initial placeholder (``0`` or an empty list).
        ``best_epoch`` is the zero-based epoch with the best dev F1.
    """
    #Initialization training parameters
    max_len = 256
    batch_size = 32
    grad_step = 1
    learning_rate = 1e-05
    initialization_input = (max_len, batch_size)

    #Reading datasets and initializing data loaders
    dataset_location = '../2022.07.07_task5/'

    train_data = read_task5(dataset_location , split = 'train')
    dev_data = read_task5(dataset_location , split = 'dev')
    labels_to_ids = task5_labels_to_ids

    #Define tokenizer, model and optimizer
    device = 'cuda' if cuda.is_available() else 'cpu'
    if model_load_flag:
        tokenizer = AutoTokenizer.from_pretrained(model_load_location)
        model = AutoModelForSequenceClassification.from_pretrained(model_load_location)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels_to_ids))

    # Move the model to its device before building the optimizer, as the
    # PyTorch optimizer documentation recommends.
    model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    #Get dataloaders
    train_loader = initialize_data(tokenizer, initialization_input, train_data, labels_to_ids, shuffle = True)
    # Evaluation data is not shuffled so that repeated evaluations see the
    # same batches and are reproducible.
    dev_loader = initialize_data(tokenizer, initialization_input, dev_data, labels_to_ids, shuffle = False)

    # NOTE: no test split is loaded. The *_test_* values below are
    # placeholders kept only so the return signature stays stable.
    best_dev_acc = 0
    best_test_acc = 0
    best_dev_precision = 0
    best_test_precision = 0
    best_dev_recall = 0
    best_test_recall = 0
    best_dev_f1score = 0
    best_test_f1score = 0
    best_epoch = -1

    list_dev_acc = []
    list_test_acc = []
    list_dev_precision = []
    list_test_precision = []
    list_dev_recall = []
    list_test_recall = []
    list_dev_f1score = []
    list_test_f1score = []

    for epoch in range(n_epochs):
        start = time.time()
        print(f"Training epoch: {epoch + 1}")

        #train model
        # NOTE(review): with model_load_flag set, every epoch re-evaluates the
        # same unchanged model.
        if not model_load_flag:
            model = train(epoch, train_loader, model, optimizer, device, grad_step)

        #testing and logging
        labels_dev, predictions_dev, dev_accuracy, dev_precision, dev_recall, dev_f1score = testing(model, dev_loader, labels_to_ids, device)
        print('DEV ACC:', dev_accuracy)
        print('DEV Precision:' , dev_precision)
        print('DEV Recall:' , dev_recall)
        print('DEV F1Score:' , dev_f1score)

        list_dev_acc.append(dev_accuracy)
        list_dev_precision.append(dev_precision)
        list_dev_recall.append(dev_recall)
        list_dev_f1score.append(dev_f1score)

        # Each "best" value is tracked independently, so they may come from
        # different epochs.
        best_dev_acc = max(best_dev_acc, dev_accuracy)
        best_dev_precision = max(best_dev_precision, dev_precision)
        best_dev_recall = max(best_dev_recall, dev_recall)

        #saving model
        # Dev F1 is the model-selection criterion: it alone decides
        # best_epoch and when a checkpoint is written.
        if dev_f1score > best_dev_f1score:
            best_dev_f1score = dev_f1score
            best_epoch = epoch

            if model_save_flag:
                os.makedirs(model_save_location, exist_ok=True)
                tokenizer.save_pretrained(model_save_location)
                model.save_pretrained(model_save_location)

        now = time.time()
        print('BEST ACCURACY --> ', 'DEV:', round(best_dev_acc, 5))
        print('BEST PRECISION --> ', 'DEV:', round(best_dev_precision, 5))
        print('BEST RECALL --> ', 'DEV:', round(best_dev_recall, 5))
        print('BEST F1SCORE --> ', 'DEV:', round(best_dev_f1score, 5))
        print('TIME PER EPOCH:', (now-start)/60 )
        print()

    return best_dev_acc, best_test_acc, best_epoch, best_dev_precision, best_test_precision, best_dev_recall, best_test_recall, best_dev_f1score, best_test_f1score, list_dev_acc, list_test_acc, list_dev_precision, list_test_precision, list_dev_recall, list_test_recall, list_dev_f1score, list_test_f1score

In [5]:
if __name__ == '__main__':
    # Experiment settings: epochs per run and the checkpoints to fine-tune.
    n_epochs = 10
    models = ['bert-base-multilingual-uncased']

    # Save improved checkpoints; always start from the pretrained weights.
    model_save_flag, model_load_flag = True, False

    # One entry per run; each entry is that run's per-epoch metric list.
    overall_list_dev_acc, overall_list_test_acc = [], []
    overall_list_dev_precision, overall_list_test_precision = [], []
    overall_list_dev_recall, overall_list_test_recall = [], []
    overall_list_dev_f1score, overall_list_test_f1score = [], []

    # Five repeated runs per model; the run index is part of the save path.
    for i in range(5):

        for model_name in models:

            model_load_location = None
            model_save_location = 'saved_models/' + model_name + str(i)

            (
                best_dev_acc, best_test_acc, best_epoch,
                best_dev_precision, best_test_precision,
                best_dev_recall, best_test_recall,
                best_dev_f1score, best_test_f1score,
                list_dev_acc, list_test_acc,
                list_dev_precision, list_test_precision,
                list_dev_recall, list_test_recall,
                list_dev_f1score, list_test_f1score,
            ) = main(
                n_epochs,
                model_name,
                model_save_flag,
                model_save_location,
                model_load_flag,
                model_load_location,
            )

            # Record this run's per-epoch curves.
            overall_list_dev_acc += [list_dev_acc]
            overall_list_test_acc += [list_test_acc]
            overall_list_dev_precision += [list_dev_precision]
            overall_list_test_precision += [list_test_precision]
            overall_list_dev_recall += [list_dev_recall]
            overall_list_test_recall += [list_test_recall]
            overall_list_dev_f1score += [list_dev_f1score]
            overall_list_test_f1score += [list_test_f1score]

Downloading:   0%|          | 0.00/310 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/650 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

Training epoch: 1
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINSIHED BATCH: 239 of 252
Validation loss per 100 evaluation steps: 0.26903992891311646
DEV ACC: 0.8362544091710759
DEV Precision: 0.6550066335780621
DEV Recall: 0.7937441306488927
DEV F1Score: 0.7045690598072829
BEST ACCURACY -->  DEV: 0.83625
BEST PRECISION -->  DEV: 0.65501
BEST RECALL -->  DEV: 0.79374
BEST F1SCORE -->  DEV: 0.70457
TIME PER EPOCH: 6.191241625944773

Training epoch: 2
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 59 of 252
FINSIHED BATCH: 79 of 252
FINSIHED BATCH: 99 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINSIHED BATCH: 239 of 252
Validation loss per 100 evaluation steps: 0.279311299324

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

Training epoch: 1
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 59 of 252
FINSIHED BATCH: 79 of 252
FINSIHED BATCH: 99 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINSIHED BATCH: 239 of 252
Validation loss per 100 evaluation steps: 0.4269169867038727
DEV ACC: 0.8179012345679012
DEV Precision: 0.6461203699298939
DEV Recall: 0.8712430426716141
DEV F1Score: 0.7183893949533536
BEST ACCURACY -->  DEV: 0.8179
BEST PRECISION -->  DEV: 0.64612
BEST RECALL -->  DEV: 0.87124
BEST F1SCORE -->  DEV: 0.71839
TIME PER EPOCH: 6.444391755263011

Training epoch: 2
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 59 of 252
FINSIHED BATCH: 79 of 252
FINSIHED BATCH: 99 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINSIHE

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

Training epoch: 1
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 59 of 252
FINSIHED BATCH: 79 of 252
FINSIHED BATCH: 99 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINSIHED BATCH: 239 of 252
Validation loss per 100 evaluation steps: 0.3450555205345154
DEV ACC: 0.7935956790123457
DEV Precision: 0.7049508692365835
DEV Recall: 0.6732741244646006
DEV F1Score: 0.660975996063715
BEST ACCURACY -->  DEV: 0.7936
BEST PRECISION -->  DEV: 0.70495
BEST RECALL -->  DEV: 0.67327
BEST F1SCORE -->  DEV: 0.66098
TIME PER EPOCH: 6.472367159525553

Training epoch: 2
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 59 of 252
FINSIHED BATCH: 79 of 252
FINSIHED BATCH: 99 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINSIHED

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

Training epoch: 1
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 59 of 252
FINSIHED BATCH: 79 of 252
FINSIHED BATCH: 99 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINSIHED BATCH: 239 of 252
Validation loss per 100 evaluation steps: 0.48593994975090027
DEV ACC: 0.8235412992357437
DEV Precision: 0.6460168579216198
DEV Recall: 0.8835159989921892
DEV F1Score: 0.7319735814103124
BEST ACCURACY -->  DEV: 0.82354
BEST PRECISION -->  DEV: 0.64602
BEST RECALL -->  DEV: 0.88352
BEST F1SCORE -->  DEV: 0.73197
TIME PER EPOCH: 6.5076332529385885

Training epoch: 2
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 59 of 252
FINSIHED BATCH: 79 of 252
FINSIHED BATCH: 99 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINS

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

Training epoch: 1
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 59 of 252
FINSIHED BATCH: 79 of 252
FINSIHED BATCH: 99 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINSIHED BATCH: 239 of 252
Validation loss per 100 evaluation steps: 0.45236173272132874
DEV ACC: 0.8265542328042328
DEV Precision: 0.7028659611992943
DEV Recall: 0.7011274880322499
DEV F1Score: 0.6831541318936274
BEST ACCURACY -->  DEV: 0.82655
BEST PRECISION -->  DEV: 0.70287
BEST RECALL -->  DEV: 0.70113
BEST F1SCORE -->  DEV: 0.68315
TIME PER EPOCH: 6.542609135309855

Training epoch: 2
FINSIHED BATCH: 19 of 252
FINSIHED BATCH: 39 of 252
FINSIHED BATCH: 59 of 252
FINSIHED BATCH: 79 of 252
FINSIHED BATCH: 99 of 252
FINSIHED BATCH: 119 of 252
FINSIHED BATCH: 139 of 252
FINSIHED BATCH: 159 of 252
FINSIHED BATCH: 179 of 252
FINSIHED BATCH: 199 of 252
FINSIHED BATCH: 219 of 252
FINSI

In [6]:
# best_dev_acc is rebound on every main() call, so this shows the last run only.
print(best_dev_acc)

0.8325617283950617


In [7]:
# Zero-based epoch with the best dev F1 in the last main() call.
print(best_epoch)

2


In [8]:
# Best dev precision of the last main() call only (earlier runs are overwritten).
print(best_dev_precision)

0.7107898715041571


In [9]:
# Best dev recall of the last main() call only (earlier runs are overwritten).
print(best_dev_recall)

0.8879503653313175


In [10]:
# Best dev F1 of the last main() call only (earlier runs are overwritten).
print(best_dev_f1score)

0.7395190177108205


In [11]:
# One inner list per main() call, holding that run's dev accuracy after each epoch.
print(overall_list_dev_acc)

[[0.8362544091710759, 0.8419863315696648, 0.8228615520282188, 0.8249375367430923, 0.8241291887125219, 0.8161008230452675, 0.809836125808348, 0.8139329805996472, 0.8266093474426808, 0.8130327748383305], [0.8179012345679012, 0.8323780129335685, 0.833370076425632, 0.8126286008230453, 0.8194811875367431, 0.8032039976484422, 0.8249375367430923, 0.8106444738389184, 0.8130327748383305, 0.8142085537918871], [0.7935956790123457, 0.8384222516166961, 0.8345825984714873, 0.8383303938859494, 0.8173684597295708, 0.816192680776014, 0.812040711346267, 0.8153292181069959, 0.8158252498530276, 0.8232657260435038], [0.8235412992357437, 0.8240373309817755, 0.8204732510288066, 0.8129409171075838, 0.8264256319811876, 0.8225492357436802, 0.8169091710758377, 0.8044165196942975, 0.817772633744856, 0.8134369488536155], [0.8265542328042328, 0.816413139329806, 0.8325617283950617, 0.8286853321575544, 0.8090645208700764, 0.8111405055849501, 0.8077601410934744, 0.8224573780129336, 0.8239454732510288, 0.80766828336272

In [12]:
# One inner list per main() call, holding that run's dev precision after each epoch.
print(overall_list_dev_precision)

[[0.6550066335780621, 0.6713805594757976, 0.669263496644449, 0.6115734794306219, 0.6872763920382968, 0.6299333823143346, 0.6913706223230033, 0.6501374287088573, 0.6562461788652265, 0.6487368011177536], [0.6461203699298939, 0.6760501843835177, 0.6910098719622527, 0.6550012597631645, 0.6288202097725906, 0.6684246089007996, 0.6462218557456655, 0.6604898188231522, 0.6665658856135045, 0.6641912320483749], [0.7049508692365835, 0.6723356009070297, 0.6786510226986416, 0.6715505394076822, 0.6333762798048512, 0.6789287432144576, 0.6602481733434116, 0.6581884605694128, 0.6929825465539751, 0.6634508789270694], [0.6460168579216198, 0.6499462178033606, 0.6603363567649282, 0.6523861059575347, 0.6602012185345522, 0.6532553425410568, 0.6988284202569918, 0.6198916603678507, 0.6199180008703821, 0.6564877802973041], [0.7028659611992943, 0.6351210517877185, 0.6543020912068529, 0.6585543645067455, 0.6810657596371881, 0.6465075929361644, 0.6749496094734192, 0.7107898715041571, 0.6980954671430862, 0.636769095

In [13]:
# One inner list per main() call, holding that run's dev recall after each epoch.
print(overall_list_dev_recall)

[[0.7937441306488927, 0.7973653771272818, 0.7202569916855631, 0.9040678897821754, 0.7096858608763369, 0.7241920337158432, 0.7037586751872466, 0.6763227513227511, 0.7245711079044413, 0.6605568153187199], [0.8712430426716141, 0.7953136810279665, 0.7419438145628622, 0.6944129503653312, 0.7664147140337617, 0.571261939119082, 0.7449609473418997, 0.6245315971506448, 0.665784832451499, 0.6733227971323209], [0.6732741244646006, 0.7819035021415972, 0.7715087381754047, 0.7545099521289996, 0.7267928720309674, 0.4988936988936989, 0.5680146132527084, 0.6314562862181908, 0.6418018048970431, 0.7262093726379443], [0.8835159989921892, 0.699853409377219, 0.6365772234819853, 0.723834719072814, 0.8065982958840098, 0.6611380242332622, 0.5647581254724112, 0.6773540392588012, 0.7531683043587805, 0.6011463844797178], [0.7011274880322499, 0.6643573357859072, 0.8879503653313175, 0.818644494834971, 0.5587553539934491, 0.6476803179184132, 0.6183925422020661, 0.6063240110859159, 0.6664525069286973, 0.6632871537633

In [14]:
# One inner list per main() call, holding that run's dev F1 after each epoch.
print(overall_list_dev_f1score)

[[0.7045690598072829, 0.7092142114551074, 0.6763709606036008, 0.7106997022012502, 0.6754412199864062, 0.6568284709704434, 0.679580340294626, 0.6367987495438474, 0.6689146857914364, 0.637389809588689], [0.7183893949533536, 0.7126474148014569, 0.6911315002425127, 0.6514433434601501, 0.6743677136607992, 0.5953955356321566, 0.6709982028498908, 0.6221497752715502, 0.6422845318282441, 0.6394192227525561], [0.660975996063715, 0.6997197931777023, 0.701661609722202, 0.6906768824025198, 0.6562026945485592, 0.5465307075951333, 0.5817313224175968, 0.6173157504389996, 0.6331585614793633, 0.6721388459310631], [0.7319735814103124, 0.6532122757907807, 0.6235266520354239, 0.6535060874770443, 0.70618066158019, 0.6309803167167164, 0.6002278585611917, 0.6202103244820333, 0.6595784493543598, 0.604284604284604], [0.6831541318936274, 0.6273651960711681, 0.7395190177108205, 0.7117404071185581, 0.5934366235486683, 0.6286320822035107, 0.6190194749091993, 0.6202711495318011, 0.6567850579755339, 0.630446757827710

In [None]:
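# Best model is model 4, i.e. run index i = 4 (the fifth run): it reached the highest dev F1 of the five runs (about 0.7395).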