In [1]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from load_data import initialize_data
from reading_datasets import read_task5
from labels_to_ids import task5_labels_to_ids
import time
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
def train(epoch, train_loader, model, optimizer, device, grad_step = 1, max_grad_norm = 10):
    tr_loss, tr_accuracy = 0, 0
    tr_precision, tr_recall = 0, 0
    tr_f1score = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    optimizer.zero_grad()
    
    for idx, batch in enumerate(train_loader):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        if (idx + 1) % 20 == 0:
            print('FINSIHED BATCH:', idx, 'of', len(train_loader))

        #loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
        output = model(input_ids=ids, attention_mask=mask, labels=labels)
        tr_loss += output[0]

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = output[1].view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
        
        # Compute Precision
        tmp_tr_precision = precision_score(labels.cpu().numpy(), predictions.cpu().numpy(), labels=[0,1,2], average = None, zero_division=0 )[2]
        tr_precision += tmp_tr_precision
        
        # Compute Recall
        tmp_tr_recall = recall_score(labels.cpu().numpy(), predictions.cpu().numpy(), labels=[0,1,2], average = None, zero_division=0 )[2]
        tr_recall += tmp_tr_recall
        
        # Compute f1score
        tmp_tr_f1score = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(),labels=[0,1,2], average=None, zero_division=0)[2]
        tr_f1score += tmp_tr_f1score
    

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )
        
        # backward pass
        output['loss'].backward()
        if (idx + 1) % grad_step == 0:
            optimizer.step()
            optimizer.zero_grad()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    tr_precision = tr_precision / nb_tr_steps
    tr_recall = tr_recall / nb_tr_steps
    tr_f1score= tr_f1score / nb_tr_steps
    #print(f"Training loss epoch: {epoch_loss}")
    #print(f"Training accuracy epoch: {tr_accuracy}")

    return model

In [3]:
def testing(model, testing_loader, labels_to_ids, device):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    eval_precision, eval_recall = 0, 0
    eval_f1score = 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
     
    
    ids_to_labels = dict((v,k) for k,v in labels_to_ids.items())

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            #loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
            output = model(input_ids=ids, attention_mask=mask, labels=labels)

            eval_loss += output['loss'].item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = output[1].view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
            
            # Compute Precision
            tmp_eval_precision = precision_score(labels.cpu().numpy(), predictions.cpu().numpy(), labels=[0,1,2], average = None, zero_division=0)[2]
            eval_precision += tmp_eval_precision
            
            # Compute Recall
            tmp_eval_recall = recall_score(labels.cpu().numpy(), predictions.cpu().numpy(), labels=[0,1,2], average = None, zero_division=0)[2]
            eval_recall += tmp_eval_recall
            
            # Compute f1score
            tmp_eval_f1score = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(),labels=[0,1,2], average=None, zero_division=0)[2]
            eval_f1score += tmp_eval_f1score

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    eval_precision = eval_precision / nb_eval_steps
    eval_recall = eval_recall / nb_eval_steps
    eval_f1score = eval_f1score / nb_eval_steps
    #print(f"Validation Loss: {eval_loss}")
    #print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions, eval_accuracy, eval_precision, eval_recall, eval_f1score

In [4]:
def main(n_epochs, model_name, model_save_flag, model_save_location, model_load_flag, model_load_location):
    #Initialization training parameters
    max_len = 256
    batch_size = 32
    grad_step = 1
    learning_rate = 1e-05
    initialization_input = (max_len, batch_size)

    #Reading datasets and initializing data loaders
    dataset_location = '../2022.07.07_task5/'

    train_data = read_task5(dataset_location , split = 'train_over_under_sample')
    dev_data = read_task5(dataset_location , split = 'dev')
    #test_data = read_task5(dataset_location , split = 'dev')#load test set
    labels_to_ids = task5_labels_to_ids
    #input_data = (train_data, dev_data, labels_to_ids)

    #Define tokenizer, model and optimizer
    device = 'cuda' if cuda.is_available() else 'cpu' #save the processing time
    if model_load_flag:
        tokenizer = AutoTokenizer.from_pretrained(model_load_location)
        model = AutoModelForSequenceClassification.from_pretrained(model_load_location)
    else: 
        tokenizer =  AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels_to_ids))
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    model.to(device)

    #Get dataloaders
    train_loader = initialize_data(tokenizer, initialization_input, train_data, labels_to_ids, shuffle = True)
    dev_loader = initialize_data(tokenizer, initialization_input, dev_data, labels_to_ids, shuffle = True)
    #test_loader = initialize_data(tokenizer, initialization_input, test_data, labels_to_ids, shuffle = True)#create test loader

    best_dev_acc = 0
    best_test_acc = 0
    best_dev_precision = 0
    best_test_precision = 0
    best_dev_recall = 0
    best_test_recall = 0
    best_dev_f1score = 0
    best_test_f1score = 0
    best_epoch = -1
    
    list_dev_acc = [] 
    list_test_acc = []  
    list_dev_precision = []  
    list_test_precision  = []  
    list_dev_recall = []  
    list_test_recall = []  
    list_dev_f1score = []  
    list_test_f1score = []
    
    for epoch in range(n_epochs):
        start = time.time()
        print(f"Training epoch: {epoch + 1}")

        #train model
        if not model_load_flag:
            model = train(epoch, train_loader, model, optimizer, device, grad_step)
        
        #testing and logging
        labels_dev, predictions_dev, dev_accuracy, dev_precision, dev_recall, dev_f1score = testing(model, dev_loader, labels_to_ids, device)
        print('DEV ACC:', dev_accuracy)
        print('DEV Precision:' , dev_precision)
        print('DEV Recall:' , dev_recall)
        print('DEV F1Score:' , dev_f1score)
        
        list_dev_acc.append(dev_accuracy)     
        list_dev_precision.append(dev_precision)   
        list_dev_recall.append(dev_recall)  
        list_dev_f1score.append(dev_f1score)  
        
        
        #labels_test, predictions_test, test_accuracy, test_precision, test_recall, test_f1score = testing(model, test_loader, labels_to_ids, device)
        #print('TEST ACC:', test_accuracy)
        #print('TEST Precision:' , test_precision)
        #print('TEST Recall:' , test_recall)
        #print('TEST F1Score:' , test_f1score)
        
        #list_test_acc.append(test_accuracy) 
        #list_test_precision.append(test_precision)  
        #list_test_recall.append(test_recall)
        #list_test_f1score.append(test_f1score) 

        #saving model
        if dev_accuracy > best_dev_acc:
            best_dev_acc = dev_accuracy
            #best_test_acc = test_accuracy
        if dev_precision > best_dev_precision:
            best_dev_precision = dev_precision
            #best_test_precision = test_precision
        if dev_recall > best_dev_recall:
            best_dev_recall = dev_recall
            #best_test_recall = test_recall
        if dev_f1score > best_dev_f1score:
            best_dev_f1score = dev_f1score
            #best_test_f1score = test_f1score
            best_epoch = epoch
            
            if model_save_flag:
                os.makedirs(model_save_location, exist_ok=True)
                tokenizer.save_pretrained(model_save_location)
                model.save_pretrained(model_save_location)

        now = time.time()
        print('BEST ACCURACY --> ', 'DEV:', round(best_dev_acc, 5))
        print('BEST PRECISION --> ', 'DEV:', round(best_dev_precision, 5))
        print('BEST RECALL --> ', 'DEV:', round(best_dev_recall, 5))
        print('BEST F1SCORE --> ', 'DEV:', round(best_dev_f1score, 5))
        print('TIME PER EPOCH:', (now-start)/60 )
        print()

    return best_dev_acc, best_test_acc, best_epoch, best_dev_precision, best_test_precision, best_dev_recall, best_test_recall, best_dev_f1score, best_test_f1score, list_dev_acc, list_test_acc, list_dev_precision, list_test_precision, list_dev_recall, list_test_recall, list_dev_f1score, list_test_f1score

In [5]:
if __name__ == '__main__':
    n_epochs = 5
    models = ['bert-base-multilingual-uncased']
    
    #model saving parameters
    model_save_flag = True
    model_load_flag = False
    
    overall_list_dev_acc = [] 
    overall_list_test_acc = []    
    overall_list_dev_precision = []  
    overall_list_test_precision  = []  
    overall_list_dev_recall = []  
    overall_list_test_recall = []  
    overall_list_dev_f1score = []  
    overall_list_test_f1score = [] 
    
    for i in range(5):
        
        for model_name in models:

            model_save_location = 'saved_models/' + model_name + '_overandundersampled' + str(i)
            model_load_location = None

            best_dev_acc, best_test_acc, best_epoch, best_dev_precision, best_test_precision, best_dev_recall, best_test_recall, best_dev_f1score, best_test_f1score, list_dev_acc, list_test_acc, list_dev_precision, list_test_precision, list_dev_recall, list_test_recall, list_dev_f1score, list_test_f1score = main(n_epochs, model_name, model_save_flag, model_save_location, model_load_flag, model_load_location)
            
            overall_list_dev_acc.append(list_dev_acc) 
            overall_list_test_acc.append(list_test_acc) 
            overall_list_dev_precision.append(list_dev_precision)  
            overall_list_test_precision.append(list_test_precision) 
            overall_list_dev_recall.append(list_dev_recall)  
            overall_list_test_recall.append(list_test_recall)  
            overall_list_dev_f1score.append(list_dev_f1score)  
            overall_list_test_f1score.append(list_test_f1score) 

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Training epoch: 1
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSIHED BATCH: 159 of 330
FINSIHED BATCH: 179 of 330
FINSIHED BATCH: 199 of 330
FINSIHED BATCH: 219 of 330
FINSIHED BATCH: 239 of 330
FINSIHED BATCH: 259 of 330
FINSIHED BATCH: 279 of 330
FINSIHED BATCH: 299 of 330
FINSIHED BATCH: 319 of 330
Validation loss per 100 evaluation steps: 0.37108945846557617
DEV ACC: 0.8089726631393298
DEV Precision: 0.5798154226725657
DEV Recall: 0.9638321995464852
DEV F1Score: 0.7078225326585463
BEST ACCURACY -->  DEV: 0.80897
BEST PRECISION -->  DEV: 0.57982
BEST RECALL -->  DEV: 0.96383
BEST F1SCORE -->  DEV: 0.70782
TIME PER EPOCH: 7.961158533891042

Training epoch: 2
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSI

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Training epoch: 1
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSIHED BATCH: 159 of 330
FINSIHED BATCH: 179 of 330
FINSIHED BATCH: 199 of 330
FINSIHED BATCH: 219 of 330
FINSIHED BATCH: 239 of 330
FINSIHED BATCH: 259 of 330
FINSIHED BATCH: 279 of 330
FINSIHED BATCH: 299 of 330
FINSIHED BATCH: 319 of 330
Validation loss per 100 evaluation steps: 0.31211376190185547
DEV ACC: 0.8295855379188712
DEV Precision: 0.6289763587382635
DEV Recall: 0.8719261778785586
DEV F1Score: 0.7166434224557472
BEST ACCURACY -->  DEV: 0.82959
BEST PRECISION -->  DEV: 0.62898
BEST RECALL -->  DEV: 0.87193
BEST F1SCORE -->  DEV: 0.71664
TIME PER EPOCH: 8.172454524040223

Training epoch: 2
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSI

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Training epoch: 1
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSIHED BATCH: 159 of 330
FINSIHED BATCH: 179 of 330
FINSIHED BATCH: 199 of 330
FINSIHED BATCH: 219 of 330
FINSIHED BATCH: 239 of 330
FINSIHED BATCH: 259 of 330
FINSIHED BATCH: 279 of 330
FINSIHED BATCH: 299 of 330
FINSIHED BATCH: 319 of 330
Validation loss per 100 evaluation steps: 0.5114367008209229
DEV ACC: 0.8083847736625515
DEV Precision: 0.5752732717018433
DEV Recall: 0.946195515243134
DEV F1Score: 0.6985515198376313
BEST ACCURACY -->  DEV: 0.80838
BEST PRECISION -->  DEV: 0.57527
BEST RECALL -->  DEV: 0.9462
BEST F1SCORE -->  DEV: 0.69855
TIME PER EPOCH: 8.111426893870036

Training epoch: 2
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSIHED

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Training epoch: 1
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSIHED BATCH: 159 of 330
FINSIHED BATCH: 179 of 330
FINSIHED BATCH: 199 of 330
FINSIHED BATCH: 219 of 330
FINSIHED BATCH: 239 of 330
FINSIHED BATCH: 259 of 330
FINSIHED BATCH: 279 of 330
FINSIHED BATCH: 299 of 330
FINSIHED BATCH: 319 of 330
Validation loss per 100 evaluation steps: 0.29823553562164307
DEV ACC: 0.8255621693121693
DEV Precision: 0.6568789675932534
DEV Recall: 0.9113252708490805
DEV F1Score: 0.7525982230476306
BEST ACCURACY -->  DEV: 0.82556
BEST PRECISION -->  DEV: 0.65688
BEST RECALL -->  DEV: 0.91133
BEST F1SCORE -->  DEV: 0.7526
TIME PER EPOCH: 8.161926356951396

Training epoch: 2
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSIH

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Training epoch: 1
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSIHED BATCH: 159 of 330
FINSIHED BATCH: 179 of 330
FINSIHED BATCH: 199 of 330
FINSIHED BATCH: 219 of 330
FINSIHED BATCH: 239 of 330
FINSIHED BATCH: 259 of 330
FINSIHED BATCH: 279 of 330
FINSIHED BATCH: 299 of 330
FINSIHED BATCH: 319 of 330
Validation loss per 100 evaluation steps: 0.22063033282756805
DEV ACC: 0.816192680776014
DEV Precision: 0.626502025311549
DEV Recall: 0.9064539957397098
DEV F1Score: 0.7246383319430383
BEST ACCURACY -->  DEV: 0.81619
BEST PRECISION -->  DEV: 0.6265
BEST RECALL -->  DEV: 0.90645
BEST F1SCORE -->  DEV: 0.72464
TIME PER EPOCH: 8.145313473542531

Training epoch: 2
FINSIHED BATCH: 19 of 330
FINSIHED BATCH: 39 of 330
FINSIHED BATCH: 59 of 330
FINSIHED BATCH: 79 of 330
FINSIHED BATCH: 99 of 330
FINSIHED BATCH: 119 of 330
FINSIHED BATCH: 139 of 330
FINSIHED

In [6]:
print(best_dev_acc)

0.843290711346267


In [7]:
print(best_epoch)

1


In [8]:
print(best_dev_precision)

0.6730410682791634


In [9]:
print(best_dev_recall)

0.9064539957397098


In [10]:
print(best_dev_f1score)

0.730632470957548


In [11]:
print(overall_list_dev_acc)

[[0.8089726631393298, 0.8356297766019988, 0.8317901234567902, 0.8019363609641388, 0.8205651087595532], [0.8295855379188712, 0.8236331569664902, 0.8254335684891241, 0.8196649029982362, 0.8165417401528512], [0.8083847736625515, 0.8209692827748384, 0.8153843327454439, 0.814024838330394, 0.8114528218694885], [0.8255621693121693, 0.8329659024103468, 0.8105526161081716, 0.8083847736625515, 0.8030202821869489], [0.816192680776014, 0.8352623456790124, 0.843290711346267, 0.8214653145208701, 0.8200690770135215]]


In [12]:
print(overall_list_dev_precision)

[[0.5798154226725657, 0.6623565587851303, 0.6708655982465506, 0.5938689176784414, 0.6554232804232806], [0.6289763587382635, 0.5931406776644872, 0.5857850703088799, 0.6620181405895694, 0.6065807208664349], [0.5752732717018433, 0.6564471243042672, 0.6639266817838249, 0.6148393053154958, 0.5600835231787613], [0.6568789675932534, 0.6301450313355079, 0.6336952289333242, 0.582826564969422, 0.616086691086691], [0.626502025311549, 0.6631824965158301, 0.6730410682791634, 0.6465270620032523, 0.6436359055406675]]


In [13]:
print(overall_list_dev_recall)

[[0.9638321995464852, 0.8939393939393938, 0.8214909869671772, 0.8954270597127738, 0.7783383723859915], [0.8719261778785586, 0.9005165028974553, 0.9219004099956477, 0.6541131267321745, 0.8471071256785542], [0.946195515243134, 0.6114037197370532, 0.6390652557319224, 0.8063932980599647, 0.8988404452690166], [0.9113252708490805, 0.949590004351909, 0.6754409171075839, 0.8317889782175496, 0.6586419753086419], [0.9064539957397098, 0.8609599395313678, 0.7375230149039672, 0.6335411942554798, 0.6847225085320323]]


In [14]:
print(overall_list_dev_f1score)

[[0.7078225326585463, 0.743264721753623, 0.7187334681854225, 0.6951653488178624, 0.6931225716202863], [0.7166434224557472, 0.698215207374134, 0.7017146413357528, 0.6389405904812067, 0.6860749201601332], [0.6985515198376313, 0.6098171195030992, 0.63101202667029, 0.6793182888648535, 0.6785752485811455], [0.7525982230476306, 0.7411589145973057, 0.6345557859452889, 0.671566836823649, 0.6138197853331273], [0.7246383319430383, 0.730632470957548, 0.6786252395149637, 0.6240321773824649, 0.6370042297102594]]


In [15]:
#Best model is model 3