In [1]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from load_data import initialize_data
from reading_datasets import read_task
from labels_to_ids import labels_to_ids_mal
import time
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
def train(epoch, train_loader, model, optimizer, device, grad_step = 1, max_grad_norm = 10):
    """Run one training epoch over ``train_loader`` and return the updated model.

    No training metrics are computed here; the function only updates the
    weights. Use ``testing`` to measure a model.

    Args:
        epoch: Index of the current epoch. Unused; kept so existing callers
            keep working.
        train_loader: Iterable of batches. Each batch is indexed with the keys
            'input_ids', 'attention_mask' and 'labels', whose values are tensors.
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes the loss under the key 'loss'.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batch tensors are moved to before the forward pass.
        grad_step: Number of batches whose gradients are accumulated before
            each optimizer step.
        max_grad_norm: Upper bound on the total gradient norm, enforced right
            before every optimizer step.

    Returns:
        The same ``model`` object, updated in place.
    """
    # put model in training mode
    model.train()
    optimizer.zero_grad()

    def apply_update():
        # Clip AFTER backward() so the gradients that are about to be applied
        # are the ones being clipped; clipping before backward() only touches
        # stale (or missing) gradients and leaves the real update unbounded.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )
        optimizer.step()
        optimizer.zero_grad()

    # batches back-propagated since the last optimizer step
    pending_batches = 0

    for idx, batch in enumerate(train_loader):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        if (idx + 1) % 20 == 0:
            print('FINSIHED BATCH:', idx, 'of', len(train_loader))

        output = model(input_ids=ids, attention_mask=mask, labels=labels)

        # backward pass; gradients keep accumulating in .grad until the next step
        output['loss'].backward()
        pending_batches += 1

        if (idx + 1) % grad_step == 0:
            apply_update()
            pending_batches = 0

    # The loader length need not be a multiple of grad_step: apply the
    # gradients of the trailing partial window instead of discarding them.
    if pending_batches:
        apply_update()

    return model

In [3]:
def testing(model, testing_loader, labels_to_ids, device):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    eval_precision, eval_recall = 0, 0
    eval_f1score = 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
     
    
    ids_to_labels = dict((v,k) for k,v in labels_to_ids.items())

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            #loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
            output = model(input_ids=ids, attention_mask=mask, labels=labels)

            eval_loss += output['loss'].item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = output[1].view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
            
            # Compute Precision
            tmp_eval_precision = precision_score(labels.cpu().numpy(), predictions.cpu().numpy(), average = 'macro', zero_division=0)
            eval_precision += tmp_eval_precision
            
            # Compute Recall
            tmp_eval_recall = recall_score(labels.cpu().numpy(), predictions.cpu().numpy(), average = 'macro', zero_division=0)
            eval_recall += tmp_eval_recall
            
            # Compute f1score
            tmp_eval_f1score = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='macro', zero_division=0)
            eval_f1score += tmp_eval_f1score

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    eval_precision = eval_precision / nb_eval_steps
    eval_recall = eval_recall / nb_eval_steps
    eval_f1score = eval_f1score / nb_eval_steps
    #print(f"Validation Loss: {eval_loss}")
    #print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions, eval_accuracy, eval_precision, eval_recall, eval_f1score

In [4]:
def main(n_epochs, model_name, model_save_flag, model_save_location, model_load_flag, model_load_location,
         max_len = 256, batch_size = 32, grad_step = 1, learning_rate = 1e-05,
         dataset_location = '../datasets/task_a/'):
    """Fine-tune (or just evaluate) a sequence classifier and track dev metrics per epoch.

    Each epoch trains the model once (skipped when ``model_load_flag`` is set)
    and evaluates it on the dev split. Whenever the dev F1 score improves, the
    epoch index is recorded and, if ``model_save_flag`` is set, the tokenizer
    and model are written to ``model_save_location``.

    Test-set evaluation is not wired in: every ``best_test_*`` value stays 0
    and every ``list_test_*`` stays empty. They are still returned so the
    shape of the result does not change for callers.

    Args:
        n_epochs: Number of train/evaluate rounds.
        model_name: Pretrained checkpoint name used when not loading a saved model.
        model_save_flag: Save tokenizer and model whenever the dev F1 improves.
        model_save_location: Directory the tokenizer and model are saved to.
        model_load_flag: Load tokenizer and model from ``model_load_location``
            and skip training.
        model_load_location: Directory of a previously saved tokenizer/model.
        max_len: First element of the tuple handed to ``initialize_data``.
        batch_size: Second element of the tuple handed to ``initialize_data``.
        grad_step: Gradient-accumulation steps forwarded to ``train``.
        learning_rate: Learning rate of the Adam optimizer.
        dataset_location: Directory passed to ``read_task``.

    Returns:
        A 17-tuple: ``best_dev_acc, best_test_acc, best_epoch,
        best_dev_precision, best_test_precision, best_dev_recall,
        best_test_recall, best_dev_f1score, best_test_f1score, list_dev_acc,
        list_test_acc, list_dev_precision, list_test_precision,
        list_dev_recall, list_test_recall, list_dev_f1score,
        list_test_f1score``. ``best_epoch`` is the 0-based epoch with the
        highest dev F1 (-1 if it never rose above 0); each best_dev_* value is
        the maximum of its own metric and may come from a different epoch.
    """
    initialization_input = (max_len, batch_size)

    #Reading datasets
    train_data = read_task(dataset_location , split = 'mal_train_negative_augmented')
    dev_data = read_task(dataset_location , split = 'mal_sentiment_dev')
    labels_to_ids = labels_to_ids_mal

    #Define tokenizer, model and optimizer
    device = 'cuda' if cuda.is_available() else 'cpu'
    if model_load_flag:
        tokenizer = AutoTokenizer.from_pretrained(model_load_location)
        model = AutoModelForSequenceClassification.from_pretrained(model_load_location)
    else:
        tokenizer =  AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels_to_ids))
    # Move the model first so the optimizer is built over the on-device parameters.
    model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    #Get dataloaders
    train_loader = initialize_data(tokenizer, initialization_input, train_data, labels_to_ids, shuffle = True)
    # Dev metrics are averaged over batches, so a fixed order keeps them
    # reproducible for a given model; evaluation gains nothing from shuffling.
    dev_loader = initialize_data(tokenizer, initialization_input, dev_data, labels_to_ids, shuffle = False)

    best_dev_acc = 0
    best_test_acc = 0
    best_dev_precision = 0
    best_test_precision = 0
    best_dev_recall = 0
    best_test_recall = 0
    best_dev_f1score = 0
    best_test_f1score = 0
    best_epoch = -1

    list_dev_acc = []
    list_test_acc = []
    list_dev_precision = []
    list_test_precision  = []
    list_dev_recall = []
    list_test_recall = []
    list_dev_f1score = []
    list_test_f1score = []

    for epoch in range(n_epochs):
        start = time.time()
        print(f"Training epoch: {epoch + 1}")

        #train model
        if not model_load_flag:
            model = train(epoch, train_loader, model, optimizer, device, grad_step)

        #testing and logging
        _, _, dev_accuracy, dev_precision, dev_recall, dev_f1score = testing(model, dev_loader, labels_to_ids, device)
        print('DEV ACC:', dev_accuracy)
        print('DEV Precision:' , dev_precision)
        print('DEV Recall:' , dev_recall)
        print('DEV F1Score:' , dev_f1score)

        list_dev_acc.append(dev_accuracy)
        list_dev_precision.append(dev_precision)
        list_dev_recall.append(dev_recall)
        list_dev_f1score.append(dev_f1score)

        # running maxima, tracked independently per metric
        if dev_accuracy > best_dev_acc:
            best_dev_acc = dev_accuracy
        if dev_precision > best_dev_precision:
            best_dev_precision = dev_precision
        if dev_recall > best_dev_recall:
            best_dev_recall = dev_recall
        if dev_f1score > best_dev_f1score:
            # F1 is the model-selection criterion: it alone drives best_epoch and saving
            best_dev_f1score = dev_f1score
            best_epoch = epoch

            if model_save_flag:
                os.makedirs(model_save_location, exist_ok=True)
                tokenizer.save_pretrained(model_save_location)
                model.save_pretrained(model_save_location)

        now = time.time()
        print('BEST ACCURACY --> ', 'DEV:', round(best_dev_acc, 5))
        print('BEST PRECISION --> ', 'DEV:', round(best_dev_precision, 5))
        print('BEST RECALL --> ', 'DEV:', round(best_dev_recall, 5))
        print('BEST F1SCORE --> ', 'DEV:', round(best_dev_f1score, 5))
        print('TIME PER EPOCH:', (now-start)/60 )
        print()

    return best_dev_acc, best_test_acc, best_epoch, best_dev_precision, best_test_precision, best_dev_recall, best_test_recall, best_dev_f1score, best_test_f1score, list_dev_acc, list_test_acc, list_dev_precision, list_test_precision, list_dev_recall, list_test_recall, list_dev_f1score, list_test_f1score

In [5]:
if __name__ == '__main__':
    # Experiment setup: every model in `models` is fine-tuned in five independent runs.
    n_epochs = 10
    models = ['bert-base-uncased']

    # Save the best checkpoint of each run; never start from a saved model.
    model_save_flag, model_load_flag = True, False

    # Per-epoch metric histories, one inner list per main() call.
    overall_list_dev_acc, overall_list_test_acc = [], []
    overall_list_dev_precision, overall_list_test_precision = [], []
    overall_list_dev_recall, overall_list_test_recall = [], []
    overall_list_dev_f1score, overall_list_test_f1score = [], []

    for i in range(5):
        for model_name in models:
            # The run index is the suffix of the save directory.
            model_save_location = ''.join(['saved_models/', model_name, 'Malayalam', 'Augmented', str(i)])
            model_load_location = None

            # These names are rebound on every call, so after the loops they
            # describe the last run only.
            (
                best_dev_acc, best_test_acc, best_epoch,
                best_dev_precision, best_test_precision,
                best_dev_recall, best_test_recall,
                best_dev_f1score, best_test_f1score,
                list_dev_acc, list_test_acc,
                list_dev_precision, list_test_precision,
                list_dev_recall, list_test_recall,
                list_dev_f1score, list_test_f1score,
            ) = main(
                n_epochs,
                model_name,
                model_save_flag,
                model_save_location,
                model_load_flag,
                model_load_location,
            )

            overall_list_dev_acc.append(list_dev_acc)
            overall_list_test_acc.append(list_test_acc)
            overall_list_dev_precision.append(list_dev_precision)
            overall_list_test_precision.append(list_test_precision)
            overall_list_dev_recall.append(list_dev_recall)
            overall_list_test_recall.append(list_test_recall)
            overall_list_dev_f1score.append(list_dev_f1score)
            overall_list_test_f1score.append(list_test_f1score)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Training epoch: 1
FINSIHED BATCH: 19 of 632
FINSIHED BATCH: 39 of 632
FINSIHED BATCH: 59 of 632
FINSIHED BATCH: 79 of 632
FINSIHED BATCH: 99 of 632
FINSIHED BATCH: 119 of 632
FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 1.0303698778152466
DEV ACC: 0.5094866071428571
DEV Precision: 0.47097297929952886
DEV Recall: 0.471039098304

FINSIHED BATCH: 19 of 632
FINSIHED BATCH: 39 of 632
FINSIHED BATCH: 59 of 632
FINSIHED BATCH: 79 of 632
FINSIHED BATCH: 99 of 632
FINSIHED BATCH: 119 of 632
FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 1.3831042051315308
DEV ACC: 0.576078869047619
DEV Precision: 0.5574214050575449
DEV Recall: 0.5394781790346184
DEV F1Score: 0.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Training epoch: 1
FINSIHED BATCH: 19 of 632
FINSIHED BATCH: 39 of 632
FINSIHED BATCH: 59 of 632
FINSIHED BATCH: 79 of 632
FINSIHED BATCH: 99 of 632
FINSIHED BATCH: 119 of 632
FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 0.9408068060874939
DEV ACC: 0.5052083333333334
DEV Precision: 0.45884069775852304
DEV Recall: 0.471033543255

FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 1.1688603162765503
DEV ACC: 0.5987723214285714
DEV Precision: 0.5651273636294782
DEV Recall: 0.5742557628191749
DEV F1Score: 0.5412793583870271
BEST ACCURACY -->  DEV: 0.6064
BEST PRECISION -->  DEV: 0.60275
BEST RECALL -->  DEV: 0.58499
BEST F1SCORE -->  DEV: 0.55849
TIME PER EPOCH

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Training epoch: 1
FINSIHED BATCH: 19 of 632
FINSIHED BATCH: 39 of 632
FINSIHED BATCH: 59 of 632
FINSIHED BATCH: 79 of 632
FINSIHED BATCH: 99 of 632
FINSIHED BATCH: 119 of 632
FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 1.3317477703094482
DEV ACC: 0.5316220238095238
DEV Precision: 0.4562086304129941
DEV Recall: 0.4768907239929

FINSIHED BATCH: 19 of 632
FINSIHED BATCH: 39 of 632
FINSIHED BATCH: 59 of 632
FINSIHED BATCH: 79 of 632
FINSIHED BATCH: 99 of 632
FINSIHED BATCH: 119 of 632
FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 1.3914364576339722
DEV ACC: 0.5922619047619048
DEV Precision: 0.58859854970543
DEV Recall: 0.5916337679448235
DEV F1Score: 0.5

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Training epoch: 1
FINSIHED BATCH: 19 of 632
FINSIHED BATCH: 39 of 632
FINSIHED BATCH: 59 of 632
FINSIHED BATCH: 79 of 632
FINSIHED BATCH: 99 of 632
FINSIHED BATCH: 119 of 632
FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 1.031965732574463
DEV ACC: 0.5176711309523809
DEV Precision: 0.4391214532141528
DEV Recall: 0.46343358314295

FINSIHED BATCH: 19 of 632
FINSIHED BATCH: 39 of 632
FINSIHED BATCH: 59 of 632
FINSIHED BATCH: 79 of 632
FINSIHED BATCH: 99 of 632
FINSIHED BATCH: 119 of 632
FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 1.3472529649734497
DEV ACC: 0.5652901785714286
DEV Precision: 0.5750268146302812
DEV Recall: 0.5560978089670111
DEV F1Score: 0

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Training epoch: 1
FINSIHED BATCH: 19 of 632
FINSIHED BATCH: 39 of 632
FINSIHED BATCH: 59 of 632
FINSIHED BATCH: 79 of 632
FINSIHED BATCH: 99 of 632
FINSIHED BATCH: 119 of 632
FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 1.1390210390090942
DEV ACC: 0.5518973214285714
DEV Precision: 0.4413929717428931
DEV Recall: 0.4746231774323

FINSIHED BATCH: 19 of 632
FINSIHED BATCH: 39 of 632
FINSIHED BATCH: 59 of 632
FINSIHED BATCH: 79 of 632
FINSIHED BATCH: 99 of 632
FINSIHED BATCH: 119 of 632
FINSIHED BATCH: 139 of 632
FINSIHED BATCH: 159 of 632
FINSIHED BATCH: 179 of 632
FINSIHED BATCH: 199 of 632
FINSIHED BATCH: 219 of 632
FINSIHED BATCH: 239 of 632
FINSIHED BATCH: 259 of 632
FINSIHED BATCH: 279 of 632
FINSIHED BATCH: 299 of 632
FINSIHED BATCH: 319 of 632
FINSIHED BATCH: 339 of 632
FINSIHED BATCH: 359 of 632
FINSIHED BATCH: 379 of 632
FINSIHED BATCH: 399 of 632
FINSIHED BATCH: 419 of 632
FINSIHED BATCH: 439 of 632
FINSIHED BATCH: 459 of 632
FINSIHED BATCH: 479 of 632
FINSIHED BATCH: 499 of 632
FINSIHED BATCH: 519 of 632
FINSIHED BATCH: 539 of 632
FINSIHED BATCH: 559 of 632
FINSIHED BATCH: 579 of 632
FINSIHED BATCH: 599 of 632
FINSIHED BATCH: 619 of 632
Validation loss per 100 evaluation steps: 1.1835293769836426
DEV ACC: 0.5784970238095238
DEV Precision: 0.5831182408860979
DEV Recall: 0.5820014219495919
DEV F1Score: 0

In [6]:
# Highest dev accuracy of the most recent main() call only (the last of the five runs), not the best across runs.
print(best_dev_acc)

0.5948660714285714


In [7]:
# 0-based epoch with the highest dev F1 in the most recent main() call (the last of the five runs).
print(best_epoch)

7


In [8]:
# Highest dev precision of the most recent main() call only; it may come from a different epoch than best_epoch.
print(best_dev_precision)

0.5831182408860979


In [9]:
# Highest dev recall of the most recent main() call only; it may come from a different epoch than best_epoch.
print(best_dev_recall)

0.5820014219495919


In [10]:
# Highest dev F1 of the most recent main() call only; this is the value reached at best_epoch.
print(best_dev_f1score)

0.5402929095408041


In [11]:
# One inner list per main() call, in run order; each inner list holds the dev accuracy after every epoch.
print(overall_list_dev_acc)

[[0.5094866071428571, 0.5814732142857143, 0.5915178571428571, 0.5835193452380952, 0.5818452380952381, 0.5855654761904762, 0.5773809523809524, 0.576078869047619, 0.587797619047619, 0.5792410714285714], [0.5052083333333334, 0.5461309523809523, 0.5740327380952381, 0.5479910714285714, 0.6063988095238095, 0.5987723214285714, 0.5933779761904762, 0.5987723214285714, 0.587983630952381, 0.5915178571428571], [0.5316220238095238, 0.5600818452380952, 0.5809151785714286, 0.5824032738095238, 0.5859375, 0.5935639880952381, 0.5967261904761905, 0.5922619047619048, 0.5900297619047619, 0.5837053571428571], [0.5176711309523809, 0.5571056547619048, 0.570498511904762, 0.5892857142857143, 0.5881696428571429, 0.5883556547619048, 0.5890997023809524, 0.5652901785714286, 0.5987723214285714, 0.5965401785714286], [0.5518973214285714, 0.5593377976190477, 0.5520833333333334, 0.5688244047619048, 0.5948660714285714, 0.5738467261904762, 0.5716145833333333, 0.5784970238095238, 0.5928199404761905, 0.5825892857142857]]


In [12]:
# One inner list per main() call, in run order; each inner list holds the dev precision after every epoch.
print(overall_list_dev_precision)

[[0.47097297929952886, 0.5261269501973282, 0.5484330426517924, 0.5804342817400855, 0.6071569470041523, 0.5877581680253688, 0.5669916772070135, 0.5574214050575449, 0.5701290236182499, 0.5726784277230704], [0.45884069775852304, 0.5256265106567102, 0.577139066436598, 0.5932655890383389, 0.5688208479778859, 0.5925600466002249, 0.6027467077302952, 0.5651273636294782, 0.5726161584971984, 0.5772696068520649], [0.4562086304129941, 0.5172095585028883, 0.5566512331914117, 0.570752399881864, 0.5785026008509463, 0.5728182407869907, 0.5736251335585149, 0.58859854970543, 0.5748101688227739, 0.5931237958470101], [0.4391214532141528, 0.5064209539439319, 0.5335868347921917, 0.5791715402652902, 0.5736321416985394, 0.5804276264701684, 0.5636626604313054, 0.5750268146302812, 0.579497884308152, 0.5931634957304599], [0.4413929717428931, 0.5395616498347593, 0.5588043361820866, 0.5748861555111555, 0.5248123800802372, 0.581531308036035, 0.5674544748429369, 0.5831182408860979, 0.5813638817269332, 0.556337849825

In [13]:
# One inner list per main() call, in run order; each inner list holds the dev recall after every epoch.
print(overall_list_dev_recall)

[[0.47103909830479235, 0.5531906624100688, 0.546863828423806, 0.566311822261416, 0.5883099922354539, 0.5887221173434407, 0.5443644316087373, 0.5394781790346184, 0.558287980329078, 0.5645651579396327], [0.47103354325566565, 0.5382476530822119, 0.5515436589845727, 0.5568340482010802, 0.556231360731478, 0.5849874792881208, 0.5766006649792171, 0.5742557628191749, 0.5734816227282227, 0.5761430674895404], [0.47689072399294624, 0.5345001015654901, 0.5428613597168718, 0.5801879568498667, 0.5786173957414215, 0.5776080219383791, 0.5703954231022247, 0.5916337679448235, 0.5909897295165153, 0.579610808119868], [0.46343358314295546, 0.5077533175274772, 0.5593702969062322, 0.5691968114076694, 0.5623095467790847, 0.563182118575715, 0.5608350481494279, 0.5560978089670111, 0.5673533730447154, 0.5838674160200519], [0.47462317743239474, 0.5427459077039603, 0.5509604936088655, 0.5578480107422986, 0.5379049216108661, 0.5737254561298819, 0.5726285239512932, 0.5820014219495919, 0.5638403517192977, 0.549456697

In [14]:
# One inner list per main() call, in run order; each inner list holds the dev F1 score after every epoch.
print(overall_list_dev_f1score)

[[0.4276412138184524, 0.5099004194093949, 0.519233755981337, 0.5368265846057371, 0.5508148872242012, 0.5472854372417508, 0.5284106418212744, 0.5160763408480179, 0.5333535047545135, 0.5339718390810032], [0.4296930860397529, 0.48731360475713853, 0.5260829192738969, 0.5148189631719935, 0.5368739461962267, 0.5584874748346299, 0.5513470800595347, 0.5412793583870271, 0.5380527973184156, 0.5385177151011422], [0.44463473618251287, 0.48814795077731227, 0.5160243096128668, 0.5375616748695279, 0.5383457514961789, 0.5366245052274723, 0.5389612746326515, 0.5483261811991343, 0.54462678283342, 0.5490094190060167], [0.422335780363117, 0.480296379368564, 0.5056465311474129, 0.5357278741245756, 0.5366097006882998, 0.5323156957110827, 0.5344300721454412, 0.5245532542635385, 0.5440649660060771, 0.5582628217407514], [0.44169764143920726, 0.5023615094604281, 0.502282157095719, 0.521843634511183, 0.5078204727798532, 0.5339898580412704, 0.5237562404717236, 0.5402929095408041, 0.5372449938199837, 0.52255090222

In [15]:
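#The best model is 3
# NOTE(review): presumably "3" is the run index i used as the save-path suffix - confirm.
# The dev F1 lists printed above peak in run index 1, so also state which metric/split this choice is based on.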