In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install transformers

In [63]:
import json
import torch
import itertools
import pandas as pd
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from tqdm import trange
from tqdm.notebook import tqdm
from torch.utils.data import (TensorDataset, DataLoader,
                              RandomSampler, SequentialSampler)

from transformers import BertTokenizer, BertConfig
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import f1_score as f1
from sklearn.model_selection import train_test_split

In [4]:
def check_for_gpu():
    if torch.cuda.is_available():       
        device = torch.device("cuda")
        print(f'There are {torch.cuda.device_count()} GPU(s) available.')
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    return device

In [5]:
device = check_for_gpu()
device

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


device(type='cuda')

# Importing Train/Test sets

In [10]:
## STILL TESTING
## Very small samples to debug training and evaluation functions in reasonable amount of time

train_df = pd.read_parquet("/content/drive/MyDrive/train_df.parquet", columns=["review_text", "genre"]).sample(n=500)
test_df = pd.read_parquet("/content/drive/MyDrive/test_df.parquet", columns=["review_text", "genre"]).sample(n=300)

In [11]:
x_train = [row for row in train_df["review_text"]]
y_train = [row for row in train_df["genre"]]

x_test = [row for row in test_df["review_text"]]
y_test = [row for row in test_df["genre"]]

In [12]:
len(x_train),len(y_train),len(x_test),len(y_test)

(500, 500, 300, 300)

### Labelling y_train/y_test

In [13]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(y_train)
print(le.classes_)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

y_test

['Art' 'Fantasy' 'Fiction' 'History' 'Horror' 'Literature' 'NonFiction'
 'Romance' 'Science Fiction' 'Thriller']


array([9, 7, 2, 4, 7, 2, 2, 5, 1, 5, 0, 2, 5, 3, 2, 7, 1, 2, 1, 2, 1, 2,
       2, 2, 2, 1, 2, 0, 7, 2, 1, 2, 5, 7, 2, 2, 1, 2, 1, 7, 1, 1, 1, 2,
       9, 8, 7, 2, 2, 2, 7, 7, 7, 2, 2, 2, 1, 2, 8, 1, 2, 2, 1, 1, 2, 2,
       2, 7, 8, 2, 8, 7, 7, 1, 2, 8, 1, 7, 1, 1, 1, 1, 2, 6, 2, 5, 2, 1,
       7, 2, 2, 1, 2, 2, 2, 8, 7, 7, 1, 4, 9, 1, 7, 1, 9, 2, 7, 1, 3, 1,
       2, 9, 1, 1, 9, 4, 2, 0, 2, 2, 8, 2, 2, 2, 7, 7, 2, 7, 7, 4, 1, 1,
       2, 7, 8, 8, 2, 2, 1, 2, 2, 7, 9, 2, 0, 4, 2, 2, 7, 7, 9, 1, 4, 1,
       2, 7, 7, 2, 1, 2, 7, 1, 2, 2, 7, 1, 2, 7, 8, 2, 2, 2, 7, 1, 7, 0,
       1, 7, 2, 1, 2, 2, 3, 1, 1, 1, 7, 4, 3, 1, 8, 1, 7, 1, 4, 2, 7, 1,
       2, 2, 1, 1, 8, 1, 2, 2, 7, 2, 1, 7, 1, 0, 1, 7, 7, 7, 7, 1, 1, 2,
       2, 3, 5, 1, 2, 2, 2, 7, 0, 5, 8, 4, 2, 2, 7, 2, 2, 1, 9, 8, 2, 6,
       1, 0, 7, 7, 2, 9, 2, 2, 7, 9, 1, 1, 7, 1, 2, 8, 1, 3, 9, 2, 7, 2,
       7, 1, 8, 9, 0, 7, 7, 2, 1, 2, 0, 0, 2, 1, 5, 1, 2, 7, 3, 7, 2, 1,
       2, 2, 7, 2, 2, 1, 2, 1, 2, 2, 1, 9, 4, 2])

# Transformer Inizialization

In [14]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [15]:
# # Print the original sentence.
# print(' Original: \n', x_train[0],"\n")

# # Print the sentence split into tokens.
# print('Tokenized: \n', tokenizer.tokenize(x_train[0]),"\n")

# # Print the sentence mapped to token ids.
# print('Token IDs: \n', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x_train[0])))

#### BERT Requirements:
- Add special tokens at the beginning and the end of each Tokenized text: `[CLS]` and `[SEP]`
- Set token lenght to a fixed amount
- Make artificial token explicit with the attention mask

In [16]:
def ids_attention_masks(_input: list, MAX_LEN:int = 100) -> (list, list):
    """ Tokenizes text with '[CLS]' and '[SEP]' tokens and creates relatives masks """
    input_ids = []
    attention_masks = []
    
    for review in tqdm(_input):
        encoded_dict = tokenizer.encode_plus(
                            review,                        # Review to encode.
                            add_special_tokens = True,     # Add '[CLS]' and '[SEP]'
                            max_length = MAX_LEN,          # Pad & truncate all sentences.
                            pad_to_max_length = True,      # (padding = "max_length" dives error later on)
                            return_attention_mask = True,  # Construct attention masks.
                            return_tensors = 'pt')         # Return pytorch tensors.
            
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return input_ids, attention_masks

In [17]:
def splitting_train_val_with_masks(ids_train: list, amasks_train: list, labels_train: list):
    """ Return splitted Train_set and Validation_set with relatives masks"""
    (train_inputs, validation_inputs,
     train_labels, validation_labels) = train_test_split(ids_train, labels_train,
                                                         random_state=42,
                                                         test_size=0.20)
    (train_masks, validation_masks,
     _, _) = train_test_split(amasks_train, ids_train,
                              random_state=42, test_size=20)
    
    return tuple(train_inputs, train_labels, validation_inputs, validation_labels, train_masks, validation_masks)

In [18]:
def to_tensor(*args) -> tuple:
    """ Transforms multiple dim. lists and 1 dim. arrays to tensors """
    return tuple( torch.cat(el, dim=0) if type(el) == list else torch.tensor(el) for el in args  )

In [19]:
input_ids_train, attention_masks_train = ids_attention_masks(x_train)

  0%|          | 0/500 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [20]:
input_ids_test, attention_masks_test = ids_attention_masks(x_test)

  0%|          | 0/300 [00:00<?, ?it/s]

In [21]:
len(input_ids_test), len(attention_masks_test), len(y_test)

(300, 300, 300)

## Spitting Training to train/Validation

***Retrieving Validation Inputs, Masks and Labels***

In [22]:
# retrieving Validation Inputs, Masks and Labels
(train_inputs, val_inputs, train_labels, val_labels) = train_test_split(input_ids_train, y_train,
                                                                        random_state=42,
                                                                        test_size=0.15)

(train_masks, val_masks, _, _) = train_test_split(attention_masks_train, input_ids_train,
                                                  random_state=42, test_size=0.15)

***Converting lists to Tensors***

In [23]:
# Convert the lists into tensors.
train_inputs, train_masks, train_labels = to_tensor(train_inputs, train_masks, train_labels)

validation_inputs, validation_masks, validation_labels = to_tensor(val_inputs, val_masks, val_labels)

test_inputs, test_masks, test_labels = to_tensor(input_ids_test, attention_masks_test, y_test)

# Tensor Dataset

In [24]:
#creating iterator od the dataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) #CPU to GPU

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

print(f"Train_set: \t{len(train_data)} reviews \n" +
      f"Val_set: \t{len(validation_data)} reviews \n" +
      f"Test_data: \t{len(test_data)} reviews")

Train_set: 	425 reviews 
Val_set: 	75 reviews 
Test_data: 	300 reviews


# BERT

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model_name = "bert-base-uncased"
bert_BU_model = BertForSequenceClassification.from_pretrained(model_name, num_labels= len(le.classes_))

# Make it run on GPU
bert_BU_model.cuda()

In [26]:
def train(model, optimizer, scheduler, epochs, loss_vector=None, log_interval=200):
    # Set model to training mode
    model.train()

    # Loop over each batch from the training set
    for step, batch in enumerate(train_dataloader):

        batch = tuple(t.to(device) for t in batch)         # Copy data to GPU if needed
        b_input_ids, b_input_mask, b_labels = batch        # Unpack the inputs from our dataloader
        optimizer.zero_grad()                              # Zero gradient buffers

        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)

        loss = outputs[0]
        if loss_vector is not None: loss_vector.append(loss.item())

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()
        scheduler.step()

        if step % log_interval == 0:
            print(f'Train Epoch: {epochs} [{step * len(b_input_ids)}/{len(train_dataloader.dataset)}' +
                  f'({100. * step / len(train_dataloader):.0f}%)]\tLoss: {loss:.6f}')

from sklearn.metrics import confusion_matrix, classification_report

def evaluate(model, loader):
    model.eval()
    n_correct, n_all = 0, 0
    predicted_labels, true_labels = list(), list()
    full_predictions, full_labels = np.array([]), np.array([])
    
    for batch in loader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                          attention_mask=b_input_mask)
            
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions = np.argmax(logits, axis=1)

        labels = b_labels.to('cpu').numpy()
        n_correct += np.sum(predictions == labels)

        true_labels.extend(labels)
        predicted_labels.extend(predictions)

        full_predictions = np.concatenate((full_predictions, predictions), axis=0)
        full_labels = np.concatenate((full_labels, labels), axis=0)
        n_all += len(labels)
    
    f1_score = f1(full_labels, full_predictions, average="macro")
    print(f'F1 Score: {f1_score:.4f}')
    print(f'Accuracy: [{n_correct}/{n_all}] {(n_correct/n_all):.4f}')
    print('Classification report:')
    print(classification_report(true_labels, predicted_labels))
    print('Confusion matrix:')
    cm = confusion_matrix(true_labels, predicted_labels)
    print(cm)
    return f1_score

***Setting up Optimizer and Scheduler***

In [28]:
def get_optimizer_scheduler(model, epochs: int, weight_decay: float, learning_rate: float):
    """ Returning optimizer and scheduler of the model for the current input Hyperparameters"""

    warmup_steps = int(0.2*len(train_dataloader))
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
                                    {'params': [tensor for parameter, tensor in model.named_parameters()
                                                if not any(el in parameter for el in no_decay)],
                                     'weight_decay': weight_decay},
                                    {'params': [tensor for parameter, tensor in model.named_parameters()
                                                if any(el in parameter for el in no_decay)],
                                     'weight_decay': 0.0}
                                    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr= learning_rate, eps= 1e-8, no_deprecation_warning=True)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps= warmup_steps,
                                     num_training_steps =len(train_dataloader)*epochs)
    return optimizer, scheduler

In [29]:
optimizer, scheduler = get_optimizer_scheduler(bert_BU_model, 3, 0.1, 2e-5, )

In [33]:
def hypter_param_selection(input_params: list, n_classes: int, validation_dataloader, model_name):
    
    """ 
    Hyper parameters tuning on Validation set.
    Returns a dict of best params
    """
    
    epochs = 4
    params_list = input_params.keys()
    res = dict()
    
    #cartesian product of the params
    for param in itertools.product(*[param for param in input_params.values()]):
        current_params_dict = dict(zip(params_list, param))
        print(f"Hyperparameters --> {current_params_dict}")
        
        #initialize the model, retrieving Optimizer and Scheduler
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels= n_classes)   
        model.cuda()
        optimizer, scheduler= get_optimizer_scheduler(model, epochs, *current_params_dict.values())
        
        #training on validation
        val_score = {str(current_params_dict): list()}
        train_lossv = list()
        for epoch in range(1, epochs + 1):
            train(model, optimizer, scheduler, epoch, train_lossv)
            print('\nValidation set:')
            f1_score = evaluate(model, validation_dataloader)
            val_score[str(current_params_dict)].append(f1_score)
        
        res.update({str(current_params_dict): val_score[str(current_params_dict)]})
    return res

In [34]:
param_grid = {
              'weight_decay': [0.01, 0.1, 0.001], 
              'lr': [2e-5, 3e-5, 4e-5]
             }

res = hypter_param_selection(param_grid, 10, validation_dataloader, model_name)

Hyperparameters --> {'weight_decay': 0.01, 'lr': 2e-05}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Validation set:
F1 Score: 0.0472
Accuracy: [9/75] 0.1200
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.11      0.78      0.19         9
           3       0.25      0.33      0.29         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        15

    accuracy                           0.12        75
   macro avg       0.04      0.11      0.05        75
weighted avg       0.03      0.12      0.05        75

Confusion matrix:
[[ 0  0  2  0  0  0  0  0  0  0]
 [ 0  0  5  1  0  0  0  0  0  0]
 [ 0  0  7  1  0  0  0  1  0  0]
 [ 0  0  4  2  0  0  0  0  0  0]
 [ 0  0 12

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0302
Accuracy: [4/75] 0.0533
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.03      0.17      0.06         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.08      0.17      0.11         6
           8       0.00      0.00      0.00         2
           9       0.13      0.13      0.13        15

    accuracy                           0.05        75
   macro avg       0.03      0.05      0.03        75
weighted avg       0.04      0.05      0.04        75

Confusion matrix:
[[0 0 0 1 0 0 0 0 0 1]
 [0 0 0 2 0 0 0 0 0 4]
 [0 0 0 4 0 2 1 1 0 1]
 [0 0 0 1 0 0 0 4 0 1]
 [0 0 1 4 0 3 0 4 0 1]
 [0 0 1 4 0 0 0 1 2 2]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0489
Accuracy: [7/75] 0.0933
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.04      0.17      0.07         6
           4       0.50      0.08      0.13        13
           5       0.11      0.10      0.11        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.14      0.27      0.18        15

    accuracy                           0.09        75
   macro avg       0.08      0.06      0.05        75
weighted avg       0.13      0.09      0.08        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 0 2]
 [0 0 0 1 0 0 0 0 0 5]
 [0 0 0 3 0 0 0 3 0 3]
 [0 0 0 1 1 0 0 2 0 2]
 [0 0 0 4 1 1 0 1 0 6]
 [0 0 0 4 0 1 0 1 1 3]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0616
Accuracy: [8/75] 0.1067
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.07      0.33      0.12         6
           4       0.50      0.08      0.13        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.12      0.17      0.14         6
           8       0.00      0.00      0.00         2
           9       0.19      0.27      0.22        15

    accuracy                           0.11        75
   macro avg       0.09      0.08      0.06        75
weighted avg       0.14      0.11      0.09        75

Confusion matrix:
[[0 0 0 1 0 0 0 0 0 1]
 [0 0 0 1 0 0 0 0 0 5]
 [0 0 0 4 0 2 0 2 0 1]
 [0 0 0 2 1 1 0 2 0 0]
 [0 0 0 4 1 2 0 1 0 5]
 [0 0 0 4 0 0 0 1 3 2]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequ


Validation set:
F1 Score: 0.0307
Accuracy: [7/75] 0.0933
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.09      1.00      0.16         6
           2       0.20      0.11      0.14         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        15

    accuracy                           0.09        75
   macro avg       0.03      0.11      0.03        75
weighted avg       0.03      0.09      0.03        75

Confusion matrix:
[[ 0  1  1  0  0  0  0  0  0  0]
 [ 0  6  0  0  0  0  0  0  0  0]
 [ 0  8  1  0  0  0  0  0  0  0]
 [ 0  4  1  0  0  1  0  0  0  0]
 [ 0 13  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0978
Accuracy: [13/75] 0.1733
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.14      0.33      0.20         6
           4       0.33      0.15      0.21        13
           5       0.28      0.50      0.36        10
           6       0.00      0.00      0.00         6
           7       0.12      0.67      0.21         6
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        15

    accuracy                           0.17        75
   macro avg       0.09      0.17      0.10        75
weighted avg       0.12      0.17      0.12        75

Confusion matrix:
[[0 0 0 1 0 0 0 1 0 0]
 [1 0 0 2 0 1 0 2 0 0]
 [0 0 0 2 1 1 0 5 0 0]
 [0 1 0 2 0 1 0 2 0 0]
 [0 0 0 3 2 3 0 5 0 0]
 [0 0 0 1 1 5 0 2 1 0]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0774
Accuracy: [9/75] 0.1200
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.14      0.50      0.22         6
           4       0.00      0.00      0.00        13
           5       0.50      0.10      0.17        10
           6       0.00      0.00      0.00         6
           7       0.25      0.83      0.38         6
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        15

    accuracy                           0.12        75
   macro avg       0.09      0.14      0.08        75
weighted avg       0.10      0.12      0.07        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 2 0]
 [1 0 0 1 0 0 0 0 4 0]
 [0 1 0 3 0 0 0 3 2 0]
 [0 1 0 3 0 0 0 2 0 0]
 [2 0 0 2 0 0 0 5 4 0]
 [1 1 0 1 0 1 0 2 4 0]
 [1 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0697
Accuracy: [9/75] 0.1200
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.06      0.33      0.11         6
           4       0.00      0.00      0.00        13
           5       0.40      0.20      0.27        10
           6       0.00      0.00      0.00         6
           7       0.20      0.83      0.32         6
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        15

    accuracy                           0.12        75
   macro avg       0.07      0.14      0.07        75
weighted avg       0.07      0.12      0.07        75

Confusion matrix:
[[ 0  0  0  1  0  0  0  0  1  0]
 [ 0  0  0  3  0  0  0  1  2  0]
 [ 0  0  0  3  0  0  0  5  1  0]
 [ 0  1  0  2  0  1  0  2  0  0]
 [ 0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequ


Validation set:
F1 Score: 0.0385
Accuracy: [8/75] 0.1067
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.10      1.00      0.18         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.20      0.20      0.20        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        15

    accuracy                           0.11        75
   macro avg       0.03      0.12      0.04        75
weighted avg       0.03      0.11      0.04        75

Confusion matrix:
[[ 0  2  0  0  0  0  0  0  0  0]
 [ 0  6  0  0  0  0  0  0  0  0]
 [ 0  4  0  0  4  1  0  0  0  0]
 [ 0  3  0  0  1  2  0  0  0  0]
 [ 0 12  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0925
Accuracy: [17/75] 0.2267
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.14      0.33      0.20         6
           4       0.00      0.00      0.00        13
           5       0.19      0.50      0.27        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.34      0.67      0.45        15

    accuracy                           0.23        75
   macro avg       0.07      0.15      0.09        75
weighted avg       0.11      0.23      0.14        75

Confusion matrix:
[[ 0  0  0  0  0  0  0  1  0  1]
 [ 0  0  0  0  0  4  0  1  0  1]
 [ 0  0  0  1  0  3  0  2  0  3]
 [ 0  0  0  2  0  3  0  0  0  1]
 [ 0  0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0845
Accuracy: [10/75] 0.1333
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.14      0.17      0.15         6
           2       0.00      0.00      0.00         9
           3       0.05      0.17      0.08         6
           4       0.00      0.00      0.00        13
           5       0.17      0.20      0.18        10
           6       0.00      0.00      0.00         6
           7       0.12      0.67      0.21         6
           8       0.00      0.00      0.00         2
           9       0.67      0.13      0.22        15

    accuracy                           0.13        75
   macro avg       0.12      0.13      0.08        75
weighted avg       0.18      0.13      0.10        75

Confusion matrix:
[[0 0 0 1 0 0 0 1 0 0]
 [0 1 0 1 0 0 0 4 0 0]
 [0 1 0 1 0 2 0 5 0 0]
 [0 0 0 1 0 1 0 4 0 0]
 [0 3 0 3 0 3 0 3 0 1]
 [0 0 0 2 1 2 0 5 0 0]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0797
Accuracy: [10/75] 0.1333
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.20      0.83      0.32         6
           2       0.00      0.00      0.00         9
           3       0.10      0.33      0.15         6
           4       0.00      0.00      0.00        13
           5       0.17      0.10      0.12        10
           6       0.00      0.00      0.00         6
           7       0.06      0.17      0.08         6
           8       0.00      0.00      0.00         2
           9       0.50      0.07      0.12        15

    accuracy                           0.13        75
   macro avg       0.10      0.15      0.08        75
weighted avg       0.15      0.13      0.08        75

Confusion matrix:
[[0 0 0 1 0 0 0 1 0 0]
 [0 5 0 0 0 0 0 1 0 0]
 [0 2 0 2 0 1 0 4 0 0]
 [0 1 0 2 0 1 0 2 0 0]
 [0 7 0 3 0 0 0 2 0 1]
 [0 1 0 3 1 1 0 4 0 0]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequ


Validation set:
F1 Score: 0.0598
Accuracy: [11/75] 0.1467
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.33      0.10      0.15        10
           6       0.00      0.00      0.00         6
           7       0.11      0.83      0.20         6
           8       0.00      0.00      0.00         2
           9       0.19      0.33      0.24        15

    accuracy                           0.15        75
   macro avg       0.06      0.13      0.06        75
weighted avg       0.09      0.15      0.09        75

Confusion matrix:
[[0 0 0 0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 2 0 4]
 [0 0 0 0 1 0 0 5 0 3]
 [0 0 0 0 0 0 0 6 0 0]
 [0 0 0 0 0 1 0 6 0 6]
 [0 0 0 0 0 1 0 4 0 5]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0283
Accuracy: [3/75] 0.0400
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.08      0.17      0.11         6
           4       0.10      0.08      0.09        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.14      0.07      0.09        15

    accuracy                           0.04        75
   macro avg       0.03      0.03      0.03        75
weighted avg       0.05      0.04      0.04        75

Confusion matrix:
[[0 0 0 0 0 1 0 0 1 0]
 [0 0 0 1 1 1 0 0 3 0]
 [0 0 0 2 2 4 0 0 1 0]
 [0 0 0 1 1 2 0 0 2 0]
 [0 0 0 2 1 6 0 0 1 3]
 [0 0 0 3 3 0 0 0 2 2]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0967
Accuracy: [9/75] 0.1200
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.12      0.17      0.14         6
           4       0.12      0.08      0.10        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.11      0.33      0.17         6
           8       0.25      0.50      0.33         2
           9       0.20      0.27      0.23        15

    accuracy                           0.12        75
   macro avg       0.08      0.13      0.10        75
weighted avg       0.09      0.12      0.10        75

Confusion matrix:
[[0 0 0 0 0 1 0 1 0 0]
 [0 0 0 1 1 1 0 1 1 1]
 [0 0 0 2 2 1 0 1 0 3]
 [0 0 0 1 1 1 0 2 0 1]
 [0 0 0 0 1 4 0 1 1 6]
 [0 0 0 2 2 0 0 2 0 4]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.1147
Accuracy: [14/75] 0.1867
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.09      0.17      0.12         6
           4       0.20      0.08      0.11        13
           5       0.29      0.20      0.24        10
           6       0.00      0.00      0.00         6
           7       0.13      0.33      0.19         6
           8       0.08      0.50      0.14         2
           9       0.28      0.47      0.35        15

    accuracy                           0.19        75
   macro avg       0.11      0.17      0.11        75
weighted avg       0.15      0.19      0.15        75

Confusion matrix:
[[0 0 0 0 0 0 0 1 1 0]
 [0 0 0 2 0 1 0 0 1 2]
 [0 0 0 2 2 0 0 1 1 3]
 [0 0 0 1 1 0 0 2 1 1]
 [0 0 0 1 1 1 0 1 3 6]
 [0 0 0 2 0 2 0 2 0 4]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequ


Validation set:
F1 Score: 0.0349
Accuracy: [15/75] 0.2000
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.21      1.00      0.35        15

    accuracy                           0.20        75
   macro avg       0.02      0.10      0.03        75
weighted avg       0.04      0.20      0.07        75

Confusion matrix:
[[ 0  0  0  0  0  0  0  0  0  2]
 [ 0  0  0  0  0  0  0  0  2  4]
 [ 0  0  0  0  0  0  0  0  0  9]
 [ 0  0  0  0  0  0  0  0  0  6]
 [ 0  0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0708
Accuracy: [13/75] 0.1733
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.15      0.30      0.20        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.04      0.50      0.07         2
           9       0.35      0.60      0.44        15

    accuracy                           0.17        75
   macro avg       0.05      0.14      0.07        75
weighted avg       0.09      0.17      0.12        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 3 0 0 2 1]
 [0 0 0 0 0 4 0 0 3 2]
 [0 0 0 0 0 2 0 0 2 2]
 [0 0 0 0 0 3 0 1 6 3]
 [0 0 0 0 0 3 0 1 2 4]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0975
Accuracy: [14/75] 0.1867
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       1.00      0.17      0.29         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.13      0.30      0.18        10
           6       0.00      0.00      0.00         6
           7       0.11      0.33      0.16         6
           8       0.00      0.00      0.00         2
           9       0.26      0.53      0.35        15

    accuracy                           0.19        75
   macro avg       0.15      0.13      0.10        75
weighted avg       0.16      0.19      0.13        75

Confusion matrix:
[[0 0 0 0 0 0 0 2 0 0]
 [0 1 0 0 0 2 0 2 0 1]
 [0 0 0 0 0 4 0 0 1 4]
 [0 0 0 0 0 2 0 1 0 3]
 [0 0 0 0 0 4 0 6 0 3]
 [0 0 0 0 0 3 0 0 0 7]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.1508
Accuracy: [18/75] 0.2400
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.75      0.50      0.60         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.10      0.30      0.15        10
           6       0.00      0.00      0.00         6
           7       0.25      0.67      0.36         6
           8       0.00      0.00      0.00         2
           9       0.31      0.53      0.39        15

    accuracy                           0.24        75
   macro avg       0.14      0.20      0.15        75
weighted avg       0.16      0.24      0.18        75

Confusion matrix:
[[0 0 0 0 0 0 0 2 0 0]
 [0 3 0 0 0 1 0 1 0 1]
 [0 0 0 0 0 5 0 0 0 4]
 [0 0 0 0 0 2 0 1 0 3]
 [0 0 0 0 0 7 0 3 0 3]
 [0 0 0 0 0 3 0 2 0 5]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequ


Validation set:
F1 Score: 0.0152
Accuracy: [6/75] 0.0800
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.08      1.00      0.15         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        15

    accuracy                           0.08        75
   macro avg       0.01      0.10      0.02        75
weighted avg       0.01      0.08      0.01        75

Confusion matrix:
[[ 0  0  0  2  0  0  0  0  0  0]
 [ 0  0  0  6  0  0  0  0  0  0]
 [ 0  0  0  9  0  0  0  0  0  0]
 [ 0  0  0  6  0  0  0  0  0  0]
 [ 0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0179
Accuracy: [6/75] 0.0800
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.10      1.00      0.18         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        15

    accuracy                           0.08        75
   macro avg       0.01      0.10      0.02        75
weighted avg       0.01      0.08      0.01        75

Confusion matrix:
[[ 0  0  0  2  0  0  0  0  0  0]
 [ 0  0  0  6  0  0  0  0  0  0]
 [ 0  0  0  8  0  0  0  1  0  0]
 [ 0  0  0  6  0  0  0  0  0  0]
 [ 0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0225
Accuracy: [5/75] 0.0667
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.08      0.17      0.11         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.07      0.67      0.12         6
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00        15

    accuracy                           0.07        75
   macro avg       0.01      0.08      0.02        75
weighted avg       0.01      0.07      0.02        75

Confusion matrix:
[[ 0  0  0  0  0  0  0  2  0  0]
 [ 0  0  0  2  0  0  0  4  0  0]
 [ 0  0  0  0  0  0  0  8  0  1]
 [ 0  0  0  1  0  0  0  5  0  0]
 [ 0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0408
Accuracy: [7/75] 0.0933
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.06      0.50      0.10         6
           8       0.00      0.00      0.00         2
           9       0.36      0.27      0.31        15

    accuracy                           0.09        75
   macro avg       0.04      0.08      0.04        75
weighted avg       0.08      0.09      0.07        75

Confusion matrix:
[[ 0  0  0  0  0  0  0  1  0  1]
 [ 0  0  0  1  0  0  0  4  0  1]
 [ 0  0  0  0  0  0  0  8  0  1]
 [ 0  0  0  0  0  0  0  5  0  1]
 [ 0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequ


Validation set:
F1 Score: 0.0467
Accuracy: [7/75] 0.0933
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.25      0.17      0.20         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         2
           9       0.20      0.40      0.27        15

    accuracy                           0.09        75
   macro avg       0.04      0.06      0.05        75
weighted avg       0.06      0.09      0.07        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 0 2]
 [1 0 0 1 0 0 0 2 0 2]
 [1 0 0 0 0 1 0 3 0 4]
 [0 0 0 1 0 3 0 0 0 2]
 [2 0 0 0 0 1 0 8 0 2]
 [1 1 0 1 0 0 0 3 0 4]
 [1 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0359
Accuracy: [5/75] 0.0667
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.08      0.33      0.13         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.06      1.00      0.11         2
           9       0.50      0.07      0.12        15

    accuracy                           0.07        75
   macro avg       0.06      0.14      0.04        75
weighted avg       0.11      0.07      0.04        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 1 5 0]
 [3 0 0 3 0 0 0 0 3 0]
 [0 0 1 2 0 0 0 0 3 0]
 [1 0 0 4 0 0 0 2 6 0]
 [1 0 0 3 0 0 0 0 5 1]
 [2 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0882
Accuracy: [9/75] 0.1200
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.07      0.17      0.10         6
           4       0.00      0.00      0.00        13
           5       0.12      0.10      0.11        10
           6       0.00      0.00      0.00         6
           7       0.04      0.17      0.07         6
           8       0.18      1.00      0.31         2
           9       0.33      0.27      0.30        15

    accuracy                           0.12        75
   macro avg       0.08      0.17      0.09        75
weighted avg       0.10      0.12      0.10        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 1 1]
 [0 0 0 3 1 0 0 2 0 0]
 [1 0 0 2 0 0 0 3 2 1]
 [0 0 1 1 0 1 0 1 1 1]
 [1 0 0 0 0 1 0 8 2 1]
 [1 0 0 2 0 1 0 4 0 2]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0600
Accuracy: [6/75] 0.0800
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.09      0.33      0.14         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.04      0.17      0.06         6
           8       0.18      1.00      0.31         2
           9       0.12      0.07      0.09        15

    accuracy                           0.08        75
   macro avg       0.04      0.16      0.06        75
weighted avg       0.04      0.08      0.04        75

Confusion matrix:
[[0 0 0 1 0 0 0 0 1 0]
 [0 0 0 3 1 0 0 2 0 0]
 [1 0 0 2 0 0 0 3 2 1]
 [0 0 1 2 0 0 0 1 1 1]
 [1 0 0 0 0 1 0 8 2 1]
 [1 0 0 3 0 0 0 5 0 1]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequ


Validation set:
F1 Score: 0.0052
Accuracy: [2/75] 0.0267
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.03      1.00      0.05         2
           9       0.00      0.00      0.00        15

    accuracy                           0.03        75
   macro avg       0.00      0.10      0.01        75
weighted avg       0.00      0.03      0.00        75

Confusion matrix:
[[ 0  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  6  0]
 [ 0  0  0  0  0  0  0  0  9  0]
 [ 0  0  0  0  0  0  0  0  6  0]
 [ 0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0242
Accuracy: [5/75] 0.0667
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.08      0.50      0.14         6
           8       0.05      1.00      0.10         2
           9       0.00      0.00      0.00        15

    accuracy                           0.07        75
   macro avg       0.01      0.15      0.02        75
weighted avg       0.01      0.07      0.01        75

Confusion matrix:
[[ 0  0  0  0  0  0  0  1  1  0]
 [ 0  0  0  0  0  0  0  3  3  0]
 [ 0  0  0  1  0  0  0  4  4  0]
 [ 0  0  0  0  0  0  0  3  3  0]
 [ 0  0  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0730
Accuracy: [12/75] 0.1600
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.33      0.17      0.22         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.07      0.50      0.12         6
           8       0.00      0.00      0.00         2
           9       0.31      0.53      0.39        15

    accuracy                           0.16        75
   macro avg       0.07      0.12      0.07        75
weighted avg       0.09      0.16      0.11        75

Confusion matrix:
[[0 0 0 0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 3 0 3]
 [0 0 0 0 0 0 0 8 0 1]
 [0 0 0 1 0 0 0 0 0 5]
 [0 0 0 0 0 0 0 8 1 4]
 [0 0 0 1 0 0 0 6 0 3]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0541
Accuracy: [10/75] 0.1333
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.09      0.17      0.12         6
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         6
           7       0.06      0.33      0.10         6
           8       0.00      0.00      0.00         2
           9       0.25      0.47      0.33        15

    accuracy                           0.13        75
   macro avg       0.04      0.10      0.05        75
weighted avg       0.06      0.13      0.08        75

Confusion matrix:
[[0 0 0 0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 2 0 4]
 [0 0 0 1 0 0 0 7 0 1]
 [0 0 0 1 0 0 0 0 0 5]
 [0 0 0 1 0 0 0 7 1 4]
 [0 0 0 3 0 0 0 3 0 4]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequ


Validation set:
F1 Score: 0.0717
Accuracy: [10/75] 0.1333
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.14      0.08      0.10        13
           5       0.09      0.10      0.10        10
           6       0.00      0.00      0.00         6
           7       0.08      0.17      0.11         6
           8       0.05      0.50      0.08         2
           9       0.29      0.40      0.33        15

    accuracy                           0.13        75
   macro avg       0.06      0.12      0.07        75
weighted avg       0.10      0.13      0.11        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 1 0 2 3 0]
 [0 0 0 0 0 1 0 1 4 3]
 [0 0 0 0 1 1 0 0 1 3]
 [0 1 0 0 1 3 0 2 3 3]
 [0 0 0 0 2 1 0 4 2 1]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0641
Accuracy: [9/75] 0.1200
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.08      0.17      0.11         6
           4       0.00      0.00      0.00        13
           5       0.17      0.10      0.12        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.07      0.50      0.12         2
           9       0.23      0.40      0.29        15

    accuracy                           0.12        75
   macro avg       0.05      0.12      0.06        75
weighted avg       0.08      0.12      0.09        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 1 1]
 [0 0 0 1 0 0 0 3 1 1]
 [0 0 0 2 0 1 0 2 2 2]
 [0 0 0 1 0 0 0 0 1 4]
 [0 0 0 1 0 3 0 3 2 4]
 [0 0 0 3 1 1 0 2 2 1]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0488
Accuracy: [7/75] 0.0933
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        13
           5       0.06      0.10      0.08        10
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00         6
           8       0.08      0.50      0.13         2
           9       0.24      0.33      0.28        15

    accuracy                           0.09        75
   macro avg       0.04      0.09      0.05        75
weighted avg       0.06      0.09      0.07        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 1 1]
 [0 0 0 1 0 1 0 2 2 0]
 [0 0 0 2 0 1 0 2 2 2]
 [0 0 0 0 0 1 0 0 1 4]
 [0 0 0 1 0 5 0 3 2 2]
 [0 0 0 2 1 1 0 3 2 1]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validation set:
F1 Score: 0.0994
Accuracy: [12/75] 0.1600
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         9
           3       0.08      0.17      0.11         6
           4       0.00      0.00      0.00        13
           5       0.09      0.10      0.10        10
           6       0.00      0.00      0.00         6
           7       0.12      0.33      0.18         6
           8       0.20      0.50      0.29         2
           9       0.25      0.47      0.33        15

    accuracy                           0.16        75
   macro avg       0.07      0.16      0.10        75
weighted avg       0.08      0.16      0.11        75

Confusion matrix:
[[0 0 0 0 0 0 0 0 1 1]
 [0 0 0 2 0 1 0 2 1 0]
 [0 0 0 3 0 1 0 2 1 2]
 [0 0 0 1 0 0 0 1 0 4]
 [0 0 0 1 0 5 0 3 0 4]
 [0 0 0 2 1 1 0 2 0 4]
 [0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
res

{"{'weight_decay': 0.01, 'lr': 2e-05}": [0.04723809523809524,
  0.030158730158730163,
  0.04893801902876313,
  0.06160597572362279],
 "{'weight_decay': 0.01, 'lr': 3e-05}": [0.030724070450097847,
  0.09781954887218045,
  0.07735042735042735,
  0.06973554199360651],
 "{'weight_decay': 0.01, 'lr': 4e-05}": [0.03846153846153847,
  0.09248157248157247,
  0.08453359505991084,
  0.07967091854663012],
 "{'weight_decay': 0.1, 'lr': 2e-05}": [0.05977485928705441,
  0.028312877054295815,
  0.09666666666666668,
  0.11473856209150328],
 "{'weight_decay': 0.1, 'lr': 3e-05}": [0.03488372093023256,
  0.07079899074852818,
  0.09753585544889892,
  0.15077264199215418],
 "{'weight_decay': 0.1, 'lr': 4e-05}": [0.01518987341772152,
  0.017910447761194027,
  0.02246661429693637,
  0.040769230769230766],
 "{'weight_decay': 0.001, 'lr': 2e-05}": [0.04666666666666667,
  0.03590885002649709,
  0.08817663817663818,
  0.06000059722885811],
 "{'weight_decay': 0.001, 'lr': 3e-05}": [0.005194805194805195,
  0.02420

In [60]:
def getting_best_param(res: dict) -> str:
    correct_params = None
    total_max = 0

    for key, _list in res.items():
        idx, local_max = 0, 0
        for i,num in enumerate(_list, start=1):
            if num > local_max: 
                local_max = num
                idx = i
        if local_max > total_max:
            total_max = local_max
            correct_params = "{" + f"'epochs': {idx}, " + key[1:]
    return correct_params

In [69]:
import ast
best_params_dict = ast.literal_eval(getting_best_param(res))
best_params_dict

{'epochs': 4, 'weight_decay': 0.1, 'lr': 3e-05}

## To do
- Setting up for testing on Test_set
- (Optional) Implement k-fold cross validation