In [33]:
import pandas as pd

df = pd.read_csv ('data/nutrition_table_balanced.csv', encoding= 'unicode_escape')
print (df)

                                          Product Name  \
0         Kellogg's Rice Krispies Breakfast Cereal 1kg   
1       Sainsbury's Berry Hooplas, Summer Edition 375g   
2    Sainsbury's  Peach Melba Granola, Summer Editi...   
3    Kellogg's Special K Original Breakfast Cereal ...   
4           Kellogg's Crunchy Nut Breakfast Cereal 1kg   
..                                                 ...   
594  Doritos Stax Sour Cream & Onion Sharing Snacks...   
595                  Walkers Prawn Cocktail Crisps 45g   
596  Pringles Sizzl'N Spicy Chorizo Flavour Sharing...   
597  Doritos Flame Grilled Steak Sharing Tortilla C...   
598  Walkers Max KFC Kentucky Fried Chicken Sharing...   

                                   Product Description Category  Energy  \
0    Toasted Rice Cereal Fortified with Vitamins an...   cereal   389.0   
1    Coated cereal extrudate with natural strawberr...   cereal   395.0   
2    A blend of honey toasted wholegrain oats with ...   cereal   437.0   
3  

In [34]:
import torch
import numpy as np
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = {'cereal':0,
          'chocolate':1,
         'rice':2,
         'yogurts':3,
         'crisps':4}

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        self.labels = [labels[label] for label in df['Category']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['Product Name']]


    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [35]:
'''build bert model'''

from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.Softmax()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output, dim=1)

        return final_layer

In [36]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

479 60 60


In [54]:
from torch.optim import Adam
from tqdm import tqdm
import torch

def train(model, train_data, val_data, learning_rate, epochs):
    
    #train_data = train_data.type(torch.LongTensor)

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 9
model = BertClassifier(dropout=0.1)
LR = 5.00E-07
              
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:53<00:00,  2.22s

Epochs: 1 | Train Loss:  0.802                 | Train Accuracy:  0.217                 | Val Loss:  0.791                 | Val Accuracy:  0.250


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:58<00:00,  2.24s/it]


Epochs: 2 | Train Loss:  0.774                 | Train Accuracy:  0.428                 | Val Loss:  0.757                 | Val Accuracy:  0.533


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:53<00:00,  2.22s/it]


Epochs: 3 | Train Loss:  0.730                 | Train Accuracy:  0.626                 | Val Loss:  0.720                 | Val Accuracy:  0.583


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:01<00:00,  2.26s/it]


Epochs: 4 | Train Loss:  0.695                 | Train Accuracy:  0.720                 | Val Loss:  0.691                 | Val Accuracy:  0.683


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:11<00:00,  2.30s/it]


Epochs: 5 | Train Loss:  0.663                 | Train Accuracy:  0.837                 | Val Loss:  0.657                 | Val Accuracy:  0.817


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:35<00:00,  2.15s/it]


Epochs: 6 | Train Loss:  0.630                 | Train Accuracy:  0.852                 | Val Loss:  0.623                 | Val Accuracy:  0.850


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:44<00:00,  2.19s/it]


Epochs: 7 | Train Loss:  0.598                 | Train Accuracy:  0.900                 | Val Loss:  0.595                 | Val Accuracy:  0.917


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:21<00:00,  2.09s/it]


Epochs: 8 | Train Loss:  0.573                 | Train Accuracy:  0.925                 | Val Loss:  0.574                 | Val Accuracy:  0.917


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:27<00:00,  2.11s/it]


Epochs: 9 | Train Loss:  0.552                 | Train Accuracy:  0.973                 | Val Loss:  0.556                 | Val Accuracy:  0.933


In [55]:
df_db = pd.read_csv('data/Sample_db.csv')

In [56]:
"""
Multi-class evaluator
"""
from sklearn.metrics import f1_score
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    TP = 0
    FP = 0
    FN = 0
    with torch.no_grad():
        y_pred = []
        y_true = []
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            i = 0
            output = model(input_id, mask)
            print(output)
            selected_output = output.argmax(dim=1)[i]#0,1,2,3,4
            y_pred.append(selected_output.detach().numpy())
            y_true.append(test_label[0].detach().numpy())

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
            
            #selected = (output.argmax(dim=1) == 1).sum().item()

    '''
    Weighted F1: 
    '''
    print(y_true)
    print(y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    #F1 = TP/(TP + 0.5*(FP+FN))
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(f'Test Weighted F1 score: {weighted_f1: .3f}')
    return (total_acc_test / len(test_data), weighted_f1)
    
evaluate(model, df_db)

  final_layer = self.relu(linear_output)


tensor([[0.0511, 0.0368, 0.3369, 0.0928, 0.4824],
        [0.0457, 0.0299, 0.2440, 0.0589, 0.6215]])
tensor([[0.0122, 0.0137, 0.0195, 0.0224, 0.9322],
        [0.0120, 0.0091, 0.0173, 0.0233, 0.9384]])
tensor([[0.1037, 0.0675, 0.1852, 0.5911, 0.0525],
        [0.1052, 0.0779, 0.0934, 0.6810, 0.0425]])
tensor([[0.0098, 0.0096, 0.0132, 0.0123, 0.9552],
        [0.0138, 0.0112, 0.0127, 0.0164, 0.9460]])
tensor([[0.0133, 0.0129, 0.0137, 0.0179, 0.9422],
        [0.0623, 0.0593, 0.6081, 0.1699, 0.1003]])
tensor([[0.0303, 0.0552, 0.7744, 0.0737, 0.0664],
        [0.3024, 0.2093, 0.1875, 0.2569, 0.0440]])
tensor([[0.0988, 0.0699, 0.3257, 0.3743, 0.1313],
        [0.0485, 0.0413, 0.7814, 0.0528, 0.0761]])
tensor([[0.1003, 0.0545, 0.4203, 0.3238, 0.1011],
        [0.2957, 0.2346, 0.1619, 0.2643, 0.0435]])
tensor([[0.4369, 0.1807, 0.0928, 0.2267, 0.0629],
        [0.1656, 0.0851, 0.1320, 0.5402, 0.0771]])
tensor([[0.1001, 0.0515, 0.2371, 0.3344, 0.2769],
        [0.1898, 0.0800, 0.1567, 0.4481, 

(0.44680851063829785, 0.6386452241715399)

In [49]:
##split
import random
from csv import writer

np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])
print(len(df_train),len(df_val), len(df_test))

EPOCHS = 5
LR = 1e-6

for i in range(200):
    EPOCHS = random.randint(2,15)
    LR = 0.5 * random.randint(1,2) * 10 ** (-random.randint(1,10))
    DROPOUT = 0.1 * random.randint(1,9)
    model = BertClassifier(dropout=DROPOUT)           
    train(model, df_train, df_val, LR, EPOCHS)
    acc, f1 = evaluate(model, df_test)
    print(f'EPOCHS: {EPOCHS} | LR:{LR} | DROPOUT:{DROPOUT}')
    list_data = [EPOCHS,LR,DROPOUT,acc,f1]
    with open('data/training_category.csv','a', newline='') as f_object:
        writer_object = writer(f_object)
        writer_object.writerow(list_data)
        f_object.close()

479 60 60


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:50<00:00,  2.21s

Epochs: 1 | Train Loss:  0.807                 | Train Accuracy:  0.225                 | Val Loss:  0.807                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:54<00:00,  2.23s/it]


Epochs: 2 | Train Loss:  0.807                 | Train Accuracy:  0.221                 | Val Loss:  0.807                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:48<00:00,  2.20s/it]


Epochs: 3 | Train Loss:  0.805                 | Train Accuracy:  0.240                 | Val Loss:  0.808                 | Val Accuracy:  0.117


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:56<00:00,  2.24s/it]


Epochs: 4 | Train Loss:  0.806                 | Train Accuracy:  0.213                 | Val Loss:  0.807                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:57<00:00,  2.24s/it]


Epochs: 5 | Train Loss:  0.809                 | Train Accuracy:  0.209                 | Val Loss:  0.805                 | Val Accuracy:  0.167


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:59<00:00,  2.25s/it]


Epochs: 6 | Train Loss:  0.808                 | Train Accuracy:  0.196                 | Val Loss:  0.807                 | Val Accuracy:  0.150


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:34<00:00,  2.14s/it]


Epochs: 7 | Train Loss:  0.808                 | Train Accuracy:  0.221                 | Val Loss:  0.808                 | Val Accuracy:  0.133


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:50<00:00,  2.21s/it]


Epochs: 8 | Train Loss:  0.808                 | Train Accuracy:  0.198                 | Val Loss:  0.811                 | Val Accuracy:  0.167
tensor([[0.1275, 0.2706, 0.2852, 0.1956, 0.1212],
        [0.1176, 0.1816, 0.2693, 0.3394, 0.0921]])
tensor([[0.0747, 0.1772, 0.3515, 0.2407, 0.1559],
        [0.0823, 0.0708, 0.2865, 0.2793, 0.2811]])
tensor([[0.1533, 0.1512, 0.1322, 0.3810, 0.1823],
        [0.1640, 0.1521, 0.2888, 0.2768, 0.1182]])
tensor([[0.1384, 0.1471, 0.2830, 0.3081, 0.1234],
        [0.1552, 0.2695, 0.2434, 0.1983, 0.1336]])
tensor([[0.2865, 0.1191, 0.3749, 0.1368, 0.0826],
        [0.1602, 0.1271, 0.3972, 0.1541, 0.1613]])
tensor([[0.1186, 0.1388, 0.3831, 0.1665, 0.1930],
        [0.1485, 0.1823, 0.3398, 0.2045, 0.1248]])
tensor([[0.0680, 0.2547, 0.3365, 0.2010, 0.1398],
        [0.1378, 0.1594, 0.3054, 0.2327, 0.1647]])
tensor([[0.1752, 0.1337, 0.3356, 0.2384, 0.1171],
        [0.1882, 0.1276, 0.3448, 0.1986, 0.1409]])
tensor([[0.1161, 0.1768, 0.2606, 0.2958, 0.150

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:00<00:00,  2.25s

Epochs: 1 | Train Loss:  0.804                 | Train Accuracy:  0.213                 | Val Loss:  0.797                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:59<00:00,  2.25s/it]


Epochs: 2 | Train Loss:  0.764                 | Train Accuracy:  0.486                 | Val Loss:  0.726                 | Val Accuracy:  0.633


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:57<00:00,  2.24s/it]


Epochs: 3 | Train Loss:  0.698                 | Train Accuracy:  0.724                 | Val Loss:  0.681                 | Val Accuracy:  0.767


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:04<00:00,  2.27s/it]


Epochs: 4 | Train Loss:  0.656                 | Train Accuracy:  0.816                 | Val Loss:  0.643                 | Val Accuracy:  0.850


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:56<00:00,  2.23s/it]


Epochs: 5 | Train Loss:  0.620                 | Train Accuracy:  0.843                 | Val Loss:  0.605                 | Val Accuracy:  0.850


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:02<00:00,  2.26s/it]


Epochs: 6 | Train Loss:  0.584                 | Train Accuracy:  0.910                 | Val Loss:  0.575                 | Val Accuracy:  0.917


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:59<00:00,  2.25s/it]


Epochs: 7 | Train Loss:  0.558                 | Train Accuracy:  0.950                 | Val Loss:  0.553                 | Val Accuracy:  0.950


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:04<00:00,  2.27s/it]


Epochs: 8 | Train Loss:  0.540                 | Train Accuracy:  0.969                 | Val Loss:  0.540                 | Val Accuracy:  0.967


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:55<00:00,  2.23s/it]


Epochs: 9 | Train Loss:  0.525                 | Train Accuracy:  0.977                 | Val Loss:  0.525                 | Val Accuracy:  0.967
tensor([[0.0382, 0.0299, 0.0153, 0.8892, 0.0274],
        [0.1066, 0.0435, 0.0191, 0.7901, 0.0406]])
tensor([[0.0303, 0.0288, 0.0135, 0.8982, 0.0291],
        [0.3426, 0.2123, 0.2205, 0.1256, 0.0989]])
tensor([[0.0090, 0.0080, 0.0089, 0.0093, 0.9648],
        [0.5926, 0.1574, 0.0794, 0.1090, 0.0617]])
tensor([[0.1571, 0.4540, 0.2779, 0.0756, 0.0355],
        [0.6864, 0.0886, 0.0847, 0.0693, 0.0711]])
tensor([[0.0099, 0.0118, 0.0084, 0.0165, 0.9534],
        [0.0459, 0.0386, 0.0194, 0.8627, 0.0335]])
tensor([[0.0667, 0.0852, 0.0262, 0.7896, 0.0322],
        [0.0316, 0.0253, 0.0133, 0.9094, 0.0203]])
tensor([[0.5183, 0.1861, 0.1686, 0.0741, 0.0528],
        [0.0176, 0.0218, 0.9362, 0.0088, 0.0155]])
tensor([[0.0172, 0.0239, 0.9373, 0.0061, 0.0155],
        [0.0171, 0.0216, 0.9350, 0.0099, 0.0163]])
tensor([[0.6424, 0.1090, 0.0997, 0.0859, 0.063

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:59<00:00,  2.25s

Epochs: 1 | Train Loss:  0.803                 | Train Accuracy:  0.230                 | Val Loss:  0.791                 | Val Accuracy:  0.317


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:53<00:00,  2.22s/it]


Epochs: 2 | Train Loss:  0.790                 | Train Accuracy:  0.334                 | Val Loss:  0.790                 | Val Accuracy:  0.267


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:52<00:00,  2.22s/it]


Epochs: 3 | Train Loss:  0.780                 | Train Accuracy:  0.382                 | Val Loss:  0.775                 | Val Accuracy:  0.450


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:57<00:00,  2.24s/it]


Epochs: 4 | Train Loss:  0.758                 | Train Accuracy:  0.472                 | Val Loss:  0.757                 | Val Accuracy:  0.450


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:54<00:00,  2.23s/it]


Epochs: 5 | Train Loss:  0.726                 | Train Accuracy:  0.601                 | Val Loss:  0.728                 | Val Accuracy:  0.517


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:50<00:00,  2.21s/it]


Epochs: 6 | Train Loss:  0.694                 | Train Accuracy:  0.660                 | Val Loss:  0.691                 | Val Accuracy:  0.600


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:56<00:00,  2.24s/it]


Epochs: 7 | Train Loss:  0.669                 | Train Accuracy:  0.674                 | Val Loss:  0.676                 | Val Accuracy:  0.683


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:47<00:00,  2.20s/it]


Epochs: 8 | Train Loss:  0.641                 | Train Accuracy:  0.770                 | Val Loss:  0.639                 | Val Accuracy:  0.867
tensor([[0.1328, 0.1690, 0.2821, 0.2929, 0.1232],
        [0.6898, 0.1403, 0.0504, 0.0634, 0.0560]])
tensor([[0.1795, 0.1836, 0.3057, 0.1294, 0.2019],
        [0.6188, 0.0639, 0.0807, 0.1012, 0.1354]])
tensor([[0.0893, 0.0720, 0.0472, 0.0637, 0.7279],
        [0.7401, 0.0514, 0.0490, 0.0660, 0.0935]])
tensor([[0.1094, 0.1837, 0.3229, 0.2198, 0.1642],
        [0.7910, 0.0794, 0.0157, 0.0690, 0.0450]])
tensor([[0.0520, 0.0420, 0.0200, 0.0394, 0.8466],
        [0.0307, 0.1277, 0.1802, 0.4867, 0.1747]])
tensor([[0.0574, 0.1900, 0.2712, 0.2329, 0.2485],
        [0.0713, 0.1387, 0.3260, 0.2800, 0.1840]])
tensor([[0.5629, 0.1377, 0.1093, 0.0744, 0.1156],
        [0.0532, 0.0345, 0.8386, 0.0331, 0.0406]])
tensor([[0.0332, 0.0301, 0.8662, 0.0293, 0.0412],
        [0.0339, 0.0304, 0.8442, 0.0385, 0.0530]])
tensor([[0.8379, 0.0291, 0.0342, 0.0544, 0.044

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:51<00:00,  2.21s

Epochs: 1 | Train Loss:  0.809                 | Train Accuracy:  0.177                 | Val Loss:  0.811                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:45<00:00,  2.19s/it]


Epochs: 2 | Train Loss:  0.808                 | Train Accuracy:  0.211                 | Val Loss:  0.813                 | Val Accuracy:  0.133


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:49<00:00,  2.21s/it]


Epochs: 3 | Train Loss:  0.809                 | Train Accuracy:  0.173                 | Val Loss:  0.804                 | Val Accuracy:  0.200


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:53<00:00,  2.22s/it]


Epochs: 4 | Train Loss:  0.807                 | Train Accuracy:  0.196                 | Val Loss:  0.811                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:56<00:00,  2.24s/it]


Epochs: 5 | Train Loss:  0.808                 | Train Accuracy:  0.186                 | Val Loss:  0.807                 | Val Accuracy:  0.183
tensor([[0.2944, 0.1353, 0.2295, 0.1792, 0.1615],
        [0.3019, 0.1013, 0.2951, 0.1411, 0.1606]])
tensor([[0.4150, 0.1150, 0.1732, 0.1461, 0.1507],
        [0.2798, 0.1056, 0.3252, 0.1318, 0.1575]])
tensor([[0.3710, 0.1049, 0.1940, 0.1478, 0.1823],
        [0.2381, 0.1128, 0.2439, 0.1681, 0.2371]])
tensor([[0.2275, 0.1056, 0.2077, 0.1581, 0.3011],
        [0.3155, 0.0794, 0.1878, 0.1508, 0.2665]])
tensor([[0.3242, 0.1007, 0.2206, 0.1382, 0.2163],
        [0.2561, 0.1098, 0.2707, 0.1522, 0.2113]])
tensor([[0.2428, 0.1802, 0.2104, 0.1669, 0.1996],
        [0.3374, 0.1230, 0.1930, 0.1571, 0.1895]])
tensor([[0.2939, 0.1398, 0.2068, 0.1930, 0.1665],
        [0.2536, 0.2233, 0.2091, 0.1345, 0.1794]])
tensor([[0.3279, 0.0962, 0.2460, 0.1438, 0.1861],
        [0.1790, 0.1014, 0.3146, 0.2239, 0.1811]])
tensor([[0.2585, 0.0909, 0.2076, 0.1804, 0.262

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [11:35<00:00,  2.90s

Epochs: 1 | Train Loss:  0.851                 | Train Accuracy:  0.205                 | Val Loss:  0.861                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [12:00<00:00,  3.00s/it]


Epochs: 2 | Train Loss:  0.851                 | Train Accuracy:  0.207                 | Val Loss:  0.861                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [11:51<00:00,  2.96s/it]


Epochs: 3 | Train Loss:  0.851                 | Train Accuracy:  0.207                 | Val Loss:  0.861                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [12:04<00:00,  3.02s/it]


Epochs: 4 | Train Loss:  0.851                 | Train Accuracy:  0.207                 | Val Loss:  0.861                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [12:12<00:00,  3.05s/it]


Epochs: 5 | Train Loss:  0.851                 | Train Accuracy:  0.207                 | Val Loss:  0.861                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [12:05<00:00,  3.02s/it]


Epochs: 6 | Train Loss:  0.851                 | Train Accuracy:  0.207                 | Val Loss:  0.861                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [12:04<00:00,  3.02s/it]


Epochs: 7 | Train Loss:  0.851                 | Train Accuracy:  0.207                 | Val Loss:  0.861                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [12:05<00:00,  3.02s/it]


Epochs: 8 | Train Loss:  0.851                 | Train Accuracy:  0.207                 | Val Loss:  0.861                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [12:04<00:00,  3.02s/it]


Epochs: 9 | Train Loss:  0.851                 | Train Accuracy:  0.207                 | Val Loss:  0.861                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [12:14<00:00,  3.06s/it]


Epochs: 10 | Train Loss:  0.851                 | Train Accuracy:  0.207                 | Val Loss:  0.861                 | Val Accuracy:  0.183
tensor([[3.7531e-10, 1.4089e-20, 6.4974e-21, 3.6832e-21, 1.0000e+00],
        [2.0437e-10, 2.3940e-20, 1.6069e-20, 7.3912e-21, 1.0000e+00]])
tensor([[4.9830e-10, 4.1798e-20, 1.9963e-20, 1.9521e-20, 1.0000e+00],
        [2.1394e-10, 4.9289e-21, 2.3527e-21, 2.4114e-21, 1.0000e+00]])
tensor([[4.6662e-10, 1.0600e-19, 4.4452e-20, 3.2331e-20, 1.0000e+00],
        [4.9224e-10, 3.1646e-20, 1.0487e-20, 7.7977e-21, 1.0000e+00]])
tensor([[3.9487e-10, 1.9140e-19, 7.4985e-20, 5.8248e-20, 1.0000e+00],
        [3.1841e-10, 1.0422e-20, 6.9469e-21, 4.6580e-21, 1.0000e+00]])
tensor([[3.0874e-10, 3.7820e-20, 1.8139e-20, 2.3611e-20, 1.0000e+00],
        [7.6653e-10, 4.2685e-20, 2.8928e-20, 1.7980e-20, 1.0000e+00]])
tensor([[7.6523e-10, 2.9511e-19, 2.0120e-19, 1.3729e-19, 1.0000e+00],
        [6.1100e-10, 3.7962e-20, 2.1473e-20, 1.4773e-20, 1.0000e+00]])
tensor(

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [15:00<00:00,  3.75s

Epochs: 1 | Train Loss:  0.854                 | Train Accuracy:  0.198                 | Val Loss:  0.836                 | Val Accuracy:  0.233


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [15:48<00:00,  3.95s/it]


Epochs: 2 | Train Loss:  0.853                 | Train Accuracy:  0.203                 | Val Loss:  0.836                 | Val Accuracy:  0.233


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [15:50<00:00,  3.96s/it]


Epochs: 3 | Train Loss:  0.853                 | Train Accuracy:  0.203                 | Val Loss:  0.836                 | Val Accuracy:  0.233


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [16:09<00:00,  4.04s/it]


Epochs: 4 | Train Loss:  0.853                 | Train Accuracy:  0.203                 | Val Loss:  0.836                 | Val Accuracy:  0.233


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [15:53<00:00,  3.97s/it]


Epochs: 5 | Train Loss:  0.853                 | Train Accuracy:  0.203                 | Val Loss:  0.836                 | Val Accuracy:  0.233


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [16:11<00:00,  4.05s/it]


Epochs: 6 | Train Loss:  0.853                 | Train Accuracy:  0.203                 | Val Loss:  0.836                 | Val Accuracy:  0.233


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [15:55<00:00,  3.98s/it]


Epochs: 7 | Train Loss:  0.853                 | Train Accuracy:  0.203                 | Val Loss:  0.836                 | Val Accuracy:  0.233
tensor([[4.0334e-19, 1.0000e+00, 7.0072e-18, 8.2008e-18, 6.2178e-19],
        [4.8867e-21, 1.0000e+00, 8.5602e-21, 1.0849e-18, 3.0747e-21]])
tensor([[8.0889e-19, 1.0000e+00, 5.4257e-18, 2.3585e-17, 1.2704e-18],
        [1.0047e-18, 1.0000e+00, 1.4908e-18, 9.4058e-18, 4.5103e-19]])
tensor([[5.0360e-19, 1.0000e+00, 9.6400e-18, 5.6197e-18, 2.0946e-18],
        [2.5488e-17, 1.0000e+00, 9.2536e-17, 2.6295e-16, 1.4663e-17]])
tensor([[1.2373e-17, 1.0000e+00, 3.9482e-17, 1.5821e-16, 1.7661e-17],
        [1.7073e-19, 1.0000e+00, 8.7395e-19, 9.4026e-17, 1.9669e-19]])
tensor([[5.7371e-18, 1.0000e+00, 1.8073e-17, 7.1401e-17, 3.9517e-18],
        [5.6891e-19, 1.0000e+00, 1.5682e-18, 1.6944e-17, 1.0227e-18]])
tensor([[3.2192e-19, 1.0000e+00, 5.9989e-19, 2.0459e-18, 2.1829e-19],
        [7.7345e-20, 1.0000e+00, 2.2071e-19, 1.7662e-17, 8.5427e-20]])
tensor([

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:06<00:00,  2.28s

Epochs: 1 | Train Loss:  0.795                 | Train Accuracy:  0.307                 | Val Loss:  0.779                 | Val Accuracy:  0.433


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:58<00:00,  2.24s/it]


Epochs: 2 | Train Loss:  0.772                 | Train Accuracy:  0.453                 | Val Loss:  0.764                 | Val Accuracy:  0.450


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:01<00:00,  2.26s/it]


Epochs: 3 | Train Loss:  0.755                 | Train Accuracy:  0.499                 | Val Loss:  0.744                 | Val Accuracy:  0.517


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:58<00:00,  2.24s/it]


Epochs: 4 | Train Loss:  0.722                 | Train Accuracy:  0.591                 | Val Loss:  0.707                 | Val Accuracy:  0.533


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:59<00:00,  2.25s/it]


Epochs: 5 | Train Loss:  0.685                 | Train Accuracy:  0.658                 | Val Loss:  0.676                 | Val Accuracy:  0.650


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:57<00:00,  2.24s/it]


Epochs: 6 | Train Loss:  0.656                 | Train Accuracy:  0.768                 | Val Loss:  0.651                 | Val Accuracy:  0.833


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:07<00:00,  2.28s/it]


Epochs: 7 | Train Loss:  0.628                 | Train Accuracy:  0.889                 | Val Loss:  0.617                 | Val Accuracy:  0.917


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:52<00:00,  2.22s/it]


Epochs: 8 | Train Loss:  0.598                 | Train Accuracy:  0.939                 | Val Loss:  0.584                 | Val Accuracy:  0.950


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:01<00:00,  2.26s/it]


Epochs: 9 | Train Loss:  0.566                 | Train Accuracy:  0.969                 | Val Loss:  0.549                 | Val Accuracy:  0.983


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:03<00:00,  2.26s/it]


Epochs: 10 | Train Loss:  0.537                 | Train Accuracy:  0.983                 | Val Loss:  0.528                 | Val Accuracy:  0.983


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:00<00:00,  2.25s/it]


Epochs: 11 | Train Loss:  0.519                 | Train Accuracy:  0.983                 | Val Loss:  0.515                 | Val Accuracy:  0.983
tensor([[0.0276, 0.0252, 0.0404, 0.8895, 0.0173],
        [0.5839, 0.0599, 0.0707, 0.1786, 0.1068]])
tensor([[0.0332, 0.0297, 0.0232, 0.8875, 0.0264],
        [0.5377, 0.0586, 0.2150, 0.1058, 0.0829]])
tensor([[0.0147, 0.0137, 0.0153, 0.0131, 0.9432],
        [0.7312, 0.0917, 0.0857, 0.0422, 0.0492]])
tensor([[0.3530, 0.4495, 0.0271, 0.1025, 0.0678],
        [0.8376, 0.0344, 0.0492, 0.0318, 0.0470]])
tensor([[0.0203, 0.0132, 0.0160, 0.0189, 0.9316],
        [0.0293, 0.0170, 0.0335, 0.9021, 0.0181]])
tensor([[0.0486, 0.0267, 0.0910, 0.8022, 0.0314],
        [0.0218, 0.0177, 0.0335, 0.9109, 0.0161]])
tensor([[0.3761, 0.0484, 0.3821, 0.1031, 0.0903],
        [0.0583, 0.0207, 0.8154, 0.0603, 0.0454]])
tensor([[0.0721, 0.0343, 0.7787, 0.0595, 0.0553],
        [0.0539, 0.0232, 0.8280, 0.0498, 0.0451]])
tensor([[0.8171, 0.0550, 0.0546, 0.0352, 0.03

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [16:17<00:00,  4.07s

Epochs: 1 | Train Loss:  0.860                 | Train Accuracy:  0.182                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [17:24<00:00,  4.35s/it]


Epochs: 2 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [17:13<00:00,  4.31s/it]


Epochs: 3 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [17:26<00:00,  4.36s/it]


Epochs: 4 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [17:23<00:00,  4.35s/it]


Epochs: 5 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [17:16<00:00,  4.32s/it]


Epochs: 6 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [17:17<00:00,  4.32s/it]


Epochs: 7 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217
tensor([[4.7704e-20, 2.5735e-17, 1.1823e-15, 1.0000e+00, 4.3110e-20],
        [8.0285e-16, 2.1332e-13, 1.1376e-17, 1.0000e+00, 1.1022e-14]])
tensor([[4.8252e-19, 1.8340e-14, 1.9066e-18, 1.0000e+00, 2.6128e-16],
        [6.6329e-18, 4.9347e-10, 3.3036e-17, 1.0000e+00, 3.4957e-15]])
tensor([[9.7802e-16, 5.0769e-14, 6.4068e-18, 1.0000e+00, 7.0147e-16],
        [4.4492e-18, 3.2354e-14, 5.8216e-12, 1.0000e+00, 1.0424e-13]])
tensor([[3.7706e-20, 5.2209e-20, 1.5546e-19, 1.0000e+00, 6.2547e-19],
        [1.4439e-16, 8.9965e-14, 2.7017e-14, 1.0000e+00, 1.0015e-10]])
tensor([[9.6704e-18, 1.8231e-13, 1.5709e-15, 1.0000e+00, 4.2016e-16],
        [6.7369e-14, 5.4555e-11, 1.3528e-14, 1.0000e+00, 2.8540e-11]])
tensor([[6.8041e-18, 3.8033e-12, 3.2733e-18, 1.0000e+00, 3.3241e-17],
        [1.0122e-19, 7.8296e-14, 6.5148e-18, 1.0000e+00, 2.5586e-16]])
tensor([

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:07<00:00,  2.28s

Epochs: 1 | Train Loss:  0.816                 | Train Accuracy:  0.211                 | Val Loss:  0.818                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:05<00:00,  2.27s/it]


Epochs: 2 | Train Loss:  0.821                 | Train Accuracy:  0.190                 | Val Loss:  0.830                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:00<00:00,  2.25s/it]


Epochs: 3 | Train Loss:  0.824                 | Train Accuracy:  0.171                 | Val Loss:  0.826                 | Val Accuracy:  0.167


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:59<00:00,  2.25s/it]


Epochs: 4 | Train Loss:  0.824                 | Train Accuracy:  0.169                 | Val Loss:  0.809                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:03<00:00,  2.26s/it]


Epochs: 5 | Train Loss:  0.814                 | Train Accuracy:  0.207                 | Val Loss:  0.823                 | Val Accuracy:  0.200


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:01<00:00,  2.26s/it]


Epochs: 6 | Train Loss:  0.822                 | Train Accuracy:  0.188                 | Val Loss:  0.835                 | Val Accuracy:  0.133


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:47<00:00,  2.20s/it]


Epochs: 7 | Train Loss:  0.806                 | Train Accuracy:  0.238                 | Val Loss:  0.837                 | Val Accuracy:  0.133
tensor([[0.2187, 0.1930, 0.0636, 0.1667, 0.3580],
        [0.2436, 0.1997, 0.1217, 0.2562, 0.1788]])
tensor([[0.4914, 0.0170, 0.0989, 0.0862, 0.3064],
        [0.3724, 0.0352, 0.1190, 0.2025, 0.2709]])
tensor([[0.0536, 0.2135, 0.5984, 0.1002, 0.0344],
        [0.7738, 0.0220, 0.0134, 0.0239, 0.1669]])
tensor([[0.2727, 0.1245, 0.4611, 0.0454, 0.0963],
        [0.0803, 0.3621, 0.0528, 0.0154, 0.4893]])
tensor([[0.5003, 0.0290, 0.0380, 0.3390, 0.0936],
        [0.1654, 0.2095, 0.2956, 0.0707, 0.2588]])
tensor([[0.4606, 0.0242, 0.4595, 0.0328, 0.0228],
        [0.3584, 0.0476, 0.0307, 0.0656, 0.4977]])
tensor([[0.0315, 0.0795, 0.5201, 0.3360, 0.0330],
        [0.0062, 0.3324, 0.3857, 0.1600, 0.1156]])
tensor([[0.2031, 0.2008, 0.1538, 0.1321, 0.3103],
        [0.0738, 0.1053, 0.4369, 0.3146, 0.0695]])
tensor([[0.1887, 0.1189, 0.1614, 0.4356, 0.095

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [19:44<00:00,  4.94s

Epochs: 1 | Train Loss:  0.847                 | Train Accuracy:  0.207                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:01<00:00,  5.76s/it]


Epochs: 2 | Train Loss:  0.857                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:14<00:00,  5.81s/it]


Epochs: 3 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:40<00:00,  5.92s/it]


Epochs: 4 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:36<00:00,  5.90s/it]


Epochs: 5 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:19<00:00,  5.83s/it]


Epochs: 6 | Train Loss:  0.857                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:06<00:00,  5.78s/it]


Epochs: 7 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:09<00:00,  5.79s/it]


Epochs: 8 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:01<00:00,  5.76s/it]


Epochs: 9 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:04<00:00,  5.77s/it]


Epochs: 10 | Train Loss:  0.857                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:11<00:00,  5.80s/it]


Epochs: 11 | Train Loss:  0.857                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:03<00:00,  5.77s/it]


Epochs: 12 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [23:24<00:00,  5.85s/it]


Epochs: 13 | Train Loss:  0.858                 | Train Accuracy:  0.192                 | Val Loss:  0.844                 | Val Accuracy:  0.217
tensor([[7.3437e-22, 3.0270e-22, 1.0111e-21, 1.0000e+00, 1.9369e-23],
        [6.6999e-25, 3.1314e-22, 8.3597e-21, 1.0000e+00, 8.4371e-27]])
tensor([[1.2934e-23, 2.1171e-22, 1.9075e-21, 1.0000e+00, 5.3170e-28],
        [1.1447e-25, 4.4598e-23, 1.6624e-22, 1.0000e+00, 9.5036e-27]])
tensor([[9.9561e-25, 2.3512e-21, 1.4397e-19, 1.0000e+00, 4.1999e-26],
        [1.1246e-26, 2.0924e-22, 2.5977e-25, 1.0000e+00, 2.4760e-28]])
tensor([[6.8251e-22, 5.6684e-23, 2.9593e-20, 1.0000e+00, 2.7104e-27],
        [3.6953e-25, 3.7912e-23, 6.7774e-23, 1.0000e+00, 3.2748e-29]])
tensor([[1.8421e-23, 1.8001e-22, 3.0918e-23, 1.0000e+00, 1.2416e-27],
        [7.6346e-26, 5.9249e-24, 1.1198e-26, 1.0000e+00, 1.9618e-29]])
tensor([[4.7924e-20, 2.0106e-18, 3.6903e-19, 1.0000e+00, 2.0324e-25],
        [7.9767e-26, 2.0750e-24, 1.3064e-24, 1.0000e+00, 7.0498e-30]])
tensor(

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:59<00:00,  2.25s

Epochs: 1 | Train Loss:  0.813                 | Train Accuracy:  0.188                 | Val Loss:  0.811                 | Val Accuracy:  0.167


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [08:59<00:00,  2.25s/it]


Epochs: 2 | Train Loss:  0.816                 | Train Accuracy:  0.150                 | Val Loss:  0.801                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:03<00:00,  2.27s/it]


Epochs: 3 | Train Loss:  0.819                 | Train Accuracy:  0.167                 | Val Loss:  0.801                 | Val Accuracy:  0.217
tensor([[0.1119, 0.0738, 0.1306, 0.6183, 0.0654],
        [0.0764, 0.1848, 0.0465, 0.4048, 0.2876]])
tensor([[0.0903, 0.1332, 0.0682, 0.4714, 0.2369],
        [0.0866, 0.1136, 0.1168, 0.4558, 0.2273]])
tensor([[0.1956, 0.0935, 0.0914, 0.5710, 0.0485],
        [0.1315, 0.1693, 0.0765, 0.3887, 0.2340]])
tensor([[0.2095, 0.4038, 0.1322, 0.0923, 0.1622],
        [0.1634, 0.0281, 0.4362, 0.3308, 0.0414]])
tensor([[0.1651, 0.0328, 0.0423, 0.5165, 0.2434],
        [0.1435, 0.1156, 0.1027, 0.6008, 0.0374]])
tensor([[0.0939, 0.1322, 0.2534, 0.1885, 0.3319],
        [0.0940, 0.2821, 0.1442, 0.3120, 0.1676]])
tensor([[0.2250, 0.1379, 0.1504, 0.2885, 0.1982],
        [0.2740, 0.1692, 0.0690, 0.2317, 0.2561]])
tensor([[0.2685, 0.1982, 0.1450, 0.1888, 0.1995],
        [0.2212, 0.1566, 0.0527, 0.1363, 0.4332]])
tensor([[0.2181, 0.1227, 0.1796, 0.3141, 0.165

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:08<00:00,  2.29s

Epochs: 1 | Train Loss:  0.836                 | Train Accuracy:  0.173                 | Val Loss:  0.819                 | Val Accuracy:  0.233


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:08<00:00,  2.29s/it]


Epochs: 2 | Train Loss:  0.837                 | Train Accuracy:  0.203                 | Val Loss:  0.852                 | Val Accuracy:  0.200


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:07<00:00,  2.28s/it]


Epochs: 3 | Train Loss:  0.853                 | Train Accuracy:  0.192                 | Val Loss:  0.853                 | Val Accuracy:  0.167


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:07<00:00,  2.28s/it]


Epochs: 4 | Train Loss:  0.842                 | Train Accuracy:  0.186                 | Val Loss:  0.860                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:02<00:00,  2.26s/it]


Epochs: 5 | Train Loss:  0.851                 | Train Accuracy:  0.198                 | Val Loss:  0.847                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:11<00:00,  2.30s/it]


Epochs: 6 | Train Loss:  0.839                 | Train Accuracy:  0.203                 | Val Loss:  0.859                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:13<00:00,  2.31s/it]


Epochs: 7 | Train Loss:  0.850                 | Train Accuracy:  0.205                 | Val Loss:  0.855                 | Val Accuracy:  0.183


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:10<00:00,  2.29s/it]


Epochs: 8 | Train Loss:  0.840                 | Train Accuracy:  0.200                 | Val Loss:  0.848                 | Val Accuracy:  0.200


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:09<00:00,  2.29s/it]


Epochs: 9 | Train Loss:  0.842                 | Train Accuracy:  0.196                 | Val Loss:  0.828                 | Val Accuracy:  0.250


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:07<00:00,  2.28s/it]


Epochs: 10 | Train Loss:  0.841                 | Train Accuracy:  0.200                 | Val Loss:  0.844                 | Val Accuracy:  0.217


100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [09:13<00:00,  2.31s/it]


Epochs: 11 | Train Loss:  0.853                 | Train Accuracy:  0.186                 | Val Loss:  0.870                 | Val Accuracy:  0.150
tensor([[7.1878e-06, 1.6004e-03, 9.9787e-01, 4.4752e-05, 4.7538e-04],
        [3.0621e-02, 5.0514e-03, 9.4801e-01, 5.3109e-04, 1.5787e-02]])
tensor([[2.4235e-06, 7.2856e-05, 9.9992e-01, 4.2740e-06, 1.5519e-07],
        [2.3811e-08, 2.6640e-06, 9.9998e-01, 6.3836e-07, 1.2675e-05]])
tensor([[2.4035e-02, 6.7508e-03, 7.7814e-01, 3.0694e-04, 1.9076e-01],
        [6.3461e-01, 2.3774e-03, 3.5966e-01, 3.0598e-04, 3.0455e-03]])
tensor([[3.4390e-02, 1.8507e-03, 7.5988e-01, 9.4811e-05, 2.0379e-01],
        [2.3683e-05, 1.6324e-08, 9.9974e-01, 8.3355e-07, 2.3476e-04]])
tensor([[7.8527e-03, 1.2255e-05, 9.8942e-01, 2.1016e-05, 2.6971e-03],
        [4.8760e-04, 1.1843e-03, 9.9787e-01, 4.4642e-04, 9.6985e-06]])
tensor([[6.0508e-04, 6.4222e-03, 9.6458e-01, 1.5527e-03, 2.6837e-02],
        [6.6404e-09, 1.8869e-06, 9.9975e-01, 9.0409e-11, 2.4756e-04]])
tensor(

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  final_layer = self.relu(linear_output)
100%|█████████████████████████████████████████████████████████████████████████████████| 240/240 [10:24<00:00,  2.60s

KeyboardInterrupt: 