In [1]:
import numpy as np

import torch
from torch import nn
from transformers import BertPreTrainedModel, BertConfig, BertModel, BertTokenizer, AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader, RandomSampler, Dataset, SequentialSampler

from load_data_ensemble import initialize_data
from reading_datasets import read_task5
from labels_to_ids import task5_labels_to_ids

import pickle
import json
import csv

In [2]:
class BertEnsemble(BertPreTrainedModel):
  def __init__(self, config, *args, **kwargs):
      super().__init__(config)

      # model 1
      self.bert_model_1 = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', num_labels=3)
      # model 2
      self.bert_model_2 = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', num_labels=3)
      # combine the 2 models into 1
      self.cls = nn.Linear(6, 3)
      self.init_weights()

  def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          position_ids=None,
          head_mask=None,
          inputs_embeds=None,
          labels=None,
  ):
    outputs = []
    
    input_ids_1 = input_ids[0].to(device, dtype = torch.long)
    attention_mask_1 = attention_mask[0].to(device, dtype = torch.long)
    labels_1 = labels[0].to(device, dtype = torch.long)
    outputs.append(self.bert_model_1(input_ids_1,
                                     attention_mask=attention_mask_1, labels =labels_1))

    input_ids_2 = input_ids[1].to(device, dtype = torch.long)
    attention_mask_2 = attention_mask[1].to(device, dtype = torch.long)
    labels_2 = labels[1].to(device, dtype = torch.long)
    outputs.append(self.bert_model_2(input_ids_2,
                                     attention_mask=attention_mask_2, labels =labels_2))
    print(outputs)

    # just get the [CLS] embeddings
    last_hidden_states = torch.cat([output[1] for output in outputs], dim=1)
    print("Printing last hidden states")
    print(last_hidden_states)
    logits = self.cls(last_hidden_states)
    return logits

In [3]:
device = torch.device("cuda")

config = BertConfig()
model = BertEnsemble(config)
model.to(device)
learning_rate = 1e-05

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [{
  "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
  }]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

# Training code

In [None]:
# standard pytorch way of doing things
# 1. create a custom Dataset 
# 2. pass the dataset to a dataloader
# 3. iterate the dataloader and pass the inputs to the model

max_len = 256
batch_size = 6
grad_step = 1
initialization_input = (max_len, batch_size)

model1_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased', add_prefix_space=True)
model2_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased', add_prefix_space=True)

#Reading datasets and initializing data loaders
dataset_location = '../2022.07.07_task5/'

#Gives us tweet_id, sentence, and label for each dataset.
train_data = read_task5(dataset_location , split = 'train')
#test_data = read_task5(dataset_location , split = 'dev')#load test set
labels_to_ids = task5_labels_to_ids
#input_data = (train_data, dev_data, labels_to_ids)

dataloader_m1 = initialize_data(model1_tokenizer, initialization_input, train_data, labels_to_ids, shuffle = True)
dataloader_m2 = initialize_data(model2_tokenizer, initialization_input, train_data, labels_to_ids, shuffle = True)

epochs = 5
for epoch in range(epochs):
  # iterate the QA and the AQ inputs simultaneously
  for step, combined_batch in enumerate(zip(dataloader_m1, dataloader_m2)):
    batch_1, batch_2 = combined_batch
    # training so, dropout needed to avoid overfitting
    model.train()
    # move input to GPU
    inputs = {
        "input_ids": [batch_1['input_ids'], batch_2['input_ids']],
        "attention_mask": [batch_1['attention_mask'], batch_2['attention_mask']],
        "labels": [batch_1['labels'], batch_2['labels']]}
    print(type(inputs))
    
    output = model(**inputs)
     
# model outputs are always tuple in transformers (see doc)
    loss = output[0][2]
 
    # backpass
    loss.backward()
    print(f"epoch:{epoch}, loss:{loss}")
    
    # re-calculate the weights
    optimizer.step()
    # again set the grads to 0 for next epoch
    model.zero_grad()
  
  print("\n")

<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.2588, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.3483,  0.2485,  0.1671],
        [-0.3406, -0.0271, -0.1022],
        [-0.4085,  0.2256, -0.0616],
        [ 0.0439,  0.0538, -0.0476],
        [-0.1109,  0.2911,  0.0754],
        [ 0.0512,  0.0462, -0.2173]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(1.2196, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0508,  0.2251, -0.0457],
        [-0.1034,  0.2494, -0.1685],
        [-0.1249,  0.2395,  0.0141],
        [-0.1715,  0.1827, -0.2325],
        [-0.2362,  0.3657,  0.0971],
        [ 0.0620,  0.1640, -0.3860]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[-0.3483,  0.2485,  0.1671, -0.0508,  0.2251, -0.0457],
        [-0.3406, -0.0271, -0.1022, -0.1034,  0.2494, -0.1685],
        

epoch:0, loss:-0.17791691422462463
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6328, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[2.9321, 1.1308, 1.6843],
        [3.1113, 1.2461, 1.5499],
        [3.3121, 1.1817, 1.3990],
        [2.9995, 1.4021, 1.5574],
        [2.8733, 1.2003, 1.7299],
        [3.1010, 1.1621, 1.5338]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(4.3609, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-2.1508,  2.7776, -0.7291],
        [-1.9377,  2.7988, -0.6852],
        [-2.5241,  3.0473, -0.7228],
        [-2.2964,  2.6774, -0.5662],
        [-2.4452,  2.8751, -0.5770],
        [-2.4181,  2.7319, -0.6555]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 2.9321,  1.1308,  1.6843, -2.1508,  2.7776, -0.7291],
        [ 3.1113,  1.2461,  1.5499, -1.9377,  2.7988, -0.6852]

epoch:0, loss:-0.26167526841163635
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6597, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[4.2108, 1.5439, 2.4225],
        [4.2962, 1.6063, 2.4506],
        [4.1846, 1.4194, 2.6070],
        [4.1374, 1.4106, 2.4692],
        [4.1869, 1.5059, 2.4124],
        [4.1078, 1.5233, 2.4083]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(1.1659, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-3.0037,  3.9510, -0.5614],
        [-3.1432,  3.7805, -0.6219],
        [-2.9730,  3.8102, -0.6911],
        [-2.8791,  3.8796, -0.5673],
        [-3.0711,  3.8137, -0.7747],
        [-3.1076,  3.8376, -0.7791]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 4.2108,  1.5439,  2.4225, -3.0037,  3.9510, -0.5614],
        [ 4.2962,  1.6063,  2.4506, -3.1432,  3.7805, -0.6219]

epoch:0, loss:-0.31249937415122986
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.4625, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[4.7480, 1.8749, 2.7455],
        [4.7920, 1.8228, 2.5980],
        [4.7866, 1.8071, 2.9938],
        [4.8308, 1.7399, 2.7655],
        [4.8778, 1.7987, 2.8968],
        [4.9816, 1.7921, 2.6735]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(4.8112, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-3.3566,  4.4263, -0.7442],
        [-3.2883,  4.3899, -0.6430],
        [-3.4284,  4.4104, -0.5559],
        [-3.3672,  4.4412, -0.5538],
        [-3.5931,  4.5528, -0.6479],
        [-3.1541,  4.3138, -0.8813]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 4.7480,  1.8749,  2.7455, -3.3566,  4.4263, -0.7442],
        [ 4.7920,  1.8228,  2.5980, -3.2883,  4.3899, -0.6430]

epoch:0, loss:-0.350986123085022
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5214, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[5.2720, 1.9372, 3.1063],
        [5.6215, 2.1153, 3.1005],
        [5.2766, 2.1109, 2.7818],
        [5.4172, 1.8002, 3.1404],
        [5.4382, 2.1813, 3.1332],
        [5.6060, 1.9655, 3.1076]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(8.7229, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-3.8403,  4.8598, -1.0236],
        [-3.8026,  4.6085, -0.6693],
        [-3.7426,  5.0716, -0.7642],
        [-3.7528,  4.6362, -1.0890],
        [-4.0569,  5.0219, -0.8203],
        [-4.1454,  4.7768, -0.8135]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 5.2720,  1.9372,  3.1063, -3.8403,  4.8598, -1.0236],
        [ 5.6215,  2.1153,  3.1005, -3.8026,  4.6085, -0.6693],


epoch:0, loss:-0.3713473975658417
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2755, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[5.7704, 2.0871, 3.1950],
        [5.8448, 2.3968, 3.1047],
        [5.6438, 2.2079, 3.3146],
        [5.7777, 2.3711, 3.3115],
        [5.6570, 1.8643, 3.3556],
        [5.6707, 2.2456, 3.3199]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(7.8306, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-4.2573,  4.9332, -1.0308],
        [-4.2207,  5.3601, -0.7476],
        [-4.3376,  5.2413, -0.7482],
        [-4.2566,  5.1266, -0.9241],
        [-3.8522,  5.2785, -0.8983],
        [-4.1329,  5.3555, -0.8928]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 5.7704,  2.0871,  3.1950, -4.2573,  4.9332, -1.0308],
        [ 5.8448,  2.3968,  3.1047, -4.2207,  5.3601, -0.7476],

epoch:0, loss:-0.40841788053512573
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.3707, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[6.1219, 2.1653, 3.4471],
        [6.0420, 2.2707, 3.2155],
        [6.1204, 2.4539, 3.4728],
        [6.2120, 2.4972, 3.5932],
        [6.0399, 2.1419, 3.4393],
        [6.1195, 2.3012, 3.1290]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(7.6650, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-4.6240,  5.6262, -0.8905],
        [-4.4800,  5.8267, -0.8790],
        [-4.4769,  5.4742, -1.0170],
        [-4.4014,  5.4169, -0.5590],
        [-4.2504,  5.5713, -1.0663],
        [-4.1961,  5.4025, -1.0514]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 6.1219,  2.1653,  3.4471, -4.6240,  5.6262, -0.8905],
        [ 6.0420,  2.2707,  3.2155, -4.4800,  5.8267, -0.8790]

epoch:0, loss:-0.4160290062427521
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7632, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[6.5283, 2.3633, 3.6902],
        [6.1654, 2.4617, 3.4539],
        [6.5771, 2.3255, 3.6700],
        [6.5692, 2.2730, 3.5003],
        [6.4384, 2.3271, 3.5513],
        [6.3624, 2.4405, 3.3414]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(4.5486, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-4.6152,  5.8052, -0.8687],
        [-4.5168,  5.5608, -0.9435],
        [-4.5599,  5.8787, -1.1655],
        [-4.7783,  5.6384, -0.9950],
        [-4.5071,  5.6755, -1.0582],
        [-4.5354,  5.6648, -1.0199]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 6.5283,  2.3633,  3.6902, -4.6152,  5.8052, -0.8687],
        [ 6.1654,  2.4617,  3.4539, -4.5168,  5.5608, -0.9435],

epoch:0, loss:-0.4426882863044739
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8852, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[6.5527, 2.7617, 3.6206],
        [6.6302, 2.3654, 3.6918],
        [6.7280, 2.5668, 3.8241],
        [6.6151, 2.5408, 3.7568],
        [6.6341, 2.4861, 3.6691],
        [6.6957, 2.8105, 3.8899]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(8.3318, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-4.8017,  6.2163, -1.1892],
        [-4.9945,  5.9145, -1.1108],
        [-4.6943,  5.5714, -0.8944],
        [-4.7906,  6.0320, -1.0865],
        [-4.8322,  6.0923, -0.9022],
        [-4.8838,  6.0631, -1.1726]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 6.5527,  2.7617,  3.6206, -4.8017,  6.2163, -1.1892],
        [ 6.6302,  2.3654,  3.6918, -4.9945,  5.9145, -1.1108],

epoch:0, loss:-0.45988696813583374
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.5505, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[7.0021, 2.7384, 3.8122],
        [7.0629, 2.3980, 3.8740],
        [6.6990, 2.5819, 3.7740],
        [6.9025, 2.5366, 3.8603],
        [6.7662, 2.5581, 4.0874],
        [6.7937, 2.7060, 3.8744]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.5377, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-5.0950,  5.8482, -1.0841],
        [-4.8923,  6.2187, -0.9992],
        [-5.1955,  6.1598, -1.0782],
        [-5.0497,  6.1740, -1.3315],
        [-4.9367,  6.3059, -1.0777],
        [-4.9945,  6.2446, -1.4393]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 7.0021,  2.7384,  3.8122, -5.0950,  5.8482, -1.0841],
        [ 7.0629,  2.3980,  3.8740, -4.8923,  6.2187, -0.9992

epoch:0, loss:-0.489841103553772
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.5624, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[7.1472, 2.7331, 4.0434],
        [7.2413, 2.7668, 4.2322],
        [7.0197, 2.9615, 4.0251],
        [7.1407, 2.6726, 3.9210],
        [7.0474, 2.7657, 4.2473],
        [7.1513, 2.7982, 4.2134]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.3082, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-5.2050,  6.3870, -1.3335],
        [-5.3037,  6.4373, -0.9050],
        [-5.3303,  6.3139, -1.2542],
        [-5.4561,  6.5491, -1.2118],
        [-5.1592,  6.1824, -1.0092],
        [-5.1322,  6.5677, -1.3531]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 7.1472,  2.7331,  4.0434, -5.2050,  6.3870, -1.3335],
        [ 7.2413,  2.7668,  4.2322, -5.3037,  6.4373, -0.9050],

epoch:0, loss:-0.5000242590904236
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5921, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[7.1973, 2.9679, 4.2723],
        [7.0820, 2.3897, 4.1541],
        [7.2421, 3.0852, 3.9093],
        [7.2862, 2.8745, 4.2553],
        [7.3157, 2.8379, 4.1355],
        [7.1106, 2.7822, 4.0068]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(5.3728, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-5.3371,  6.7531, -1.3665],
        [-5.4936,  6.6110, -1.3625],
        [-5.3842,  6.7855, -1.3166],
        [-5.5124,  6.7963, -1.4470],
        [-5.5350,  6.8306, -1.2095],
        [-5.4527,  6.6297, -1.3027]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 7.1973,  2.9679,  4.2723, -5.3371,  6.7531, -1.3665],
        [ 7.0820,  2.3897,  4.1541, -5.4936,  6.6110, -1.3625],

epoch:0, loss:-0.5116162300109863
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[7.7914, 2.8037, 4.0934],
        [7.6103, 2.8173, 4.1805],
        [7.7271, 3.0789, 4.3556],
        [7.5897, 3.0435, 4.4325],
        [7.7282, 3.0554, 4.2687],
        [7.4793, 2.6256, 4.4616]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(5.5508, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-5.5483,  6.9712, -1.3863],
        [-5.8678,  6.9386, -1.4850],
        [-5.6383,  6.8091, -1.1941],
        [-5.6247,  6.7692, -1.5230],
        [-5.7707,  6.7230, -1.2303],
        [-5.3191,  6.7137, -1.2342]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 7.7914,  2.8037,  4.0934, -5.5483,  6.9712, -1.3863],
        [ 7.6103,  2.8173,  4.1805, -5.8678,  6.9386, -1.4850],

epoch:0, loss:-0.533570408821106
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9069, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[7.8405, 2.8633, 4.6784],
        [7.5842, 3.2657, 4.3416],
        [7.7074, 3.2516, 4.3929],
        [7.7907, 3.2338, 4.6252],
        [7.9225, 2.8695, 4.5557],
        [7.6885, 3.0718, 4.6302]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(11.4386, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-5.5617,  7.0818, -1.4301],
        [-5.7131,  6.9222, -1.3310],
        [-5.8840,  7.1944, -1.5142],
        [-5.7947,  7.0834, -1.5130],
        [-5.9265,  6.9768, -1.4119],
        [-6.1010,  7.1424, -1.4917]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 7.8405,  2.8633,  4.6784, -5.5617,  7.0818, -1.4301],
        [ 7.5842,  3.2657,  4.3416, -5.7131,  6.9222, -1.3310],

epoch:0, loss:-0.5530727505683899
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0393, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[8.1103, 3.0751, 4.6589],
        [8.1688, 3.0808, 4.5334],
        [8.0513, 2.6807, 4.7159],
        [7.9602, 3.0839, 4.5523],
        [7.9662, 3.0955, 4.7329],
        [7.9598, 3.0720, 4.5838]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.1867, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.0299,  7.2921, -1.5642],
        [-6.0528,  7.3035, -1.9093],
        [-5.7840,  6.9589, -1.6524],
        [-5.9278,  6.9459, -1.5333],
        [-5.8052,  7.1624, -1.7804],
        [-5.9282,  7.2125, -1.5315]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 8.1103,  3.0751,  4.6589, -6.0299,  7.2921, -1.5642],
        [ 8.1688,  3.0808,  4.5334, -6.0528,  7.3035, -1.9093]

epoch:0, loss:-0.5816639065742493
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6441, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[8.1452, 3.0812, 4.6433],
        [8.1866, 3.2273, 4.8599],
        [7.9937, 3.2368, 4.7951],
        [8.5749, 3.3355, 4.5890],
        [7.9435, 3.3766, 4.6307],
        [8.1320, 3.3877, 4.6414]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.5152, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.0583,  7.3064, -1.6683],
        [-6.2936,  7.5577, -1.8525],
        [-5.9501,  7.2783, -1.8723],
        [-6.0633,  7.2123, -1.7613],
        [-6.0472,  7.7873, -1.5834],
        [-6.0318,  7.3997, -1.8449]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 8.1452,  3.0812,  4.6433, -6.0583,  7.3064, -1.6683],
        [ 8.1866,  3.2273,  4.8599, -6.2936,  7.5577, -1.8525]

epoch:0, loss:-0.5784471035003662
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0387, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[8.2413, 3.5929, 4.8334],
        [8.3750, 3.4678, 4.9543],
        [8.4294, 3.5411, 4.7810],
        [8.3884, 3.4962, 5.2478],
        [8.4338, 3.1817, 4.9398],
        [8.4235, 3.3679, 4.8402]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(8.6619, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.4730,  7.7916, -1.6170],
        [-6.3171,  7.6528, -1.9404],
        [-6.1620,  7.5380, -1.7488],
        [-6.3134,  7.8736, -2.0820],
        [-6.1008,  7.6906, -1.5422],
        [-6.4215,  7.8107, -1.8442]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 8.2413,  3.5929,  4.8334, -6.4730,  7.7916, -1.6170],
        [ 8.3750,  3.4678,  4.9543, -6.3171,  7.6528, -1.9404],

epoch:0, loss:-0.6062086224555969
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2842, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[8.4560, 3.1501, 5.0505],
        [8.5520, 3.5366, 5.1407],
        [8.6231, 3.5542, 5.1566],
        [8.7337, 3.7480, 4.9136],
        [8.4257, 3.6364, 5.0380],
        [8.6512, 3.4931, 5.0445]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(8.8000, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.4466,  7.6200, -1.7335],
        [-6.6089,  7.8281, -2.1189],
        [-6.4882,  7.8372, -2.0004],
        [-6.4231,  7.6015, -2.0334],
        [-6.3883,  7.7759, -2.0694],
        [-6.5209,  7.8815, -1.9277]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 8.4560,  3.1501,  5.0505, -6.4466,  7.6200, -1.7335],
        [ 8.5520,  3.5366,  5.1407, -6.6089,  7.8281, -2.1189],

epoch:0, loss:-0.6349141001701355
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.2027, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[8.6559, 3.3566, 5.1663],
        [8.7655, 3.5588, 5.2275],
        [8.6868, 3.7188, 4.9619],
        [8.8802, 3.9023, 5.2893],
        [8.8982, 3.2307, 5.2119],
        [8.7024, 3.3128, 4.9352]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(11.2458, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.6369,  7.6543, -2.1976],
        [-6.6149,  7.7691, -1.9625],
        [-6.5375,  8.1765, -2.0115],
        [-6.4560,  8.1015, -2.0965],
        [-6.6606,  8.0573, -2.0185],
        [-6.2653,  7.9008, -1.9147]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 8.6559,  3.3566,  5.1663, -6.6369,  7.6543, -2.1976],
        [ 8.7655,  3.5588,  5.2275, -6.6149,  7.7691, -1.9625]

epoch:0, loss:-0.6473920941352844
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2704, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[8.8642, 3.7152, 5.3754],
        [8.7436, 3.8064, 5.0238],
        [8.8379, 4.2190, 5.3371],
        [8.8897, 4.0828, 5.4018],
        [8.9560, 3.7090, 5.2766],
        [9.0723, 3.8451, 5.1713]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.8821, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.7046,  7.9279, -1.9237],
        [-6.8694,  8.2686, -2.0035],
        [-6.5026,  8.0666, -2.3564],
        [-6.8109,  8.2428, -2.0449],
        [-6.8219,  8.1933, -2.2280],
        [-6.7789,  8.1047, -2.3303]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 8.8642,  3.7152,  5.3754, -6.7046,  7.9279, -1.9237],
        [ 8.7436,  3.8064,  5.0238, -6.8694,  8.2686, -2.0035]

epoch:0, loss:-0.6580385565757751
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.6663, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[9.1608, 3.8238, 5.5144],
        [8.9207, 3.4762, 5.4681],
        [9.0351, 3.9968, 5.2381],
        [8.9287, 3.8981, 5.4365],
        [9.0574, 3.9775, 5.2195],
        [8.9200, 3.6443, 5.5810]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.0144, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-6.7205,  8.5047, -1.9811],
        [-7.0838,  8.2279, -2.2602],
        [-7.3227,  8.4771, -2.2548],
        [-7.0618,  8.2201, -2.1539],
        [-7.1452,  8.1922, -1.8942],
        [-7.0472,  8.3759, -2.3685]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 9.1608,  3.8238,  5.5144, -6.7205,  8.5047, -1.9811],
        [ 8.9207,  3.4762,  5.4681, -7.0838,  8.2279, -2.2602]

epoch:0, loss:-0.6662132143974304
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9313, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[9.0293, 4.1295, 5.4565],
        [9.1038, 4.0082, 5.3800],
        [9.3411, 4.1434, 5.3899],
        [9.3570, 3.9539, 5.6328],
        [9.2597, 4.0759, 5.5061],
        [9.4133, 3.9972, 5.5795]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(7.8101, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-7.1830,  8.4029, -2.4935],
        [-7.1375,  8.4465, -2.5262],
        [-7.0768,  8.1859, -2.1685],
        [-7.0769,  8.4725, -2.3924],
        [-7.0754,  8.6900, -2.3887],
        [-6.9862,  8.7042, -2.4119]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 9.0293,  4.1295,  5.4565, -7.1830,  8.4029, -2.4935],
        [ 9.1038,  4.0082,  5.3800, -7.1375,  8.4465, -2.5262],

epoch:0, loss:-0.6897462606430054
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1856, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[9.5358, 3.9461, 5.7076],
        [9.4130, 4.0535, 5.6407],
        [9.4265, 4.0624, 5.5868],
        [9.4147, 3.8903, 5.6282],
        [9.4824, 4.1647, 6.0035],
        [9.2702, 3.8335, 5.6279]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.3388, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-7.2114,  8.5195, -2.3513],
        [-7.3776,  8.6565, -2.5785],
        [-7.4333,  8.6382, -2.4591],
        [-7.1491,  8.7463, -2.5022],
        [-6.5182,  8.4786, -2.2134],
        [-7.1429,  8.5003, -2.4267]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 9.5358,  3.9461,  5.7076, -7.2114,  8.5195, -2.3513],
        [ 9.4130,  4.0535,  5.6407, -7.3776,  8.6565, -2.5785]

epoch:0, loss:-0.7066841721534729
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9577, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[9.3998, 4.1404, 5.8787],
        [9.7519, 4.4040, 5.7173],
        [9.6475, 4.0621, 5.9011],
        [9.7444, 3.9629, 5.8084],
        [9.5805, 3.9946, 5.8905],
        [9.4868, 4.2593, 5.6273]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(9.0556, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-7.5137,  8.6824, -2.5221],
        [-7.2725,  8.7997, -2.3156],
        [-7.0845,  8.5703, -2.5766],
        [-7.3944,  8.8815, -2.6094],
        [-7.3513,  8.8527, -2.5492],
        [-7.4548,  8.6744, -2.4683]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 9.3998,  4.1404,  5.8787, -7.5137,  8.6824, -2.5221],
        [ 9.7519,  4.4040,  5.7173, -7.2725,  8.7997, -2.3156],

epoch:0, loss:-0.7235301733016968
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6425, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[9.5974, 4.3983, 5.8997],
        [9.6907, 4.0609, 5.9992],
        [9.4295, 4.1710, 5.4361],
        [9.6560, 4.4748, 6.0318],
        [9.8987, 4.6122, 5.8633],
        [9.6200, 4.3660, 5.8512]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(5.3576, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-7.3026,  8.7895, -2.6403],
        [-7.1561,  8.7779, -2.7400],
        [-7.3149,  8.7385, -2.6237],
        [-7.4770,  8.7608, -2.7425],
        [-7.4705,  8.9039, -2.8012],
        [-7.6382,  9.1100, -2.7052]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 9.5974,  4.3983,  5.8997, -7.3026,  8.7895, -2.6403],
        [ 9.6907,  4.0609,  5.9992, -7.1561,  8.7779, -2.7400],

epoch:0, loss:-0.732046902179718
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8817, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 9.8545,  4.0980,  5.9736],
        [ 9.9294,  4.4225,  6.1864],
        [ 9.6896,  4.2592,  5.9905],
        [10.1362,  4.5008,  5.9002],
        [ 9.7477,  4.1724,  5.9605],
        [ 9.6827,  4.3928,  5.7279]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.9614, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-7.7552,  9.0371, -2.6520],
        [-7.6361,  9.0524, -2.5972],
        [-7.4233,  9.0329, -2.8759],
        [-7.5980,  9.0360, -2.6804],
        [-7.1948,  9.1123, -3.0444],
        [-7.5655,  8.9307, -3.0452]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 9.8545,  4.0980,  5.9736, -7.7552,  9.0371, -2.6520],
        [ 9.9294,  4.4225,  6.1864, -7

epoch:0, loss:-0.7614975571632385
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5947, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 9.9371,  4.2866,  6.3485],
        [ 9.8589,  4.5410,  5.9919],
        [ 9.8516,  4.5903,  5.9308],
        [10.1264,  4.6243,  6.0905],
        [ 9.9573,  4.5733,  6.3119],
        [10.1102,  4.8722,  6.0990]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(11.8137, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-7.9647,  9.2978, -2.6510],
        [-7.6067,  9.1619, -2.9675],
        [-7.5849,  9.2009, -3.0305],
        [-7.7905,  8.8821, -2.9437],
        [-7.9154,  9.3316, -2.6929],
        [-7.6437,  9.2618, -2.7500]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 9.9371,  4.2866,  6.3485, -7.9647,  9.2978, -2.6510],
        [ 9.8589,  4.5410,  5.9919, -

epoch:0, loss:-0.7725898027420044
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.8230, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[10.1537,  4.9059,  6.2239],
        [ 9.9719,  4.3718,  6.1374],
        [10.2053,  4.5566,  5.9869],
        [10.2180,  4.3909,  6.2026],
        [10.4198,  4.4720,  6.5079],
        [ 9.8701,  4.3915,  6.2913]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(9.7671, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-7.7181,  9.4234, -2.9288],
        [-7.7518,  9.4364, -3.0562],
        [-7.8438,  9.3697, -3.0834],
        [-7.7529,  9.2596, -2.8137],
        [-7.8402,  9.1784, -2.8476],
        [-7.5751,  9.3606, -2.8961]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[10.1537,  4.9059,  6.2239, -7.7181,  9.4234, -2.9288],
        [ 9.9719,  4.3718,  6.1374, -7

epoch:0, loss:-0.7848023176193237
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5257, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 9.9425,  4.6657,  6.3307],
        [10.1727,  4.6496,  6.1366],
        [10.4117,  4.5610,  6.2597],
        [10.0844,  4.7668,  6.4278],
        [10.3730,  4.8320,  6.5488],
        [10.1786,  4.5742,  6.2642]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.0523, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.1916,  9.3434, -3.2110],
        [-7.9237,  9.4006, -3.0279],
        [-7.7132,  9.6453, -2.9175],
        [-7.8679,  9.4235, -3.0249],
        [-7.9975,  9.6112, -2.9272],
        [-8.1852,  9.4360, -3.0339]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 9.9425,  4.6657,  6.3307, -8.1916,  9.3434, -3.2110],
        [10.1727,  4.6496,  6.1366, -

epoch:0, loss:-0.8075751662254333
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6415, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[10.5017,  4.9242,  6.3574],
        [10.4452,  4.7404,  6.1804],
        [10.7521,  4.6863,  6.5532],
        [10.2913,  4.6851,  6.4061],
        [10.6960,  4.8499,  6.6416],
        [10.5082,  4.8443,  6.4115]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.9221, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.3605,  9.5742, -3.2964],
        [-7.8889,  9.7928, -3.3339],
        [-8.0974,  9.1355, -3.2030],
        [-7.8579,  9.7008, -3.2424],
        [-8.2222,  9.3919, -3.1993],
        [-8.0129,  9.5073, -3.1163]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[10.5017,  4.9242,  6.3574, -8.3605,  9.5742, -3.2964],
        [10.4452,  4.7404,  6.1804, -

epoch:0, loss:-0.8238974809646606
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3775, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[10.7902,  4.8918,  6.3407],
        [10.8018,  4.7878,  6.6030],
        [10.1125,  4.6902,  6.8323],
        [10.4958,  4.5758,  6.6159],
        [10.7198,  5.1960,  6.4580],
        [10.5698,  4.9224,  6.3240]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(11.9096, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.2398,  9.7275, -3.5074],
        [-8.3284,  9.7011, -3.1214],
        [-8.0010,  9.4964, -3.0819],
        [-8.2437,  9.6644, -3.2557],
        [-8.4235,  9.6611, -3.2637],
        [-8.1643,  9.8783, -3.3762]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[10.7902,  4.8918,  6.3407, -8.2398,  9.7275, -3.5074],
        [10.8018,  4.7878,  6.6030, -

epoch:0, loss:-0.8144720196723938
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5014, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[10.4005,  5.1678,  6.3411],
        [10.8081,  4.9628,  6.7549],
        [10.5462,  4.8126,  6.6321],
        [10.8447,  4.9401,  6.4175],
        [10.5980,  4.8596,  6.4661],
        [10.6707,  5.0204,  6.8263]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.2082, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.2771, 10.1194, -3.4142],
        [-8.3360,  9.6828, -3.0117],
        [-7.9544, 10.0869, -3.3301],
        [-8.2560,  9.9306, -3.6725],
        [-8.3766, 10.0450, -3.3763],
        [-8.3259, 10.0864, -3.1757]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[10.4005,  5.1678,  6.3411, -8.2771, 10.1194, -3.4142],
        [10.8081,  4.9628,  6.7549, -

epoch:0, loss:-0.8328685164451599
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.7966, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[10.8598,  5.0530,  6.6300],
        [10.8090,  4.9932,  6.9734],
        [10.3817,  5.0980,  6.4200],
        [11.1010,  4.9935,  6.9080],
        [10.7676,  4.9618,  6.4405],
        [10.5192,  4.7707,  6.7828]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(9.1041, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.3768,  9.7742, -3.2555],
        [-8.4078,  9.9863, -3.7344],
        [-8.7473, 10.0022, -3.2026],
        [-8.2401,  9.4440, -3.5454],
        [-8.2871,  9.9036, -3.6891],
        [-8.4267, 10.0756, -3.3083]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[10.8598,  5.0530,  6.6300, -8.3768,  9.7742, -3.2555],
        [10.8090,  4.9932,  6.9734, -8

epoch:0, loss:-0.8491769433021545
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.0276, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.0009,  5.2608,  6.9585],
        [10.6685,  4.4631,  6.6492],
        [10.9264,  5.1235,  6.8691],
        [10.6006,  5.2335,  6.5935],
        [10.7799,  5.1682,  6.6388],
        [10.8992,  4.8579,  6.9820]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.3938, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.5538,  9.9386, -3.4712],
        [-8.7356, 10.0267, -3.4851],
        [-8.4359,  9.7851, -3.7588],
        [-8.6266, 10.2351, -3.6294],
        [-8.6135, 10.0266, -3.4992],
        [-8.2518,  9.7737, -3.5537]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.0009,  5.2608,  6.9585, -8.5538,  9.9386, -3.4712],
        [10.6685,  4.4631,  6.6492, -

epoch:0, loss:-0.8700042366981506
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.4757, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.0891,  5.5005,  6.7128],
        [10.9529,  5.1324,  6.6324],
        [11.0966,  5.0998,  6.6042],
        [11.1631,  4.9159,  6.9940],
        [10.3833,  5.1667,  6.8187],
        [11.1408,  5.0935,  6.9258]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.7995, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.5730,  9.9528, -3.6078],
        [-8.3209,  9.6520, -3.5166],
        [-8.5710,  9.9112, -3.4635],
        [-8.7496, 10.0969, -3.4145],
        [-8.6863, 10.5352, -3.8745],
        [-8.5665, 10.1465, -3.7623]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.0891,  5.5005,  6.7128, -8.5730,  9.9528, -3.6078],
        [10.9529,  5.1324,  6.6324, -

epoch:0, loss:-0.8824734091758728
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.2818, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.2939,  5.1399,  7.2220],
        [10.6046,  5.2883,  6.5497],
        [11.2653,  5.3380,  7.0367],
        [11.2453,  5.0951,  6.9011],
        [11.3049,  5.3709,  6.7962],
        [11.0689,  5.5510,  7.2410]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.7847, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.8713,  9.9837, -3.5909],
        [-8.5462, 10.1390, -3.8841],
        [-8.7508, 10.4028, -3.6878],
        [-8.4511, 10.2959, -3.9173],
        [-9.0247, 10.3876, -3.6630],
        [-8.6332, 10.3756, -3.7180]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.2939,  5.1399,  7.2220, -8.8713,  9.9837, -3.5909],
        [10.6046,  5.2883,  6.5497, -

epoch:0, loss:-0.9075533151626587
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.4423, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.4561,  5.0667,  7.1625],
        [10.9233,  5.4172,  6.5834],
        [11.2575,  5.3364,  7.1272],
        [11.3423,  5.8198,  6.9891],
        [11.2376,  5.3571,  7.1702],
        [11.0938,  5.1234,  7.1904]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.1519, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.8675, 10.3153, -3.9659],
        [-8.8187, 10.0580, -3.8764],
        [-8.9375, 10.1731, -3.6633],
        [-8.6952, 10.2245, -3.9320],
        [-8.8266, 10.4927, -3.9928],
        [-8.9613, 10.4149, -3.9256]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.4561,  5.0667,  7.1625, -8.8675, 10.3153, -3.9659],
        [10.9233,  5.4172,  6.5834, -

epoch:0, loss:-0.9247371554374695
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4337, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.4503,  5.7143,  7.1230],
        [11.2303,  5.6283,  7.0501],
        [11.4267,  5.5130,  7.2011],
        [11.3804,  5.8107,  6.8831],
        [11.6157,  5.4681,  7.1674],
        [11.5301,  5.5331,  7.0897]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.1775, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.3832, 10.2103, -4.1834],
        [-9.1676, 10.4574, -4.1615],
        [-9.3136, 10.4955, -4.1178],
        [-8.7494, 10.4035, -3.7686],
        [-9.0472, 10.6736, -3.9028],
        [-9.3069, 10.3668, -4.0940]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.4503,  5.7143,  7.1230, -9.3832, 10.2103, -4.1834],
        [11.2303,  5.6283,  7.0501, -

epoch:0, loss:-0.9233357906341553
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.2870, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.4781,  5.5329,  7.4217],
        [11.6264,  5.5907,  7.5048],
        [11.1986,  5.2385,  7.2949],
        [11.5024,  5.6950,  7.3712],
        [11.3594,  5.3274,  7.2534],
        [11.1926,  5.4144,  7.2476]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.3925, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-8.9231, 10.4463, -4.0845],
        [-9.3383, 10.5914, -3.9529],
        [-9.0891, 10.6698, -4.2974],
        [-9.3450, 10.7017, -4.0340],
        [-8.7096, 10.1282, -4.1536],
        [-8.7816, 10.4232, -3.9191]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.4781,  5.5329,  7.4217, -8.9231, 10.4463, -4.0845],
        [11.6264,  5.5907,  7.5048, -

epoch:0, loss:-0.9461562633514404
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4117, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.7187,  5.9345,  7.4230],
        [11.6673,  5.3638,  7.4730],
        [11.1783,  5.8441,  7.0689],
        [11.5826,  5.5187,  7.4407],
        [11.5185,  5.4366,  7.0424],
        [11.4231,  5.5407,  7.1424]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.7286, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.0970, 10.6265, -4.3249],
        [-9.0564, 10.4834, -4.2201],
        [-9.5063, 10.4318, -4.2052],
        [-9.0685, 10.5131, -4.4009],
        [-9.0821, 10.4498, -4.2164],
        [-9.4791, 10.5778, -4.2325]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.7187,  5.9345,  7.4230, -9.0970, 10.6265, -4.3249],
        [11.6673,  5.3638,  7.4730, -

epoch:0, loss:-0.9602506160736084
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.0311, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.5972,  5.7372,  7.4156],
        [11.9223,  5.6842,  7.5787],
        [11.8082,  5.5921,  7.2413],
        [11.4334,  5.9295,  7.0880],
        [11.9320,  5.6828,  7.3651],
        [11.4371,  5.5833,  7.4242]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(16.7533, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.1842, 10.9962, -4.3755],
        [-9.2701, 10.8047, -4.3554],
        [-8.9630, 10.8792, -4.3731],
        [-9.4620, 10.8224, -4.1804],
        [-9.0601, 10.8845, -4.1968],
        [-9.3047, 10.7308, -4.0974]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.5972,  5.7372,  7.4156, -9.1842, 10.9962, -4.3755],
        [11.9223,  5.6842,  7.5787, -

epoch:0, loss:-0.9883502125740051
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.0004, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.6294,  5.6588,  7.7694],
        [11.9264,  5.9720,  7.3848],
        [11.8355,  5.9598,  7.6337],
        [11.7494,  5.5187,  7.4227],
        [11.6684,  5.6420,  7.5038],
        [12.1237,  5.9173,  7.6258]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.6200, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.5904, 11.1771, -4.4022],
        [-9.1096, 11.0554, -4.5065],
        [-9.1414, 11.0928, -4.6349],
        [-9.0515, 10.9485, -4.2678],
        [-9.1959, 10.7800, -4.4231],
        [-9.6802, 11.0565, -4.4162]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.6294,  5.6588,  7.7694, -9.5904, 11.1771, -4.4022],
        [11.9264,  5.9720,  7.3848, -

epoch:0, loss:-0.9872353076934814
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.7140, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.6856,  5.8854,  7.5402],
        [11.6631,  5.6159,  7.6794],
        [11.8376,  5.6358,  7.9128],
        [11.7789,  5.9389,  7.6760],
        [11.8000,  5.7342,  7.7280],
        [11.5995,  5.8450,  7.3423]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.5617, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.3830, 10.9354, -4.7370],
        [-9.5459, 11.0802, -4.5841],
        [-9.5295, 10.8520, -4.5193],
        [-9.5062, 11.0396, -4.4305],
        [-9.2428, 10.7710, -4.3450],
        [-9.6826, 10.8775, -4.1590]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.6856,  5.8854,  7.5402, -9.3830, 10.9354, -4.7370],
        [11.6631,  5.6159,  7.6794, -

epoch:0, loss:-1.01519775390625
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7591, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[11.4581,  5.4576,  7.1314],
        [12.0463,  5.4562,  7.5567],
        [11.9665,  5.8356,  7.3997],
        [11.7933,  5.6774,  7.5395],
        [11.6639,  5.7664,  7.6733],
        [12.0580,  6.1294,  7.3702]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.9599, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.4843, 11.0160, -4.5496],
        [-9.7655, 11.0278, -4.3694],
        [-9.4927, 11.3092, -4.6069],
        [-9.7066, 11.1353, -4.5555],
        [-9.4042, 10.3459, -4.2718],
        [-9.5119, 11.0723, -4.5517]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[11.4581,  5.4576,  7.1314, -9.4843, 11.0160, -4.5496],
        [12.0463,  5.4562,  7.5567, -9.

epoch:0, loss:-1.0031715631484985
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7086, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.1520,  6.1054,  7.8830],
        [11.9239,  6.0983,  7.8385],
        [12.0180,  6.2557,  7.6993],
        [11.8344,  5.9160,  7.8072],
        [12.1059,  5.8170,  8.0587],
        [12.0628,  6.0490,  7.9150]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(16.4253, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.5706, 10.9978, -4.7397],
        [-9.5762, 11.3284, -5.0117],
        [-9.4930, 10.7103, -4.6153],
        [-9.3548, 11.1820, -4.8832],
        [-9.4694, 11.0666, -4.4646],
        [-9.7478, 11.0947, -4.4752]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[12.1520,  6.1054,  7.8830, -9.5706, 10.9978, -4.7397],
        [11.9239,  6.0983,  7.8385, -

epoch:0, loss:-1.0418213605880737
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8109, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.1533,  5.9968,  7.6524],
        [12.1678,  6.2404,  7.4953],
        [12.4671,  6.3386,  7.8428],
        [12.3061,  6.2882,  7.6349],
        [11.9616,  6.1209,  7.8743],
        [12.2394,  5.8717,  8.0423]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.7880, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.3514, 10.9708, -4.8152],
        [-9.9001, 10.9506, -4.7750],
        [-9.4458, 11.3617, -4.9601],
        [-9.8337, 11.0970, -4.4338],
        [-9.8022, 11.2082, -5.0126],
        [-9.6682, 11.1382, -4.7464]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[12.1533,  5.9968,  7.6524, -9.3514, 10.9708, -4.8152],
        [12.1678,  6.2404,  7.4953, -

epoch:0, loss:-1.0379489660263062
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.3842, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.0313,  6.1692,  7.9100],
        [12.0546,  6.2747,  8.0566],
        [12.5548,  5.8669,  7.9855],
        [11.9727,  6.0068,  7.8853],
        [12.1800,  6.3692,  8.1826],
        [12.0680,  6.4768,  7.9967]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.2019, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.8393, 11.0662, -4.6736],
        [-9.8788, 11.5129, -4.9049],
        [-9.9277, 11.5382, -5.0044],
        [-9.8411, 11.3612, -4.7184],
        [-9.9209, 11.4089, -4.8464],
        [-9.7976, 11.2264, -5.0726]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[12.0313,  6.1692,  7.9100, -9.8393, 11.0662, -4.6736],
        [12.0546,  6.2747,  8.0566, -

epoch:0, loss:-1.0615198612213135
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.6701, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.4392,  6.5843,  7.7363],
        [12.5269,  6.2253,  7.6623],
        [12.0984,  6.2811,  8.3155],
        [12.1647,  6.2143,  8.1928],
        [12.1692,  6.3665,  8.2407],
        [12.4133,  6.3228,  7.8781]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(8.9268, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-9.4792, 11.0571, -4.7696],
        [-9.8721, 11.6163, -4.7939],
        [-9.9815, 11.4208, -5.0407],
        [-9.8709, 11.4084, -4.9899],
        [-9.8149, 11.5667, -4.6174],
        [-9.8293, 11.1355, -4.7358]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[12.4392,  6.5843,  7.7363, -9.4792, 11.0571, -4.7696],
        [12.5269,  6.2253,  7.6623, -9

[SequenceClassifierOutput(loss=tensor(2.1635, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.6796,  6.5182,  7.6915],
        [12.3274,  6.1942,  8.0975],
        [12.2663,  6.2886,  8.5487],
        [12.6867,  6.5717,  8.0763],
        [12.1473,  6.1393,  8.2180],
        [12.4162,  6.1292,  7.8614]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(11.8656, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.1342,  11.4824,  -4.7724],
        [-10.0207,  11.4140,  -4.9038],
        [ -9.9299,  11.6493,  -5.0340],
        [ -9.9699,  11.5804,  -4.7958],
        [-10.0550,  11.5610,  -5.0897],
        [ -9.9365,  11.4404,  -5.0773]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 12.6796,   6.5182,   7.6915, -10.1342,  11.4824,  -4.7724],
        [ 12.3274,   6.1942,   8.0975, -10.0207,  11.4140,  -4

epoch:0, loss:-1.0862590074539185
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9797, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.6282,  6.7349,  8.0145],
        [12.2505,  6.3644,  8.0308],
        [12.3050,  6.6810,  8.1006],
        [12.4018,  6.6171,  8.4273],
        [12.3349,  6.1365,  7.7794],
        [12.4454,  6.7721,  8.0942]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.2463, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.2461,  11.6354,  -4.9266],
        [-10.0411,  11.5562,  -5.3102],
        [ -9.9559,  11.5618,  -4.9102],
        [ -9.9635,  11.4441,  -5.1705],
        [-10.1611,  11.6434,  -5.1792],
        [-10.0426,  11.5978,  -5.2882]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 12.6282,   6.7349,   8.0145, -10.2461,  11.6354,  -4.9266],
        [ 12.

epoch:0, loss:-1.0832853317260742
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6863, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.7388,  6.2570,  8.6226],
        [12.6434,  6.5663,  8.6459],
        [12.5456,  6.5667,  8.3896],
        [12.1024,  6.1868,  8.5650],
        [12.1746,  6.4227,  8.2539],
        [12.8854,  6.4720,  8.1317]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.7748, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ -9.9950,  11.4671,  -4.8497],
        [ -9.5344,  11.1031,  -4.9762],
        [-10.0843,  11.5522,  -5.1082],
        [-10.1936,  11.4621,  -5.2726],
        [ -9.8873,  11.6238,  -5.4370],
        [ -9.8739,  11.6078,  -5.1563]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 12.7388,   6.2570,   8.6226,  -9.9950,  11.4671,  -4.8497],
        [ 12.

epoch:0, loss:-1.1074005365371704
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7786, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.5478,  6.6166,  8.1947],
        [12.7299,  6.5061,  8.3223],
        [12.5660,  6.1778,  8.3241],
        [12.6376,  6.4827,  8.5054],
        [12.7586,  6.7305,  8.3037],
        [12.5925,  6.4452,  7.8801]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.4364, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.3707,  12.4444,  -5.4953],
        [ -9.8669,  11.2747,  -5.4412],
        [ -9.9003,  11.8906,  -5.4286],
        [-10.3004,  11.6104,  -5.0719],
        [ -9.9605,  11.6389,  -5.4824],
        [-10.2047,  11.8820,  -5.0866]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 12.5478,   6.6166,   8.1947, -10.3707,  12.4444,  -5.4953],
        [ 12.

epoch:0, loss:-1.1309915781021118
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7121, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.2460,  6.6745,  8.2782],
        [12.9375,  6.9714,  8.2943],
        [12.4981,  6.6414,  8.0913],
        [12.9756,  6.7663,  8.1187],
        [12.2441,  6.6129,  8.2909],
        [12.9442,  6.3962,  8.5566]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.8798, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.1759,  11.6098,  -5.2717],
        [-10.0126,  11.8458,  -5.2221],
        [ -9.9031,  11.7315,  -5.3404],
        [-10.2536,  12.0165,  -5.0968],
        [-10.2973,  11.9142,  -5.1093],
        [-10.1203,  11.9960,  -5.4633]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 12.2460,   6.6745,   8.2782, -10.1759,  11.6098,  -5.2717],
        [ 12.

epoch:0, loss:-1.149090051651001
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(4.8886, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.0252,  6.6729,  8.3681],
        [12.7631,  6.5614,  8.4837],
        [12.7358,  6.4525,  8.4127],
        [12.6191,  6.4296,  8.4121],
        [12.6790,  6.6409,  8.4789],
        [13.0019,  6.4858,  8.1816]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.6227, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ -9.9197,  12.0078,  -5.2405],
        [-10.3175,  11.9208,  -5.5458],
        [-10.1500,  11.7215,  -5.2391],
        [-10.1023,  11.0344,  -5.1301],
        [-10.2452,  11.7084,  -5.3083],
        [-10.0736,  11.6255,  -5.1063]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.0252,   6.6729,   8.3681,  -9.9197,  12.0078,  -5.2405],
        [ 12.7

epoch:0, loss:-1.1412949562072754
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1409, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.5955,  6.8361,  8.2974],
        [12.9163,  6.6057,  8.5394],
        [13.0839,  6.9614,  8.6634],
        [13.0023,  6.5604,  8.4226],
        [12.7522,  6.4490,  8.9700],
        [12.9813,  6.8070,  8.3851]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.8189, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.4338,  12.0763,  -5.5196],
        [-10.4920,  12.2891,  -5.7477],
        [-10.0434,  11.9639,  -5.2019],
        [ -9.9713,  11.6789,  -5.2822],
        [-10.4154,  11.7582,  -5.7320],
        [-10.2684,  12.0401,  -5.5697]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 12.5955,   6.8361,   8.2974, -10.4338,  12.0763,  -5.5196],
        [ 12.

epoch:0, loss:-1.1431611776351929
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6872, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.5894,  6.7373,  8.9186],
        [12.3142,  6.4648,  8.5330],
        [12.7144,  6.8106,  8.3201],
        [12.7874,  6.6707,  8.6372],
        [12.8281,  6.8767,  8.6023],
        [12.7060,  6.7719,  8.5264]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(22.2437, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.4078,  12.0063,  -5.7252],
        [-10.3629,  12.0831,  -5.3772],
        [-10.5548,  11.7671,  -5.2053],
        [-10.3452,  11.6871,  -5.6754],
        [-10.3322,  11.8702,  -5.7384],
        [-10.0435,  12.0021,  -5.5893]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 12.5894,   6.7373,   8.9186, -10.4078,  12.0063,  -5.7252],
        [ 12.

epoch:0, loss:-1.1660077571868896
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.7294, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.8625,  7.0203,  8.6298],
        [12.5755,  6.7607,  8.5209],
        [12.6283,  6.7718,  8.5020],
        [12.9333,  6.8278,  8.6468],
        [12.9040,  6.6493,  8.8439],
        [12.8762,  6.7859,  8.4538]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.1677, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ -9.9321,  12.0465,  -5.5219],
        [-10.3180,  11.6462,  -5.3906],
        [-10.5204,  12.1184,  -5.4882],
        [-10.4951,  12.0788,  -5.7070],
        [-10.4433,  11.8622,  -5.4950],
        [-10.3188,  12.1745,  -5.5561]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 12.8625,   7.0203,   8.6298,  -9.9321,  12.0465,  -5.5219],
        [ 12.

epoch:0, loss:-1.1726744174957275
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.1362, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.0610,  6.9262,  8.5687],
        [12.5764,  7.0710,  8.6022],
        [13.2417,  6.9333,  8.7556],
        [13.1240,  6.6740,  8.9294],
        [12.9629,  6.6852,  8.6834],
        [12.9383,  6.6815,  8.6506]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.8562, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.9167,  12.0532,  -5.9765],
        [-10.6195,  11.7632,  -5.6029],
        [-10.2671,  12.0622,  -5.6011],
        [-10.7041,  12.0308,  -5.5970],
        [-10.1437,  11.8910,  -5.3849],
        [-10.7826,  12.0331,  -5.5930]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.0610,   6.9262,   8.5687, -10.9167,  12.0532,  -5.9765],
        [ 12.

epoch:0, loss:-1.1810435056686401
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8307, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.4340,  7.0460,  8.6057],
        [12.9189,  6.9262,  8.8010],
        [12.9941,  7.0225,  8.7363],
        [13.2849,  7.0980,  8.7810],
        [13.4408,  7.1937,  8.9259],
        [13.2317,  6.8347,  8.7547]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.5879, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.2361,  12.0588,  -5.6637],
        [-10.4847,  11.6934,  -5.5007],
        [-10.4422,  12.1477,  -5.6843],
        [-10.7904,  12.3892,  -6.0548],
        [-10.7978,  12.3088,  -5.6191],
        [-10.7113,  11.9868,  -5.5354]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.4340,   7.0460,   8.6057, -10.2361,  12.0588,  -5.6637],
        [ 12.

epoch:0, loss:-1.2168235778808594
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9342, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[12.9408,  7.4551,  9.0183],
        [12.8550,  6.8985,  8.8136],
        [13.0927,  7.2593,  8.7441],
        [12.8177,  6.9565,  8.9436],
        [12.8761,  6.9134,  8.5800],
        [12.6423,  6.9800,  8.6040]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.3830, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.8117,  12.0220,  -5.6454],
        [-10.3521,  12.4623,  -5.6979],
        [-10.9516,  12.3052,  -5.7727],
        [-10.9205,  12.4726,  -5.9706],
        [-10.8711,  12.4775,  -5.8971],
        [-10.9263,  12.1554,  -5.6750]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 12.9408,   7.4551,   9.0183, -10.8117,  12.0220,  -5.6454],
        [ 12.

epoch:0, loss:-1.217397689819336
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7780, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.1811,  7.3850,  8.7862],
        [13.1483,  7.0367,  9.0509],
        [12.9785,  6.9155,  8.6332],
        [13.1110,  7.1528,  8.9959],
        [13.0732,  7.3806,  8.7651],
        [13.1932,  7.1296,  8.6648]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(22.6804, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.7425,  11.7126,  -5.8830],
        [-10.7532,  11.8843,  -5.9430],
        [-10.7226,  12.0678,  -5.5539],
        [-10.6529,  12.3786,  -6.1635],
        [-10.7222,  11.8473,  -5.9049],
        [-10.6547,  11.9435,  -5.8634]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.1811,   7.3850,   8.7862, -10.7425,  11.7126,  -5.8830],
        [ 13.1

epoch:0, loss:-1.1983891725540161
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7354, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.5494,  7.5453,  8.8348],
        [13.1632,  7.1368,  8.9728],
        [13.0590,  7.2265,  8.7390],
        [13.5407,  7.2184,  9.0451],
        [13.2037,  7.3072,  9.0690],
        [13.4378,  7.1569,  8.8458]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(18.4054, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.7811,  12.2248,  -5.7614],
        [-10.7733,  12.4181,  -6.0514],
        [-10.6714,  12.1297,  -5.8416],
        [-10.9011,  12.6397,  -6.0572],
        [-10.6934,  12.3517,  -5.6682],
        [-11.0249,  12.3835,  -6.1610]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.5494,   7.5453,   8.8348, -10.7811,  12.2248,  -5.7614],
        [ 13.

epoch:0, loss:-1.2322442531585693
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7694, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.7670,  7.3777,  8.9230],
        [13.3654,  7.0034,  8.8270],
        [13.1818,  7.4932,  9.0176],
        [13.2827,  7.4116,  9.1727],
        [13.3494,  7.2838,  8.8091],
        [13.6292,  7.2019,  8.9640]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.3730, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.5858,  12.2805,  -5.6695],
        [-11.1139,  12.6192,  -6.0590],
        [-10.9919,  12.2695,  -5.8140],
        [-11.1238,  12.2459,  -5.8697],
        [-10.9885,  12.5555,  -5.7217],
        [-11.1239,  12.3396,  -5.7995]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.7670,   7.3777,   8.9230, -10.5858,  12.2805,  -5.6695],
        [ 13.

epoch:0, loss:-1.2411150932312012
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.7838, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.3351,  7.3959,  9.0190],
        [13.3360,  7.5573,  8.6542],
        [13.5319,  7.4099,  8.9226],
        [13.0716,  7.5353,  9.1925],
        [13.8681,  7.3385,  9.0087],
        [13.3334,  7.3846,  8.9286]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(10.9010, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.0988,  12.4591,  -6.0961],
        [-11.1342,  12.7433,  -6.1266],
        [-10.9134,  12.1661,  -6.2458],
        [-10.8656,  12.2340,  -6.0167],
        [-11.0168,  12.5143,  -6.2882],
        [-11.0298,  12.2899,  -6.1226]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.3351,   7.3959,   9.0190, -11.0988,  12.4591,  -6.0961],
        [ 13.

epoch:0, loss:-1.2603883743286133
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2092, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.6063,  7.5071,  8.9898],
        [13.6550,  7.4268,  9.2530],
        [13.4960,  7.2139,  9.3477],
        [13.4320,  7.5423,  8.9667],
        [13.2955,  7.0505,  8.9795],
        [13.2336,  7.2606,  8.5273]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.6434, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.7498,  12.7558,  -6.4608],
        [-10.7614,  12.8482,  -6.4713],
        [-11.0152,  12.4282,  -6.4681],
        [-10.5861,  12.3729,  -6.1859],
        [-11.1563,  12.4950,  -5.9799],
        [-11.2280,  12.4126,  -5.8361]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.6063,   7.5071,   8.9898, -10.7498,  12.7558,  -6.4608],
        [ 13.

epoch:0, loss:-1.292298674583435
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7759, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.8874,  7.8286,  9.4912],
        [13.8240,  7.7115,  8.9909],
        [13.5400,  7.1632,  9.3403],
        [13.7503,  7.6438,  9.1679],
        [13.6831,  7.4742,  9.1105],
        [13.1037,  7.5640,  8.7597]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(22.1379, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.3130,  12.5366,  -6.4447],
        [-11.2525,  12.5020,  -6.5485],
        [-10.7735,  12.5423,  -6.6742],
        [-10.7687,  12.4697,  -6.2171],
        [-11.2943,  12.5102,  -6.1176],
        [-11.4301,  12.7107,  -6.2100]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.8874,   7.8286,   9.4912, -11.3130,  12.5366,  -6.4447],
        [ 13.8

epoch:0, loss:-1.307532787322998
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4378, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.5723,  7.4326,  9.3442],
        [13.3555,  7.7466,  9.3336],
        [13.6623,  7.3910,  9.2489],
        [13.6023,  7.5393,  9.1764],
        [13.5031,  7.2032,  9.6580],
        [13.7530,  7.6703,  9.7324]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.8667, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.0606,  12.5993,  -6.4914],
        [-11.2093,  12.9909,  -6.3201],
        [-11.3778,  12.5260,  -5.9521],
        [-11.0651,  12.4778,  -6.2516],
        [-11.1020,  12.8762,  -6.5525],
        [-11.0128,  12.8805,  -6.3575]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.5723,   7.4326,   9.3442, -11.0606,  12.5993,  -6.4914],
        [ 13.3

epoch:0, loss:-1.3207426071166992
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.8026, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.4639,  7.2769,  9.2052],
        [13.5763,  7.6684,  9.2487],
        [13.7580,  7.6961,  9.2219],
        [13.8022,  7.5528,  9.2810],
        [14.0547,  7.7055,  9.4377],
        [13.6289,  7.4112,  9.2344]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.3008, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-10.8881,  12.6764,  -6.2051],
        [-11.1559,  12.7412,  -6.6594],
        [-11.2277,  12.6283,  -6.3283],
        [-11.4140,  12.7093,  -6.5792],
        [-11.0554,  12.7559,  -6.5392],
        [-11.4408,  13.1728,  -6.6598]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.4639,   7.2769,   9.2052, -10.8881,  12.6764,  -6.2051],
        [ 13.

epoch:0, loss:-1.3278058767318726
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9883, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.3119,  7.2641,  9.4147],
        [14.2248,  7.9807,  9.4671],
        [13.7704,  7.8925,  9.1017],
        [13.8123,  7.5866,  9.4601],
        [13.8038,  7.7075,  8.9895],
        [13.0774,  7.5824,  9.1840]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(11.2178, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.0488,  12.7214,  -6.5485],
        [-11.2361,  12.7616,  -6.7420],
        [-11.4373,  12.4727,  -6.6606],
        [-11.3344,  12.6988,  -6.5600],
        [-11.3453,  13.2052,  -6.5326],
        [-11.5794,  13.1280,  -6.7536]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.3119,   7.2641,   9.4147, -11.0488,  12.7214,  -6.5485],
        [ 14.

epoch:0, loss:-1.3363558053970337
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.6109, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.5651,  7.4133,  9.5424],
        [14.0525,  7.5751,  9.2919],
        [14.0858,  7.7224,  9.2710],
        [14.0750,  7.7504,  9.4615],
        [14.0399,  7.8778,  9.6940],
        [13.8737,  7.4868,  9.2611]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.5384, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.3886,  12.7398,  -6.5924],
        [-11.5175,  12.8758,  -6.4667],
        [-11.3247,  12.9835,  -6.6196],
        [-11.5607,  12.7040,  -6.5384],
        [-11.3276,  12.8881,  -6.9478],
        [-11.3619,  12.9329,  -6.3564]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.5651,   7.4133,   9.5424, -11.3886,  12.7398,  -6.5924],
        [ 14.

epoch:0, loss:-1.3305561542510986
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.0140, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.9731,  7.9502,  9.5026],
        [13.7247,  7.7246,  9.4584],
        [13.9161,  7.8029,  9.3940],
        [14.0260,  7.8290,  9.2767],
        [14.0459,  7.6331,  9.5432],
        [13.6567,  8.1080,  9.3938]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.3532, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.5870,  12.5374,  -6.8871],
        [-11.4466,  13.0058,  -6.6003],
        [-11.5449,  12.9776,  -6.6054],
        [-11.6356,  12.8575,  -6.9269],
        [-11.6593,  12.9896,  -6.6617],
        [-11.4978,  13.0287,  -7.1083]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.9731,   7.9502,   9.5026, -11.5870,  12.5374,  -6.8871],
        [ 13.

epoch:0, loss:-1.3555312156677246
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0140, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.0376,  7.9224,  9.2393],
        [13.9647,  7.7473,  9.8362],
        [14.0666,  7.7301,  9.3766],
        [13.9579,  8.2109,  9.6007],
        [13.6918,  7.4015,  9.5364],
        [14.1946,  8.0599,  9.5306]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.2965, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.3418,  12.6565,  -6.5778],
        [-11.3914,  12.7811,  -6.4450],
        [-11.2591,  12.8025,  -6.4443],
        [-11.5452,  12.8976,  -6.8236],
        [-11.2228,  12.6696,  -6.7195],
        [-11.3207,  12.6774,  -6.8733]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.0376,   7.9224,   9.2393, -11.3418,  12.6565,  -6.5778],
        [ 13.

epoch:0, loss:-1.3896727561950684
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9940, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.7246,  7.7925,  9.1409],
        [13.6977,  8.0224,  9.6451],
        [13.8856,  7.6780,  9.7457],
        [13.7925,  7.7864,  9.3178],
        [13.8964,  7.7841,  9.3528],
        [13.7781,  7.9078,  9.4512]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(16.4978, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.6198,  13.0227,  -6.9853],
        [-11.7017,  13.3114,  -7.0272],
        [-11.5473,  13.1162,  -6.8947],
        [-11.3522,  13.3157,  -7.0110],
        [-11.6418,  13.3378,  -6.8321],
        [-11.5630,  12.9940,  -7.1482]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.7246,   7.7925,   9.1409, -11.6198,  13.0227,  -6.9853],
        [ 13.

epoch:0, loss:-1.3768171072006226
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.3859, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.1602,  7.9731,  9.3366],
        [14.0236,  7.7273,  9.7098],
        [13.7140,  7.9829,  9.6391],
        [14.2023,  7.9752,  9.7781],
        [14.0772,  8.0558,  9.5553],
        [13.9841,  8.0578,  9.8254]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.0510, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.1904,  13.1808,  -6.5874],
        [-11.4017,  13.0309,  -7.0697],
        [-11.3588,  13.1990,  -7.0890],
        [-11.6009,  13.1549,  -7.0247],
        [-11.5265,  13.0862,  -7.0664],
        [-11.5553,  13.1536,  -6.8823]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.1602,   7.9731,   9.3366, -11.1904,  13.1808,  -6.5874],
        [ 14.

epoch:0, loss:-1.3981001377105713
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7188, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.1133,  7.9608,  9.8656],
        [14.0426,  7.8669,  9.7885],
        [14.2889,  7.9640,  9.5719],
        [13.8091,  7.6681,  9.5741],
        [13.4928,  7.8370,  9.5694],
        [14.2638,  8.4913,  9.6683]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(16.5174, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.6011,  13.0849,  -6.9879],
        [-11.5685,  12.8862,  -6.7061],
        [-11.5284,  13.2577,  -6.9441],
        [-11.8145,  13.3629,  -7.0764],
        [-11.7313,  13.1927,  -7.0050],
        [-11.3558,  12.8228,  -6.7508]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.1133,   7.9608,   9.8656, -11.6011,  13.0849,  -6.9879],
        [ 14.

epoch:0, loss:-1.3792095184326172
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0118, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.6220,  8.2763,  9.7325],
        [14.3840,  8.3142,  9.8821],
        [14.2909,  8.0539,  9.6299],
        [14.2036,  7.8366,  9.7090],
        [14.1449,  8.2059,  9.4940],
        [14.2542,  8.0657,  9.6463]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.1897, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.9217,  13.5003,  -7.0788],
        [-11.5141,  13.4951,  -6.9753],
        [-11.5834,  13.0169,  -7.1482],
        [-11.6253,  12.7352,  -6.7297],
        [-11.8692,  12.9722,  -6.8071],
        [-11.8915,  13.4934,  -7.2526]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.6220,   8.2763,   9.7325, -11.9217,  13.5003,  -7.0788],
        [ 14.

epoch:0, loss:-1.4114047288894653
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4498, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.4131,  8.4676,  9.9572],
        [13.9295,  8.0497,  9.7729],
        [14.0272,  7.9726,  9.7472],
        [14.3907,  7.9293,  9.7279],
        [14.0834,  8.0776,  9.6341],
        [14.0451,  7.9517,  9.4823]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(18.5041, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.4022,  13.2692,  -7.1966],
        [-11.7602,  13.0299,  -6.5526],
        [-11.4032,  13.0038,  -6.9752],
        [-11.6790,  13.1430,  -7.2439],
        [-12.0570,  13.3547,  -7.1112],
        [-11.8791,  13.1028,  -7.2753]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.4131,   8.4676,   9.9572, -11.4022,  13.2692,  -7.1966],
        [ 13.

epoch:0, loss:-1.4496427774429321
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.7195, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[13.8971,  8.0647,  9.8630],
        [14.2467,  8.1496,  9.7696],
        [14.3983,  8.3301, 10.0449],
        [13.9715,  8.1256,  9.9597],
        [14.4740,  8.2959, 10.0343],
        [14.6314,  8.0570,  9.8910]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.2521, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.9861,  13.6218,  -7.2803],
        [-11.7855,  13.4282,  -7.4975],
        [-12.3240,  13.6262,  -7.1665],
        [-12.0168,  13.4644,  -7.3939],
        [-11.8174,  13.2258,  -7.0871],
        [-11.8952,  13.2978,  -7.6068]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 13.8971,   8.0647,   9.8630, -11.9861,  13.6218,  -7.2803],
        [ 14.

epoch:0, loss:-1.4223449230194092
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9535, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.3049,  8.2044,  9.8491],
        [14.3129,  8.1570,  9.5990],
        [13.9779,  8.1493,  9.7324],
        [14.5110,  8.2422,  9.8863],
        [14.0874,  8.6129,  9.9055],
        [14.3081,  7.8600, 10.1874]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(21.0734, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.4384,  13.6378,  -6.9447],
        [-12.0530,  13.7109,  -7.3817],
        [-11.6676,  13.4748,  -7.0475],
        [-11.8127,  13.3920,  -7.1377],
        [-12.0925,  13.5731,  -7.0549],
        [-11.7719,  13.5793,  -7.5321]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.3049,   8.2044,   9.8491, -11.4384,  13.6378,  -6.9447],
        [ 14.

epoch:0, loss:-1.454722285270691
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7550, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.3736,  7.9917, 10.0546],
        [14.3064,  7.9891,  9.8530],
        [14.3794,  8.1232,  9.9410],
        [14.2280,  7.8874,  9.6031],
        [14.9381,  8.1880, 10.0578],
        [14.5026,  7.7344, 10.1756]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.5250, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.8512,  13.2390,  -7.4420],
        [-12.0722,  13.4586,  -7.3956],
        [-11.8563,  13.4100,  -7.1493],
        [-12.1453,  13.7314,  -7.2340],
        [-11.8837,  13.2957,  -7.2618],
        [-11.9700,  13.3857,  -7.2178]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.3736,   7.9917,  10.0546, -11.8512,  13.2390,  -7.4420],
        [ 14.3

epoch:0, loss:-1.4805330038070679
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2790, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.4102,  8.4613, 10.2412],
        [14.3252,  8.4398,  9.6173],
        [14.2676,  8.7070, 10.3586],
        [14.1493,  8.5463,  9.6319],
        [14.5359,  8.0740,  9.8190],
        [14.2228,  8.2079,  9.8673]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(11.2996, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.4540,  13.4626,  -7.6339],
        [-12.4786,  13.9787,  -7.3484],
        [-12.0201,  13.5654,  -7.4117],
        [-11.9799,  13.4102,  -7.5672],
        [-11.7303,  13.5595,  -7.3251],
        [-12.0274,  13.8680,  -7.3670]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.4102,   8.4613,  10.2412, -11.4540,  13.4626,  -7.6339],
        [ 14.

epoch:0, loss:-1.484740138053894
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6409, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.5995,  8.3956, 10.4395],
        [14.6759,  8.6409, 10.4168],
        [14.3495,  8.3421, 10.0295],
        [14.3575,  8.5526,  9.8198],
        [14.1930,  8.3544, 10.2638],
        [14.2430,  8.2531, 10.5103]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(21.1804, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.7830,  13.4188,  -7.9249],
        [-12.3247,  13.4040,  -7.4036],
        [-11.5175,  13.6110,  -7.2567],
        [-12.0147,  13.4470,  -7.0127],
        [-11.8880,  13.5407,  -7.3926],
        [-12.2682,  13.5935,  -7.4178]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.5995,   8.3956,  10.4395, -11.7830,  13.4188,  -7.9249],
        [ 14.6

epoch:0, loss:-1.4644932746887207
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6150, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.7466,  8.4046, 10.2669],
        [14.4718,  8.3576, 10.1916],
        [14.0293,  8.6171,  9.9216],
        [14.5449,  8.6737,  9.9441],
        [14.3713,  8.3417, 10.1829],
        [14.7548,  8.1752,  9.9313]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.7596, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.3327,  13.6463,  -7.4797],
        [-12.1362,  13.8541,  -7.8462],
        [-12.1397,  13.6486,  -7.3805],
        [-12.3692,  13.6327,  -7.0373],
        [-12.3857,  13.3880,  -7.3944],
        [-12.2920,  13.3735,  -7.5494]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.7466,   8.4046,  10.2669, -12.3327,  13.6463,  -7.4797],
        [ 14.

epoch:0, loss:-1.5157538652420044
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4600, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.3759,  8.5765, 10.0625],
        [14.8701,  9.0992, 10.2117],
        [14.3666,  8.4158,  9.8085],
        [14.3211,  8.3062, 10.3760],
        [14.2754,  8.0563, 10.1554],
        [14.4890,  8.8683, 10.2865]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(7.9063, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-11.5588,  13.7096,  -7.6063],
        [-12.1955,  13.9208,  -7.5577],
        [-12.1882,  13.9432,  -7.7641],
        [-12.1110,  13.6195,  -7.7019],
        [-12.0009,  13.9293,  -7.6227],
        [-12.3927,  14.0385,  -7.7989]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.3759,   8.5765,  10.0625, -11.5588,  13.7096,  -7.6063],
        [ 14.8

epoch:0, loss:-1.527055025100708
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.6934, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.6534,  8.7217, 10.3290],
        [14.3921,  8.4149, 10.0259],
        [14.7811,  8.8425, 10.4509],
        [14.5522,  8.6430, 10.3609],
        [14.5570,  8.6625, 10.2857],
        [15.2390,  9.1182, 10.7189]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(21.6332, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.0260,  13.1707,  -7.7424],
        [-12.1229,  14.0208,  -7.7472],
        [-12.0927,  14.0426,  -7.7311],
        [-12.1045,  13.6400,  -7.4564],
        [-12.7195,  13.8409,  -7.8677],
        [-11.7832,  13.4324,  -7.8723]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.6534,   8.7217,  10.3290, -12.0260,  13.1707,  -7.7424],
        [ 14.3

epoch:0, loss:-1.5110142230987549
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2575, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.7940,  8.1925, 10.2531],
        [15.1225,  9.0241, 10.4331],
        [14.4528,  8.7529,  9.9982],
        [14.9858,  8.9509, 10.3531],
        [14.7447,  8.5704, 10.4224],
        [14.6409,  8.6147, 10.1145]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(11.4207, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.3490,  13.9823,  -7.8261],
        [-12.4214,  13.8759,  -7.6342],
        [-12.1408,  14.0723,  -7.6853],
        [-12.5560,  13.7661,  -7.9760],
        [-11.8603,  13.6263,  -7.1860],
        [-12.1410,  13.7623,  -7.8758]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.7940,   8.1925,  10.2531, -12.3490,  13.9823,  -7.8261],
        [ 15.

epoch:0, loss:-1.561151385307312
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4111, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.8676,  8.6765, 10.5511],
        [14.3157,  8.4757,  9.9905],
        [14.9749,  8.7443, 10.5805],
        [14.6026,  8.7089, 10.1229],
        [14.8361,  8.7673, 10.2395],
        [14.7562,  8.8509, 10.6145]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.4745, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.7031,  14.0244,  -7.8621],
        [-12.6289,  14.0481,  -7.7475],
        [-12.4489,  14.3300,  -7.6411],
        [-12.3262,  13.8335,  -7.6169],
        [-12.1679,  13.4300,  -7.4078],
        [-12.0053,  13.6102,  -7.8684]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.8676,   8.6765,  10.5511, -12.7031,  14.0244,  -7.8621],
        [ 14.3

epoch:0, loss:-1.549421787261963
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.0797, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.0112,  8.8853, 10.2923],
        [14.8896,  8.8664, 10.2413],
        [15.1103,  8.8590, 10.4497],
        [14.8774,  8.4824, 10.9474],
        [14.8357,  8.8429, 10.3290],
        [14.8833,  8.8064, 10.4886]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.6358, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.0960,  13.9570,  -7.8291],
        [-12.2637,  14.1142,  -7.8770],
        [-12.4060,  13.8078,  -7.8766],
        [-12.8175,  14.0764,  -7.9886],
        [-12.1040,  13.6691,  -7.9814],
        [-12.6084,  14.0458,  -7.9464]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.0112,   8.8853,  10.2923, -12.0960,  13.9570,  -7.8291],
        [ 14.8

epoch:0, loss:-1.5792523622512817
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9893, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.9601,  8.9383, 10.6415],
        [14.6248,  9.2926, 10.4696],
        [15.0650,  8.8964, 10.6116],
        [14.7589,  8.9224, 10.6705],
        [14.7620,  8.6652, 10.8735],
        [15.0934,  8.4516, 10.0294]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.4583, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.6078,  13.7997,  -7.8158],
        [-12.7590,  14.2450,  -8.1312],
        [-12.8983,  13.9413,  -7.8544],
        [-12.9246,  14.1289,  -7.9192],
        [-12.4801,  13.8767,  -7.4121],
        [-12.6225,  14.0475,  -8.0365]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.9601,   8.9383,  10.6415, -12.6078,  13.7997,  -7.8158],
        [ 14.

epoch:0, loss:-1.5588724613189697
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7226, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.6092,  8.6362, 10.7098],
        [14.6961,  8.7917, 10.8541],
        [14.9962,  8.8114, 10.5967],
        [14.9114,  8.7319, 10.3957],
        [14.8267,  9.0418, 10.5681],
        [15.0941,  9.0641, 10.6078]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.0158, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.5105,  14.3096,  -7.9769],
        [-12.8785,  14.2168,  -8.2668],
        [-12.5897,  13.7259,  -8.2611],
        [-13.0633,  14.2460,  -8.0422],
        [-12.4250,  13.7901,  -7.9081],
        [-12.3110,  13.8814,  -8.0692]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.6092,   8.6362,  10.7098, -12.5105,  14.3096,  -7.9769],
        [ 14.

epoch:0, loss:-1.6029311418533325
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4834, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[14.7087,  8.7753, 10.2396],
        [15.3295,  9.0910, 10.5420],
        [14.7577,  8.9507, 10.5203],
        [14.7686,  8.5541, 10.6000],
        [14.9369,  9.1179, 10.4978],
        [15.0612,  8.9081, 10.6880]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.4640, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.7275,  13.8796,  -8.2335],
        [-12.7984,  14.2643,  -8.1989],
        [-12.5797,  14.2329,  -8.1843],
        [-12.6424,  14.3798,  -8.0093],
        [-12.6812,  14.0048,  -8.0959],
        [-12.6561,  13.7521,  -8.0739]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 14.7087,   8.7753,  10.2396, -12.7275,  13.8796,  -8.2335],
        [ 15.

epoch:0, loss:-1.6046981811523438
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.0290, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.3103,  9.2315, 10.6847],
        [15.2626,  9.1264, 10.6454],
        [14.8513,  9.3266, 10.8273],
        [15.2745,  9.1575, 10.7596],
        [14.3896,  8.6794, 10.5741],
        [14.8910,  8.6364, 10.2281]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.9390, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.7441,  14.4945,  -8.3898],
        [-12.8574,  14.1880,  -8.3526],
        [-12.7676,  14.4043,  -8.1429],
        [-12.6649,  14.1456,  -8.2271],
        [-12.6510,  14.2573,  -8.0338],
        [-12.4891,  13.9699,  -8.0417]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.3103,   9.2315,  10.6847, -12.7441,  14.4945,  -8.3898],
        [ 15.

epoch:0, loss:-1.6345911026000977
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9773, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.1745,  9.4062, 10.6745],
        [15.0201,  9.3004, 10.9301],
        [15.4635,  9.3442, 10.8622],
        [15.1062,  8.9790, 10.8512],
        [14.9443,  9.0242, 10.4180],
        [15.0262,  9.0209, 10.9618]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.2494, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.4322,  14.3839,  -8.2980],
        [-12.1497,  13.6948,  -8.1414],
        [-12.5158,  14.4915,  -8.3815],
        [-12.9909,  14.4685,  -8.4698],
        [-12.9478,  14.4337,  -8.2703],
        [-12.8750,  14.1208,  -7.9908]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.1745,   9.4062,  10.6745, -12.4322,  14.3839,  -8.2980],
        [ 15.

epoch:0, loss:-1.6575394868850708
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(4.0249, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.1778,  9.1728, 10.6599],
        [14.9457,  9.0739, 11.0827],
        [14.9507,  9.1191, 10.8002],
        [15.2636,  9.3206, 10.5047],
        [15.0357,  9.0852, 10.9463],
        [15.5479,  9.5451, 11.1762]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.4903, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.7533,  14.3204,  -8.6695],
        [-12.8198,  14.4536,  -8.6101],
        [-12.9523,  14.5707,  -8.5340],
        [-12.9722,  13.7833,  -8.0269],
        [-12.6015,  14.0078,  -8.2595],
        [-12.5805,  14.4137,  -8.1282]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.1778,   9.1728,  10.6599, -12.7533,  14.3204,  -8.6695],
        [ 14.

epoch:0, loss:-1.646562099456787
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.1733, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.0847,  9.0822, 10.6005],
        [15.3901,  9.3220, 10.9693],
        [15.2217,  8.9324, 10.9239],
        [15.1034,  9.4430, 10.8891],
        [15.4695,  9.0728, 10.9562],
        [15.0125,  9.0956, 11.1651]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(22.1044, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.8830,  14.6669,  -8.3545],
        [-12.9843,  14.2433,  -8.4523],
        [-13.2890,  14.5669,  -8.6243],
        [-12.7691,  14.3439,  -8.5304],
        [-12.8276,  14.1723,  -8.3468],
        [-12.9005,  14.2185,  -8.4722]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.0847,   9.0822,  10.6005, -12.8830,  14.6669,  -8.3545],
        [ 15.3

epoch:0, loss:-1.680698037147522
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9694, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.4365,  9.5487, 10.7851],
        [15.3851,  9.3006, 10.6253],
        [15.0048,  9.2379, 10.5808],
        [14.8593,  9.1305, 10.7208],
        [15.2523,  9.2873, 11.0887],
        [15.4608,  9.1724, 10.9515]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(13.0134, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.8461,  14.1788,  -8.0828],
        [-13.1137,  14.3120,  -8.7172],
        [-12.9627,  14.4168,  -8.5135],
        [-12.6680,  14.6994,  -8.4885],
        [-13.2957,  14.2172,  -8.2025],
        [-12.8902,  14.2332,  -8.3951]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.4365,   9.5487,  10.7851, -12.8461,  14.1788,  -8.0828],
        [ 15.3

epoch:0, loss:-1.6503868103027344
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7398, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.4873,  9.4098, 11.0278],
        [14.9293,  9.5718, 10.8544],
        [15.4805,  9.4089, 11.1362],
        [15.2196,  9.4200, 10.6550],
        [14.9950,  9.3607, 10.8508],
        [15.3464,  9.2926, 10.6833]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.5342, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.0444,  14.6295,  -8.9089],
        [-12.8883,  14.0890,  -8.6268],
        [-12.8541,  14.4734,  -8.5381],
        [-13.2060,  14.7504,  -8.8852],
        [-12.8660,  14.5761,  -8.4996],
        [-12.8019,  14.3030,  -8.2948]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.4873,   9.4098,  11.0278, -13.0444,  14.6295,  -8.9089],
        [ 14.

epoch:0, loss:-1.6686094999313354
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2034, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.5082,  9.1374, 10.7955],
        [15.2647,  9.2706, 11.0597],
        [15.7411,  9.5740, 11.1605],
        [15.2333,  9.2336, 10.8796],
        [15.3601,  9.5669, 11.1633],
        [15.5356,  9.3874, 11.2884]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.5219, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.9108,  14.5872,  -8.6514],
        [-12.8376,  14.5192,  -8.5111],
        [-13.0747,  14.4878,  -8.7104],
        [-12.8198,  14.3538,  -8.6876],
        [-13.4011,  14.3357,  -8.5501],
        [-12.9071,  14.4554,  -8.6548]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.5082,   9.1374,  10.7955, -12.9108,  14.5872,  -8.6514],
        [ 15.

epoch:0, loss:-1.7004008293151855
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5062, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.2034,  9.4944, 10.7371],
        [15.5684,  9.4708, 11.1369],
        [14.9923,  9.1632, 10.9505],
        [15.4385,  9.3490, 10.8854],
        [15.1687,  9.4609, 10.7825],
        [15.6980,  9.6292, 11.1371]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.0206, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.1803,  14.2069,  -8.5256],
        [-13.2527,  14.8706,  -8.7599],
        [-13.0150,  14.6600,  -8.3332],
        [-13.3370,  14.3650,  -8.7292],
        [-13.0451,  14.3045,  -8.6537],
        [-12.9789,  14.1974,  -8.5445]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.2034,   9.4944,  10.7371, -13.1803,  14.2069,  -8.5256],
        [ 15.

epoch:0, loss:-1.6836872100830078
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7121, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.5564,  9.5123, 11.2336],
        [15.7548,  9.7977, 11.0714],
        [15.5200,  9.7521, 10.6862],
        [15.6938,  9.2852, 11.0846],
        [15.5930,  9.6820, 11.1068],
        [15.3284,  9.3177, 11.1914]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(16.8169, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.8463,  14.4039,  -8.5516],
        [-13.0097,  14.8101,  -8.5330],
        [-13.2759,  14.3754,  -8.4762],
        [-13.3705,  14.6519,  -8.6211],
        [-13.2789,  14.4625,  -8.5150],
        [-12.8834,  14.4336,  -8.4904]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.5564,   9.5123,  11.2336, -12.8463,  14.4039,  -8.5516],
        [ 15.

epoch:0, loss:-1.720625638961792
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8057, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.5672,  9.4783, 11.0032],
        [15.2674,  9.6133, 11.2226],
        [15.8029,  9.8210, 11.4975],
        [15.6845,  9.2694, 11.0363],
        [15.4983,  9.3851, 10.7419],
        [15.7298,  9.2416, 10.4967]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.8765, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.4841,  15.2236,  -8.8234],
        [-13.4716,  14.8969,  -9.0241],
        [-13.1589,  14.4802,  -8.8504],
        [-13.0143,  14.4831,  -8.9231],
        [-13.0262,  14.3786,  -8.7824],
        [-13.2611,  14.9072,  -8.7207]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.5672,   9.4783,  11.0032, -13.4841,  15.2236,  -8.8234],
        [ 15.2

epoch:0, loss:-1.7279541492462158
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.1439, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.1621,  9.4497, 11.0908],
        [15.6113,  9.7270, 11.3159],
        [15.7948,  9.7410, 11.3397],
        [15.9146,  9.5912, 11.3546],
        [15.1933,  9.3485, 11.0616],
        [15.9976,  9.8792, 11.2290]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.2627, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.2106,  14.8549,  -8.6706],
        [-13.2648,  14.7484,  -9.1129],
        [-13.1814,  14.5210,  -8.7740],
        [-13.0809,  14.2921,  -8.5476],
        [-13.3284,  14.9630,  -8.9094],
        [-13.3888,  14.7552,  -9.0124]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.1621,   9.4497,  11.0908, -13.2106,  14.8549,  -8.6706],
        [ 15.

epoch:0, loss:-1.7223790884017944
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7015, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.9363,  9.5859, 11.5571],
        [15.9256,  9.5742, 11.0205],
        [15.7358,  9.7105, 11.3355],
        [15.5678,  9.7904, 11.5208],
        [15.9094,  9.9970, 11.2757],
        [15.7261,  9.9833, 11.4112]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.1872, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.4888,  15.0292,  -8.8232],
        [-13.2084,  14.5728,  -8.8507],
        [-13.2277,  14.4097,  -8.8106],
        [-13.2123,  15.0317,  -9.0193],
        [-13.0304,  15.1705,  -9.0273],
        [-13.1072,  14.7572,  -8.9522]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.9363,   9.5859,  11.5571, -13.4888,  15.0292,  -8.8232],
        [ 15.

epoch:0, loss:-1.7794979810714722
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6614, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.0202,  9.4529, 11.3978],
        [15.6542,  9.4625, 11.2996],
        [15.8970,  9.9404, 11.4420],
        [15.8094,  9.5363, 11.1720],
        [15.1497,  9.1373, 11.2344],
        [14.9777,  9.4482, 11.1076]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.3410, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-12.8387,  15.0423,  -8.8500],
        [-13.5569,  14.5375,  -8.4507],
        [-13.4447,  14.5009,  -8.9690],
        [-13.2816,  14.6493,  -8.7867],
        [-13.6704,  14.8726,  -9.2555],
        [-13.2837,  14.4618,  -8.8759]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.0202,   9.4529,  11.3978, -12.8387,  15.0423,  -8.8500],
        [ 15.

epoch:0, loss:-1.7821613550186157
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7895, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.1777,  9.7800, 11.2078],
        [15.7807,  9.6323, 11.3043],
        [15.4839,  9.3460, 11.0700],
        [15.6853,  9.8369, 11.6351],
        [15.2609,  9.7273, 11.2586],
        [15.9516,  9.8778, 11.7590]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.4672, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.5446,  14.7186,  -8.8933],
        [-13.3540,  14.9307,  -9.0727],
        [-13.2116,  14.5364,  -8.8420],
        [-13.4958,  15.0069,  -8.8057],
        [-13.2036,  15.0035,  -9.1969],
        [-13.7654,  15.3375,  -9.1612]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.1777,   9.7800,  11.2078, -13.5446,  14.7186,  -8.8933],
        [ 15.

epoch:0, loss:-1.813863754272461
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(4.6931, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.1377, 10.1080, 11.4747],
        [16.0013,  9.5120, 11.6844],
        [15.8565, 10.1162, 11.6508],
        [15.7007,  9.8681, 11.8362],
        [15.9036,  9.9719, 11.5640],
        [15.3923,  9.5385, 11.3257]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(8.5932, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.3120,  14.9119,  -9.0265],
        [-13.6477,  14.8350,  -9.1269],
        [-13.9085,  14.8944,  -9.2117],
        [-13.5393,  14.5990,  -8.7362],
        [-13.6152,  14.8976,  -8.9113],
        [-13.6873,  14.8601,  -8.5589]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.1377,  10.1080,  11.4747, -13.3120,  14.9119,  -9.0265],
        [ 16.00

epoch:0, loss:-1.8480620384216309
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.0569, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.1943, 10.1179, 11.6292],
        [16.1024,  9.9268, 11.8533],
        [15.9933, 10.0748, 11.4148],
        [16.0087,  9.7380, 11.5273],
        [16.0060, 10.3634, 11.5007],
        [15.5356,  9.7038, 11.4680]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(18.2108, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.2301,  14.8973,  -8.9369],
        [-13.2227,  14.7745,  -9.1160],
        [-13.2666,  14.6109,  -8.7775],
        [-13.4247,  15.0668,  -9.2302],
        [-13.8749,  15.6739,  -9.2488],
        [-13.5993,  15.4624,  -9.2978]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.1943,  10.1179,  11.6292, -13.2301,  14.8973,  -8.9369],
        [ 16.

epoch:0, loss:-1.8358333110809326
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3473, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.0350,  9.7877, 11.7580],
        [16.1962,  9.9233, 11.9895],
        [16.0846, 10.2325, 11.6775],
        [15.7038, 10.2217, 11.8443],
        [16.1248, 10.1481, 11.4788],
        [15.7705,  9.7367, 11.6251]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.4582, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.5125,  15.0254,  -9.1214],
        [-13.4669,  15.4005,  -9.1270],
        [-13.7013,  15.1989,  -9.5024],
        [-13.0575,  14.1652,  -8.8185],
        [-13.5889,  15.5384,  -9.4494],
        [-13.6976,  15.2841,  -9.1515]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.0350,   9.7877,  11.7580, -13.5125,  15.0254,  -9.1214],
        [ 16.

epoch:0, loss:-1.833228588104248
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5150, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.8886,  9.8740, 11.7007],
        [16.3089, 10.3847, 12.0307],
        [16.1957, 10.0912, 11.5557],
        [16.0498, 10.0600, 11.6298],
        [16.0069, 10.1357, 11.5790],
        [15.8056, 10.0402, 11.8994]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.0458, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.0297,  15.0766,  -9.2031],
        [-13.7456,  14.9931,  -9.7036],
        [-13.6692,  15.2321,  -9.0976],
        [-13.5475,  14.8077,  -9.2788],
        [-13.9612,  15.1987,  -9.6115],
        [-13.8309,  15.2887,  -9.3278]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.8886,   9.8740,  11.7007, -14.0297,  15.0766,  -9.2031],
        [ 16.3

epoch:0, loss:-1.8492580652236938
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.8138, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.1811, 10.2330, 11.6004],
        [16.0830,  9.8510, 11.6506],
        [16.0178, 10.2612, 11.6477],
        [15.4974, 10.1137, 11.5561],
        [16.3413,  9.7966, 11.7646],
        [15.9247,  9.8782, 11.3121]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(18.6697, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.7185,  15.2075,  -9.0174],
        [-14.2453,  15.2672,  -9.1715],
        [-14.0649,  15.4181,  -9.1899],
        [-13.7255,  15.0031,  -9.5078],
        [-13.8012,  15.2376,  -9.3864],
        [-13.9467,  15.1220,  -9.3567]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.1811,  10.2330,  11.6004, -13.7185,  15.2075,  -9.0174],
        [ 16.

epoch:0, loss:-1.8772605657577515
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3666, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.9974, 10.0714, 11.6024],
        [16.1640, 10.0141, 11.7553],
        [16.1330, 10.2634, 11.9245],
        [16.2277, 10.4186, 11.7428],
        [16.2605,  9.9408, 11.3968],
        [15.9801,  9.7466, 11.2101]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.2608, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.6043,  15.3429,  -9.4364],
        [-13.6928,  15.1193,  -9.8266],
        [-13.2862,  15.2183,  -9.5462],
        [-14.1638,  15.7592,  -9.4492],
        [-14.1225,  15.5003,  -9.4530],
        [-13.4409,  14.7493,  -9.0107]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.9974,  10.0714,  11.6024, -13.6043,  15.3429,  -9.4364],
        [ 16.

epoch:0, loss:-1.8897159099578857
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.0930, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.1182, 10.2352, 11.7329],
        [16.4956, 10.3612, 11.8930],
        [15.9208, 10.3358, 11.6392],
        [16.3362, 10.1320, 11.7875],
        [15.7576, 10.0317, 11.6260],
        [16.4527,  9.9850, 12.0851]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.3974, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.9770,  15.6229,  -9.3577],
        [-13.9078,  15.5686,  -9.7884],
        [-13.7759,  15.3454,  -9.3356],
        [-13.6408,  15.1562,  -9.2115],
        [-13.8011,  15.4453,  -9.1946],
        [-13.5387,  14.9721,  -9.5663]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.1182,  10.2352,  11.7329, -13.9770,  15.6229,  -9.3577],
        [ 16.

epoch:0, loss:-1.907461166381836
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0167, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[15.8012, 10.3832, 11.9842],
        [16.2980, 10.1033, 11.7993],
        [16.3144, 10.4927, 11.7683],
        [16.4729,  9.9824, 11.9692],
        [16.2668, 10.4314, 12.2325],
        [15.8173,  9.9378, 11.4056]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.4186, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.7253,  15.0941,  -9.4513],
        [-13.9658,  15.7145,  -9.7128],
        [-13.6176,  15.4666,  -9.9074],
        [-13.7601,  15.2233,  -9.2986],
        [-14.1717,  15.4529,  -9.7110],
        [-14.1870,  15.8375, -10.0576]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 15.8012,  10.3832,  11.9842, -13.7253,  15.0941,  -9.4513],
        [ 16.2

epoch:0, loss:-1.9390182495117188
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5839, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.3573, 10.3329, 11.8195],
        [16.0983, 10.4242, 11.9846],
        [16.2146, 10.4094, 11.6716],
        [16.0394, 10.5788, 11.2623],
        [16.1634, 10.3165, 11.5398],
        [16.6258, 10.1698, 11.9767]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.6769, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.0637,  15.5313,  -9.7299],
        [-14.2052,  15.2242,  -9.2009],
        [-14.0058,  15.3840,  -9.9625],
        [-13.9576,  15.4224,  -9.6219],
        [-14.0634,  15.5840,  -9.5460],
        [-14.0407,  15.6504,  -9.6839]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.3573,  10.3329,  11.8195, -14.0637,  15.5313,  -9.7299],
        [ 16.

epoch:0, loss:-1.9175994396209717
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7282, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.4087, 10.3244, 12.1057],
        [16.2570, 10.2755, 11.6756],
        [16.3811, 10.3530, 11.8855],
        [16.3226, 10.4309, 12.0242],
        [16.4179, 10.6101, 12.0787],
        [16.2308, 10.2803, 11.8943]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.8096, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.8999,  15.4274,  -9.6280],
        [-13.9805,  15.5686,  -9.7657],
        [-14.2004,  15.3011,  -9.9712],
        [-14.2939,  15.5426,  -9.7877],
        [-14.1740,  15.9061,  -9.7551],
        [-14.1012,  15.7893,  -9.7167]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.4087,  10.3244,  12.1057, -13.8999,  15.4274,  -9.6280],
        [ 16.

epoch:0, loss:-1.9397635459899902
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7071, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.0233, 10.2549, 12.3046],
        [16.1102, 10.0948, 12.0014],
        [16.6894, 10.5674, 11.9337],
        [15.8083, 10.3753, 12.1596],
        [16.6181, 10.5131, 12.0078],
        [16.1310,  9.8729, 12.0046]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.0007, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.0552,  15.7768,  -9.6986],
        [-14.2121,  15.5343,  -9.9264],
        [-14.3659,  15.8959,  -9.9350],
        [-14.5344,  15.6295, -10.2352],
        [-14.5369,  15.7777,  -9.8130],
        [-13.9989,  15.5554,  -9.6893]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.0233,  10.2549,  12.3046, -14.0552,  15.7768,  -9.6986],
        [ 16.

epoch:0, loss:-1.9371434450149536
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7413, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.6536, 10.4861, 12.0595],
        [16.7884, 10.5355, 11.9604],
        [16.9086, 10.3994, 12.3278],
        [16.5324, 10.6304, 12.1804],
        [16.3247,  9.8309, 11.9523],
        [16.4250, 10.6207, 11.7192]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.0326, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.5769,  15.3959,  -9.8173],
        [-14.7249,  15.6065, -10.1198],
        [-14.0216,  15.4268, -10.1651],
        [-14.2094,  15.5210,  -9.8846],
        [-14.5996,  15.8622,  -9.8153],
        [-13.9467,  15.4832,  -9.5635]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.6536,  10.4861,  12.0595, -14.5769,  15.3959,  -9.8173],
        [ 16.

epoch:0, loss:-1.9654581546783447
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9228, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.3454, 10.1578, 12.0396],
        [16.1352, 10.3955, 11.9560],
        [16.4843, 10.4878, 12.3771],
        [16.3295, 10.3579, 12.1924],
        [16.6559, 10.5220, 12.4767],
        [16.1260, 10.8081, 12.2439]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.1659, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-13.8765,  15.5317,  -9.8701],
        [-14.0916,  15.7038,  -9.7181],
        [-14.4404,  15.6113, -10.1391],
        [-14.1746,  15.5411, -10.0268],
        [-14.2715,  15.7502,  -9.8599],
        [-13.7763,  15.4683,  -9.9946]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.3454,  10.1578,  12.0396, -13.8765,  15.5317,  -9.8701],
        [ 16.

epoch:0, loss:-1.9683419466018677
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8484, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.7520, 10.2953, 12.2259],
        [16.7460, 10.2676, 12.0755],
        [16.6556, 10.8497, 12.3946],
        [16.3391, 10.8714, 12.0159],
        [17.0189, 10.7539, 12.5092],
        [16.1467, 10.1694, 11.8111]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(29.5292, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.1699,  15.7405,  -9.7924],
        [-14.3282,  15.9243, -10.0027],
        [-13.9385,  15.5886, -10.3391],
        [-14.5921,  15.7973,  -9.8694],
        [-14.5453,  15.8121,  -9.9368],
        [-14.2633,  16.0745, -10.4051]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.7520,  10.2953,  12.2259, -14.1699,  15.7405,  -9.7924],
        [ 16.

epoch:0, loss:-1.9945449829101562
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6520, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.6908, 11.2657, 12.7295],
        [16.1309, 10.7819, 11.8880],
        [16.3653, 10.9000, 12.6470],
        [16.3240, 10.2536, 11.9344],
        [16.9840, 11.0064, 12.2567],
        [16.4957, 10.9449, 12.7628]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.0923, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.1496,  15.9115, -10.1074],
        [-14.0825,  15.6799,  -9.8508],
        [-13.8966,  15.3547,  -9.6248],
        [-14.1236,  15.7617, -10.1462],
        [-14.3016,  15.6023,  -9.7644],
        [-14.2258,  15.7632, -10.0990]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.6908,  11.2657,  12.7295, -14.1496,  15.9115, -10.1074],
        [ 16.

epoch:0, loss:-1.9840409755706787
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9596, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.2130, 10.8676, 12.0807],
        [16.3753, 10.7231, 12.4354],
        [16.7344, 10.7826, 12.4830],
        [16.6940, 10.4589, 12.0196],
        [16.1611, 10.4586, 11.7407],
        [16.6217, 10.6925, 12.3615]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.2147, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.5354,  15.8717, -10.0008],
        [-14.3559,  15.9473, -10.1487],
        [-14.1736,  15.7427, -10.0507],
        [-14.1176,  15.4841, -10.0329],
        [-14.5209,  16.0409, -10.0669],
        [-14.6840,  16.0337, -10.0888]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.2130,  10.8676,  12.0807, -14.5354,  15.8717, -10.0008],
        [ 16.

epoch:0, loss:-2.0120835304260254
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.0005, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.6647, 10.5554, 12.1481],
        [16.7262, 10.8070, 12.0207],
        [16.6798, 10.9370, 11.9068],
        [17.0001, 10.8471, 12.6142],
        [16.4711, 10.7664, 12.1950],
        [16.7716, 10.9817, 12.4159]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.1805, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.5327,  15.8164,  -9.9208],
        [-14.1343,  15.3435,  -9.9211],
        [-14.2259,  15.6943,  -9.9422],
        [-14.4998,  15.8410, -10.4679],
        [-14.5375,  15.8131, -10.0692],
        [-14.5754,  15.9892, -10.4692]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.6647,  10.5554,  12.1481, -14.5327,  15.8164,  -9.9208],
        [ 16.

epoch:0, loss:-2.002656936645508
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9192, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.8195, 11.0628, 12.0784],
        [17.3090, 11.0776, 12.5951],
        [16.7004, 10.6905, 12.2271],
        [16.5281, 10.5065, 12.5505],
        [16.6153, 11.1124, 12.2733],
        [16.7763, 11.1081, 12.4164]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(18.9103, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.1721,  15.7377,  -9.9803],
        [-14.4492,  15.7343, -10.2650],
        [-14.6925,  16.1754, -10.6900],
        [-14.2795,  15.9667, -10.0250],
        [-14.4242,  16.1805, -10.2448],
        [-14.5097,  16.3745, -10.2249]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.8195,  11.0628,  12.0784, -14.1721,  15.7377,  -9.9803],
        [ 17.3

epoch:0, loss:-2.0223217010498047
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8888, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.5892, 11.0287, 12.3964],
        [16.9101, 11.2456, 12.6720],
        [16.5571, 11.0369, 12.5456],
        [16.7164, 11.1598, 12.6467],
        [16.9467, 11.0384, 12.5808],
        [16.6068, 11.2033, 12.2363]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.5011, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.5941,  15.9981, -10.2896],
        [-14.7903,  16.3141, -10.4753],
        [-14.7684,  16.3850, -10.6444],
        [-14.2564,  15.9003, -10.0882],
        [-14.8309,  16.0729, -10.3267],
        [-14.3116,  16.1116, -10.0485]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.5892,  11.0287,  12.3964, -14.5941,  15.9981, -10.2896],
        [ 16.

epoch:0, loss:-2.0584659576416016
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9313, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.4596, 11.3698, 12.1016],
        [17.0194, 10.9767, 12.4413],
        [16.6034, 10.7997, 12.0590],
        [16.3866, 10.8598, 12.5465],
        [16.8792, 11.2860, 12.9110],
        [16.5230, 10.7595, 12.3828]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.6897, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.9792,  16.4349, -10.2767],
        [-14.4403,  16.0048, -10.7548],
        [-14.6846,  16.2172, -10.6101],
        [-14.9669,  16.2042, -10.3973],
        [-14.5824,  15.9107, -10.3693],
        [-14.7730,  16.1110, -10.1830]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.4596,  11.3698,  12.1016, -14.9792,  16.4349, -10.2767],
        [ 17.

epoch:0, loss:-2.122615098953247
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7836, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.0720, 11.3060, 12.3513],
        [16.7922, 10.5111, 12.1739],
        [16.5753, 10.7858, 11.9636],
        [16.7365, 10.8101, 12.3551],
        [17.1856, 11.1789, 12.6088],
        [17.0053, 11.2596, 12.8572]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.8908, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.2527,  16.6793, -10.8417],
        [-14.4782,  16.1852, -10.0920],
        [-14.8629,  15.7937, -10.2745],
        [-14.7664,  16.0344, -10.6661],
        [-14.8657,  16.4260, -10.7657],
        [-14.6813,  16.1963, -10.4323]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.0720,  11.3060,  12.3513, -15.2527,  16.6793, -10.8417],
        [ 16.7

epoch:0, loss:-2.115262985229492
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4403, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[16.7548, 11.0890, 12.3756],
        [16.6062, 10.6748, 12.6474],
        [17.2935, 11.4783, 12.6604],
        [16.5300, 11.2522, 12.6341],
        [16.6472, 11.0047, 12.4108],
        [16.5905, 10.6512, 12.1072]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.8233, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.5799,  15.9801, -10.4754],
        [-14.6296,  15.9757, -10.6445],
        [-15.0171,  16.4125, -10.6338],
        [-14.5533,  15.5812, -10.4757],
        [-14.8002,  16.3036, -10.7913],
        [-14.9066,  16.3340, -10.6953]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 16.7548,  11.0890,  12.3756, -14.5799,  15.9801, -10.4754],
        [ 16.6

epoch:0, loss:-2.1233818531036377
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(4.3397, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.2211, 11.2661, 12.8740],
        [17.0228, 11.2500, 12.5591],
        [16.3217, 11.0708, 12.2803],
        [16.7178, 11.0299, 12.7648],
        [16.9258, 11.1495, 12.7356],
        [17.3705, 11.5034, 13.3788]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(18.6877, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.0570,  15.9069, -10.3085],
        [-14.9309,  16.3570, -10.8512],
        [-14.3539,  16.0622, -10.6073],
        [-14.7796,  16.4961, -10.7638],
        [-14.5998,  16.4334, -10.4350],
        [-15.1229,  16.6596, -11.0645]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.2211,  11.2661,  12.8740, -15.0570,  15.9069, -10.3085],
        [ 17.

epoch:0, loss:-2.110095739364624
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2752, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.2246, 11.2445, 12.3301],
        [16.8210, 10.9990, 12.6038],
        [17.1802, 11.2928, 12.7256],
        [17.2382, 11.3907, 12.9810],
        [17.5304, 11.1582, 12.7266],
        [16.9026, 10.9779, 12.5960]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.2931, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.6682,  16.1483, -10.8195],
        [-14.7805,  16.2083, -10.6538],
        [-14.7769,  16.2055, -10.5883],
        [-15.0938,  16.3266, -10.8896],
        [-14.9555,  16.5526, -10.6783],
        [-14.8183,  16.0553, -10.8898]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.2246,  11.2445,  12.3301, -14.6682,  16.1483, -10.8195],
        [ 16.8

epoch:0, loss:-2.1418581008911133
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.0428, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.2716, 11.7973, 12.8975],
        [17.0175, 11.1573, 12.6402],
        [16.7941, 11.3130, 12.6211],
        [17.0844, 11.5904, 13.0371],
        [17.1231, 11.3870, 12.8618],
        [17.8812, 11.7240, 12.9962]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.9569, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-14.7118,  15.8989, -10.1875],
        [-15.0558,  15.9734, -10.3903],
        [-14.5054,  15.9332, -10.4704],
        [-14.8905,  16.4208, -10.8228],
        [-15.1178,  16.4330, -11.0140],
        [-15.0022,  16.2370, -10.7327]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.2716,  11.7973,  12.8975, -14.7118,  15.8989, -10.1875],
        [ 17.

epoch:0, loss:-2.150310516357422
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.7159, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.1350, 11.1058, 12.7864],
        [17.0433, 11.2869, 13.0138],
        [16.7881, 11.0393, 12.7057],
        [17.3049, 11.2651, 12.9554],
        [17.3079, 11.4684, 12.8763],
        [17.1261, 11.1124, 12.5649]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.6900, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.2556,  16.6271, -10.7980],
        [-14.7871,  16.8592, -10.8279],
        [-14.9500,  16.6178, -11.2847],
        [-14.5484,  15.9664, -10.9043],
        [-15.1142,  16.4539, -10.7650],
        [-14.8612,  16.5565, -10.5931]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.1350,  11.1058,  12.7864, -15.2556,  16.6271, -10.7980],
        [ 17.0

epoch:0, loss:-2.1420135498046875
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6999, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.6599, 11.4256, 13.0954],
        [17.2319, 11.3752, 12.8996],
        [17.4840, 11.7201, 12.6917],
        [16.9425, 11.4028, 12.6862],
        [17.4089, 11.2949, 13.0242],
        [17.3787, 11.2789, 12.9883]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(31.3456, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.1537,  16.6070, -11.2298],
        [-14.6052,  16.3692, -10.4031],
        [-15.2620,  16.3481, -10.7555],
        [-14.7069,  16.4485, -10.6070],
        [-15.1365,  16.3545, -10.8522],
        [-14.8444,  16.2372, -10.9392]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.6599,  11.4256,  13.0954, -15.1537,  16.6070, -11.2298],
        [ 17.

epoch:0, loss:-2.178814172744751
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8335, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.4123, 11.3860, 13.3645],
        [17.3407, 11.0191, 12.5495],
        [17.2276, 11.5302, 12.8582],
        [16.8211, 11.5287, 12.9827],
        [17.3802, 11.2771, 12.9653],
        [17.2992, 11.6754, 13.2305]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.3452, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.0197,  16.2900, -10.7960],
        [-15.2161,  16.6816, -11.2733],
        [-14.7583,  16.5639, -11.0053],
        [-14.8424,  16.0197, -10.6078],
        [-14.9445,  16.5371, -11.0217],
        [-15.4009,  16.9222, -11.4828]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.4123,  11.3860,  13.3645, -15.0197,  16.2900, -10.7960],
        [ 17.3

epoch:0, loss:-2.2090234756469727
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7365, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.7346, 11.1825, 13.2140],
        [17.1549, 11.3277, 12.8153],
        [17.4176, 11.5958, 12.8763],
        [17.4652, 11.3497, 12.8667],
        [17.4524, 11.5547, 12.6922],
        [17.4133, 11.3981, 12.9826]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.6031, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.4168,  16.7049, -11.2961],
        [-14.9158,  16.3874, -11.1280],
        [-15.1939,  16.2626, -11.0928],
        [-15.1812,  16.4327, -10.4857],
        [-14.6816,  16.5626, -10.8286],
        [-15.1011,  16.4650, -11.0684]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.7346,  11.1825,  13.2140, -15.4168,  16.7049, -11.2961],
        [ 17.

epoch:0, loss:-2.2438807487487793
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4411, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.3324, 11.1705, 13.4167],
        [17.2480, 11.2991, 13.0553],
        [17.0456, 11.1930, 12.3410],
        [17.2756, 11.3463, 13.1711],
        [17.3076, 11.6055, 13.0076],
        [17.5809, 11.4896, 12.8776]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.3122, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.1711,  16.6134, -10.8902],
        [-15.1424,  16.8041, -10.7734],
        [-15.2247,  16.6868, -10.8318],
        [-15.2147,  16.7988, -11.1496],
        [-15.7961,  17.3156, -11.1437],
        [-15.3257,  16.4118, -10.9211]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.3324,  11.1705,  13.4167, -15.1711,  16.6134, -10.8902],
        [ 17.

epoch:0, loss:-2.275846242904663
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9472, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.7237, 11.8909, 12.8925],
        [17.2725, 11.4649, 12.8273],
        [17.6886, 11.4852, 13.1059],
        [17.6779, 11.8757, 12.7439],
        [17.4689, 11.4451, 12.9687],
        [17.7465, 11.4967, 13.1516]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(16.1429, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.8194,  17.1849, -11.5309],
        [-15.5015,  16.6346, -10.9868],
        [-15.1310,  16.5740, -11.1348],
        [-15.1337,  16.5835, -11.1166],
        [-15.3085,  16.3058, -10.9875],
        [-15.0916,  16.6392, -11.0668]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.7237,  11.8909,  12.8925, -15.8194,  17.1849, -11.5309],
        [ 17.2

epoch:0, loss:-2.2597947120666504
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6681, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.5012, 12.1079, 13.3049],
        [17.2150, 11.7972, 12.8544],
        [17.9438, 11.6431, 13.0002],
        [17.4241, 11.8515, 13.1058],
        [17.3688, 11.8729, 12.9015],
        [17.8881, 11.9348, 13.5455]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.7316, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.2305,  16.3904, -11.3889],
        [-15.3056,  16.9219, -11.4149],
        [-15.7895,  17.2349, -11.4516],
        [-15.3563,  16.7113, -11.2015],
        [-15.3745,  16.9227, -11.4008],
        [-15.4141,  16.9501, -11.3643]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.5012,  12.1079,  13.3049, -15.2305,  16.3904, -11.3889],
        [ 17.

epoch:0, loss:-2.23239803314209
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9746, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.7203, 11.8427, 13.0386],
        [17.4520, 11.8517, 12.8456],
        [17.9865, 12.0503, 13.6042],
        [17.6602, 11.7062, 13.3477],
        [17.8635, 11.9171, 13.5424],
        [17.0545, 11.3523, 12.6432]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.7970, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.1430,  16.9118, -11.2155],
        [-15.6463,  17.2318, -11.5918],
        [-15.5773,  16.8032, -11.1481],
        [-15.2223,  16.6065, -11.0703],
        [-15.3600,  17.0664, -11.1819],
        [-15.3212,  16.5767, -11.3971]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.7203,  11.8427,  13.0386, -15.1430,  16.9118, -11.2155],
        [ 17.45

epoch:0, loss:-2.3020520210266113
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.6136, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.5497, 11.8823, 13.2832],
        [17.7162, 11.9932, 13.2858],
        [17.7546, 11.6637, 13.1328],
        [17.6595, 11.7700, 13.0612],
        [17.6987, 11.7561, 13.3523],
        [17.5965, 11.8894, 13.6637]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(32.0513, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.1351,  15.9820, -11.0961],
        [-15.1059,  16.5300, -11.3513],
        [-15.2975,  17.2600, -11.4376],
        [-15.2230,  16.7492, -11.4125],
        [-15.5330,  16.7529, -10.7817],
        [-15.5266,  17.2126, -11.4863]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.5497,  11.8823,  13.2832, -15.1351,  15.9820, -11.0961],
        [ 17.

epoch:0, loss:-2.2964906692504883
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.7513, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.3360, 11.4643, 13.2221],
        [17.4935, 12.0122, 13.1691],
        [17.6329, 12.3335, 13.2233],
        [17.3891, 11.4408, 12.6141],
        [17.8512, 11.9186, 13.4869],
        [17.6475, 11.8143, 13.3247]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.9916, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.6894,  17.1000, -11.6423],
        [-15.3543,  17.0965, -11.6542],
        [-15.4562,  17.1760, -11.2334],
        [-15.1541,  16.6759, -11.2510],
        [-15.3144,  16.8346, -11.1189],
        [-14.9644,  16.4436, -11.3164]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.3360,  11.4643,  13.2221, -15.6894,  17.1000, -11.6423],
        [ 17.

epoch:0, loss:-2.3484137058258057
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7737, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.9473, 12.0877, 13.7626],
        [17.5123, 12.1052, 13.3949],
        [17.4570, 11.7585, 13.1749],
        [17.6304, 11.7936, 13.0874],
        [17.6135, 12.1663, 13.1788],
        [17.7815, 11.7394, 13.3505]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.9552, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.4803,  16.5559, -11.0807],
        [-15.5306,  16.8107, -11.1538],
        [-15.7683,  16.9675, -11.6869],
        [-15.2652,  16.7922, -11.5141],
        [-15.3136,  16.5880, -11.5050],
        [-15.6519,  17.0644, -11.4476]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.9473,  12.0877,  13.7626, -15.4803,  16.5559, -11.0807],
        [ 17.

epoch:0, loss:-2.338444471359253
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.0002, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.8649, 11.9538, 13.4641],
        [17.6341, 12.1309, 12.9687],
        [17.5689, 11.6577, 13.2488],
        [17.7600, 12.1752, 13.6182],
        [18.0330, 12.0312, 13.5922],
        [17.9286, 12.0176, 13.3118]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.6387, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.5116,  16.5702, -11.3763],
        [-15.4427,  16.7085, -11.4305],
        [-15.9312,  17.1548, -11.8099],
        [-15.7343,  17.3570, -11.4060],
        [-15.6115,  16.6611, -11.5468],
        [-15.5403,  16.7720, -11.6174]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.8649,  11.9538,  13.4641, -15.5116,  16.5702, -11.3763],
        [ 17.6

epoch:0, loss:-2.337597608566284
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4328, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.9131, 12.0751, 13.8422],
        [17.9035, 11.9679, 13.7457],
        [17.8117, 11.6969, 13.6448],
        [17.5376, 11.7050, 13.2012],
        [17.6619, 11.8009, 13.3852],
        [17.7454, 11.6825, 13.2548]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(21.1625, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.9946,  16.8851, -11.6203],
        [-15.7597,  17.2632, -11.7075],
        [-15.8171,  16.8969, -11.5952],
        [-15.7292,  17.1710, -11.7962],
        [-15.6309,  17.3797, -11.6787],
        [-15.4163,  16.5579, -11.4332]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.9131,  12.0751,  13.8422, -15.9946,  16.8851, -11.6203],
        [ 17.9

epoch:0, loss:-2.3629658222198486
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7051, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.1595, 12.1372, 13.7438],
        [18.2057, 12.4242, 13.3721],
        [18.1485, 12.1333, 13.8478],
        [18.0275, 12.1443, 13.2532],
        [17.5565, 12.3624, 13.7522],
        [17.6151, 12.2057, 13.4850]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(21.8972, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.6708,  17.4051, -11.8933],
        [-15.6231,  17.2740, -11.6768],
        [-15.7932,  17.1415, -11.4950],
        [-16.0465,  17.5334, -11.7772],
        [-15.8103,  17.1364, -11.7183],
        [-15.5170,  16.9464, -11.6119]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.1595,  12.1372,  13.7438, -15.6708,  17.4051, -11.8933],
        [ 18.

epoch:0, loss:-2.423623561859131
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3931, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.6503, 11.5987, 13.2575],
        [17.9214, 11.9603, 13.9178],
        [17.7385, 11.8157, 13.4140],
        [17.3721, 12.1235, 13.4567],
        [17.7977, 11.8397, 13.6087],
        [17.5517, 11.9967, 13.4833]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(21.2732, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.8530,  17.1884, -11.3543],
        [-15.9739,  17.4031, -12.0532],
        [-15.6702,  16.7656, -11.2350],
        [-15.7014,  16.9180, -11.8825],
        [-16.0795,  17.0484, -11.7847],
        [-15.7339,  17.3812, -11.7630]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.6503,  11.5987,  13.2575, -15.8530,  17.1884, -11.3543],
        [ 17.9

epoch:0, loss:-2.410205602645874
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.9404, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.1556, 12.5707, 14.1228],
        [17.9924, 12.3030, 13.6539],
        [18.0679, 12.0335, 13.4377],
        [18.2506, 12.4884, 14.0081],
        [18.5034, 12.1036, 13.4867],
        [17.7226, 12.2983, 13.4968]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(16.7115, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.2709,  17.5838, -12.0530],
        [-15.8790,  17.4125, -11.9001],
        [-16.0359,  17.4091, -11.8847],
        [-16.3451,  17.2030, -11.7405],
        [-15.8486,  17.4273, -12.0534],
        [-15.4696,  17.3704, -11.6574]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.1556,  12.5707,  14.1228, -16.2709,  17.5838, -12.0530],
        [ 17.9

epoch:0, loss:-2.4317593574523926
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5864, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[17.6329, 12.3586, 13.7329],
        [18.0622, 11.9211, 13.4203],
        [18.1260, 12.2208, 14.1366],
        [18.1299, 12.5345, 13.7131],
        [17.9283, 12.2415, 13.9850],
        [17.6707, 12.3944, 13.6089]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(32.7837, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.0692,  17.1536, -12.1051],
        [-15.9795,  17.1524, -11.5943],
        [-16.2137,  18.0364, -12.2765],
        [-16.0751,  17.0324, -11.7216],
        [-15.6642,  17.2616, -12.1205],
        [-16.2421,  17.7589, -12.0429]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 17.6329,  12.3586,  13.7329, -16.0692,  17.1536, -12.1051],
        [ 18.

epoch:0, loss:-2.4085662364959717
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5691, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.0600, 12.4517, 13.7107],
        [17.8489, 11.9056, 13.8985],
        [17.9904, 12.2682, 13.8489],
        [18.1806, 12.8273, 14.0656],
        [18.0650, 12.5246, 13.9688],
        [18.3339, 12.5873, 13.6718]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(27.1174, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.2748,  17.6173, -11.9454],
        [-16.1844,  17.5529, -11.9715],
        [-16.1408,  17.5735, -12.0408],
        [-16.1274,  17.2728, -11.9203],
        [-15.9229,  17.3148, -12.2400],
        [-15.6613,  17.1609, -11.3242]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.0600,  12.4517,  13.7107, -16.2748,  17.6173, -11.9454],
        [ 17.

epoch:0, loss:-2.434593677520752
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7337, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.6376, 12.4061, 14.1849],
        [17.9957, 12.2106, 13.7253],
        [18.5633, 12.6855, 13.8623],
        [18.0905, 12.6317, 13.6140],
        [18.2820, 12.3611, 13.7594],
        [18.2803, 12.6307, 13.8857]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.9944, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.2863,  17.5279, -11.9839],
        [-15.5779,  17.4827, -12.2184],
        [-16.4318,  17.4649, -12.0128],
        [-15.5857,  16.7855, -11.8639],
        [-16.2746,  17.1728, -12.2007],
        [-15.8473,  17.7619, -12.2054]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.6376,  12.4061,  14.1849, -16.2863,  17.5279, -11.9839],
        [ 17.9

epoch:0, loss:-2.429993152618408
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5591, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.4125, 12.2222, 14.2573],
        [18.3829, 12.4941, 13.8385],
        [18.2611, 12.2574, 13.5462],
        [17.8442, 12.3931, 13.6005],
        [18.3291, 12.5840, 14.1712],
        [18.2030, 12.2951, 13.8635]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(32.3386, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.8665,  17.5171, -12.0605],
        [-16.1022,  17.7604, -12.0592],
        [-16.2540,  17.9483, -12.2807],
        [-15.9823,  17.4856, -11.8716],
        [-16.1156,  17.4997, -12.1105],
        [-15.7727,  17.5064, -12.5196]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.4125,  12.2222,  14.2573, -15.8665,  17.5171, -12.0605],
        [ 18.3

epoch:0, loss:-2.456425666809082
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.0406, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.3947, 12.6399, 14.0497],
        [18.1404, 12.5430, 13.3739],
        [18.1898, 12.7041, 13.5133],
        [18.8064, 12.6520, 14.3302],
        [18.1612, 12.9141, 13.8390],
        [18.1543, 12.8465, 13.6330]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(33.4989, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-15.7060,  17.5136, -11.9501],
        [-15.9846,  17.7297, -11.8134],
        [-16.0093,  17.5450, -11.4717],
        [-15.8910,  17.7080, -12.7223],
        [-15.8446,  17.3377, -11.9722],
        [-16.1088,  17.6153, -12.1406]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.3947,  12.6399,  14.0497, -15.7060,  17.5136, -11.9501],
        [ 18.1

epoch:0, loss:-2.498732328414917
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7911, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.3410, 12.1100, 13.6857],
        [18.0568, 12.4719, 14.0443],
        [18.0094, 12.7203, 13.5900],
        [18.4652, 12.3228, 13.9663],
        [18.2275, 12.1433, 13.6049],
        [18.2899, 12.5311, 13.9338]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.0864, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.2672,  17.6602, -12.1995],
        [-15.9365,  17.6452, -12.1436],
        [-16.0661,  17.2728, -11.7637],
        [-16.5383,  17.5447, -12.3383],
        [-16.0742,  17.7909, -12.1105],
        [-16.0026,  17.5848, -12.0552]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.3410,  12.1100,  13.6857, -16.2672,  17.6602, -12.1995],
        [ 18.0

epoch:0, loss:-2.5018482208251953
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.5120, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.6411, 12.9714, 14.6852],
        [18.4314, 12.9115, 14.0565],
        [18.3244, 12.6159, 13.9389],
        [17.7507, 12.3632, 13.9757],
        [18.4617, 12.5497, 14.0495],
        [18.4650, 12.7850, 13.7133]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(22.6894, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.1561,  17.6905, -12.1420],
        [-16.5340,  17.7848, -12.0638],
        [-16.4731,  17.6682, -12.3498],
        [-15.9024,  17.4999, -12.2424],
        [-16.0126,  17.4607, -12.2061],
        [-16.1860,  17.6437, -12.1334]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.6411,  12.9714,  14.6852, -16.1561,  17.6905, -12.1420],
        [ 18.

epoch:0, loss:-2.5273919105529785
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3393, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.6256, 13.0466, 14.1438],
        [18.3567, 12.3655, 13.9744],
        [18.4652, 12.9377, 13.6413],
        [18.3252, 13.2518, 14.2983],
        [18.2761, 12.8434, 14.0182],
        [18.2960, 12.7292, 13.9145]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.4530, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.5285,  17.4785, -12.2982],
        [-16.0874,  17.9546, -12.4524],
        [-16.3984,  17.5603, -12.2776],
        [-16.2157,  17.5663, -12.3030],
        [-16.6371,  17.9706, -12.6418],
        [-16.4309,  18.4974, -12.6014]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.6256,  13.0466,  14.1438, -16.5285,  17.4785, -12.2982],
        [ 18.

epoch:0, loss:-2.5474655628204346
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9875, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.8603, 12.8343, 13.8719],
        [18.2596, 13.0013, 14.1809],
        [18.6863, 12.7957, 14.3221],
        [18.4696, 12.5320, 13.9180],
        [18.5604, 12.6957, 14.0047],
        [18.1125, 12.8038, 14.1610]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(22.7298, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.1251,  18.3404, -12.7548],
        [-16.5392,  18.0643, -12.5413],
        [-16.8179,  18.0286, -12.8334],
        [-16.4087,  17.7477, -12.5269],
        [-16.0069,  17.7811, -12.6602],
        [-16.3191,  17.5119, -12.4133]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.8603,  12.8343,  13.8719, -17.1251,  18.3404, -12.7548],
        [ 18.

epoch:0, loss:-2.5391228199005127
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5382, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.2663, 12.6199, 13.8903],
        [18.1792, 13.1513, 13.8787],
        [18.7082, 12.6230, 14.3393],
        [18.6307, 12.8306, 14.5356],
        [18.6698, 12.7335, 14.5717],
        [18.3806, 12.9983, 13.9781]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(27.9384, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.6479,  17.9437, -12.4635],
        [-16.5718,  17.8105, -12.4693],
        [-16.3547,  17.9901, -12.6822],
        [-16.4825,  17.5751, -12.6229],
        [-16.1507,  17.6950, -12.3772],
        [-16.5943,  17.8687, -12.7211]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.2663,  12.6199,  13.8903, -16.6479,  17.9437, -12.4635],
        [ 18.

epoch:0, loss:-2.5639543533325195
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.5286, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.8935, 13.0378, 14.4814],
        [18.3471, 12.8594, 13.9459],
        [17.8351, 12.9261, 13.3887],
        [18.5975, 12.9135, 14.4814],
        [18.3023, 12.7411, 14.1916],
        [18.5123, 12.7269, 14.0286]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.8433, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.7298,  18.0568, -12.4717],
        [-16.4697,  17.5550, -12.5005],
        [-16.8274,  18.0695, -12.5382],
        [-16.1949,  17.3921, -12.3891],
        [-16.5761,  18.1702, -12.5543],
        [-16.5817,  18.0238, -12.7190]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.8935,  13.0378,  14.4814, -16.7298,  18.0568, -12.4717],
        [ 18.

epoch:0, loss:-2.5784101486206055
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6731, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.5644, 12.8976, 14.1420],
        [18.5131, 12.9725, 14.3296],
        [18.2271, 12.9738, 14.1192],
        [18.6408, 12.9504, 14.2466],
        [18.6080, 12.4182, 14.3832],
        [18.8472, 13.2005, 14.4671]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(16.7726, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.5187,  18.2291, -12.8907],
        [-16.7580,  18.3204, -13.0290],
        [-16.6098,  17.9348, -12.6113],
        [-16.6895,  17.9788, -12.8694],
        [-16.5024,  18.1996, -12.7127],
        [-17.0253,  17.7886, -12.5049]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.5644,  12.8976,  14.1420, -16.5187,  18.2291, -12.8907],
        [ 18.

epoch:0, loss:-2.6201729774475098
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1561, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.0225, 13.4414, 14.4685],
        [19.0565, 13.0727, 14.5952],
        [18.7811, 13.2205, 14.9834],
        [18.7597, 13.2129, 14.7564],
        [19.0303, 13.1662, 14.7350],
        [18.5732, 13.3099, 14.6020]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.5526, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.3464,  17.9223, -12.5749],
        [-16.7862,  17.9869, -12.8091],
        [-16.9465,  18.2490, -12.8457],
        [-16.8216,  18.4923, -12.8391],
        [-16.7110,  18.1438, -12.9937],
        [-16.9734,  18.5903, -13.1743]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.0225,  13.4414,  14.4685, -16.3464,  17.9223, -12.5749],
        [ 19.

epoch:0, loss:-2.6224567890167236
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9702, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.3151, 12.7926, 14.4420],
        [18.8546, 12.8364, 14.4803],
        [18.6549, 13.1921, 14.7570],
        [18.3986, 12.7099, 13.8912],
        [18.7617, 13.0286, 14.5400],
        [18.7391, 13.2373, 14.5228]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(22.7885, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.0269,  17.5503, -12.7064],
        [-16.7021,  18.1067, -12.8798],
        [-16.4649,  17.5759, -12.7363],
        [-16.8985,  18.3277, -12.9885],
        [-17.0630,  18.2834, -12.8850],
        [-16.8370,  18.3351, -12.9492]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.3151,  12.7926,  14.4420, -16.0269,  17.5503, -12.7064],
        [ 18.

epoch:0, loss:-2.6356654167175293
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5450, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.9405, 13.1810, 14.5651],
        [18.3146, 12.8004, 14.2484],
        [18.8210, 13.2352, 14.3759],
        [18.8120, 13.2257, 14.3510],
        [18.6085, 12.2676, 14.5975],
        [18.5708, 13.2344, 14.5262]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.1838, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.9299,  18.0648, -12.6139],
        [-16.7198,  18.0379, -12.7223],
        [-16.7254,  17.7003, -12.6926],
        [-16.8482,  18.1665, -13.1042],
        [-16.9204,  18.0045, -12.7873],
        [-17.2195,  18.2902, -13.0773]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.9405,  13.1810,  14.5651, -16.9299,  18.0648, -12.6139],
        [ 18.

epoch:0, loss:-2.6957626342773438
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3659, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.0223, 13.0585, 14.7116],
        [18.5814, 13.0828, 14.4645],
        [18.6869, 12.9048, 14.6237],
        [18.9941, 13.0249, 14.8617],
        [18.9963, 13.2738, 14.2319],
        [18.9946, 13.2996, 14.3529]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(16.2373, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.5366,  18.3067, -12.6726],
        [-16.6864,  18.3926, -12.8500],
        [-16.8295,  17.9390, -12.9062],
        [-16.6706,  18.1621, -13.0829],
        [-16.7462,  18.0261, -13.0916],
        [-16.7895,  18.2743, -12.8758]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.0223,  13.0585,  14.7116, -16.5366,  18.3067, -12.6726],
        [ 18.

epoch:0, loss:-2.680757522583008
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9529, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.6129, 13.4643, 14.4806],
        [18.1704, 13.1086, 14.1437],
        [18.8360, 13.2439, 14.4718],
        [18.7897, 12.9362, 14.2827],
        [18.6878, 13.0939, 14.9147],
        [18.6539, 13.1929, 14.6030]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.4579, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.1636,  18.1050, -13.1945],
        [-16.7667,  17.9959, -12.8354],
        [-17.1317,  18.7527, -12.9904],
        [-17.0821,  18.2512, -12.5744],
        [-16.2723,  18.3792, -12.8267],
        [-16.9786,  18.4934, -13.4622]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.6129,  13.4643,  14.4806, -17.1636,  18.1050, -13.1945],
        [ 18.1

epoch:0, loss:-2.679893970489502
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4118, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.9513, 13.4126, 14.5934],
        [18.8900, 13.3514, 14.6556],
        [18.8855, 12.9567, 14.4176],
        [19.1448, 13.2364, 14.8384],
        [18.8389, 13.3719, 14.8099],
        [18.7078, 13.0553, 14.3464]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.2756, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.4987,  18.3084, -13.0139],
        [-16.8774,  18.3955, -12.8910],
        [-16.6995,  18.1060, -12.9210],
        [-16.7624,  18.0055, -12.8190],
        [-16.8381,  18.1018, -12.4909],
        [-16.8033,  18.2339, -13.3568]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.9513,  13.4126,  14.5934, -16.4987,  18.3084, -13.0139],
        [ 18.8

epoch:0, loss:-2.742875576019287
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7478, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.7622, 13.1087, 14.3612],
        [19.0787, 12.8789, 14.3962],
        [19.3866, 13.5050, 14.8754],
        [19.3439, 13.2870, 14.8764],
        [18.4256, 13.0009, 14.3500],
        [19.4647, 13.3910, 14.6245]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.2628, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.8161,  18.7377, -13.4219],
        [-16.8672,  18.1705, -12.8007],
        [-16.5260,  18.4295, -12.6932],
        [-16.7553,  17.8867, -12.8945],
        [-16.6044,  18.3171, -13.2374],
        [-16.6000,  17.8595, -12.5668]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.7622,  13.1087,  14.3612, -16.8161,  18.7377, -13.4219],
        [ 19.0

epoch:0, loss:-2.6907541751861572
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5095, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.0722, 13.4649, 14.8194],
        [18.5413, 13.2578, 14.6454],
        [19.4179, 13.1052, 15.0400],
        [19.1938, 13.2556, 14.7762],
        [19.0480, 13.5748, 14.6730],
        [18.7184, 13.2749, 14.6777]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(21.0256, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.5311,  18.9576, -13.4008],
        [-16.9518,  18.3223, -12.9474],
        [-16.6670,  18.2065, -12.9724],
        [-16.9700,  18.4729, -13.2967],
        [-16.8880,  17.9307, -12.8250],
        [-16.9894,  18.1703, -13.0055]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.0722,  13.4649,  14.8194, -17.5311,  18.9576, -13.4008],
        [ 18.

epoch:0, loss:-2.7690176963806152
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3428, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[18.7330, 13.3678, 14.5792],
        [19.0453, 13.7391, 15.1024],
        [19.2385, 13.6269, 14.9526],
        [19.1116, 13.5248, 15.0651],
        [19.2388, 13.3155, 14.5403],
        [18.7751, 13.1660, 14.7493]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.8065, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.7107,  18.1445, -13.2773],
        [-17.7061,  18.4855, -13.4311],
        [-17.3253,  18.9812, -13.3192],
        [-17.2020,  18.5612, -13.3226],
        [-17.1893,  18.8506, -13.3740],
        [-17.1025,  18.5804, -13.2417]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 18.7330,  13.3678,  14.5792, -16.7107,  18.1445, -13.2773],
        [ 19.

epoch:0, loss:-2.779337167739868
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.6917, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.3691, 13.6782, 15.1824],
        [18.7950, 13.6313, 15.0029],
        [18.7052, 13.2428, 14.2403],
        [18.8243, 13.3054, 14.6074],
        [19.1328, 13.8253, 14.8575],
        [19.2691, 13.5245, 15.0353]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.7389, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.0736,  18.2507, -12.8407],
        [-17.4353,  18.7696, -13.1982],
        [-16.8260,  18.4735, -13.0266],
        [-17.2577,  18.8741, -13.4725],
        [-17.1867,  18.1365, -13.7350],
        [-17.3996,  18.2784, -13.4783]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.3691,  13.6782,  15.1824, -17.0736,  18.2507, -12.8407],
        [ 18.7

epoch:0, loss:-2.7503199577331543
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.6743, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.3009, 13.6110, 14.8808],
        [18.9310, 13.7729, 15.1557],
        [18.7829, 13.6061, 14.8117],
        [19.0556, 13.5273, 14.7705],
        [18.7782, 13.7271, 15.0167],
        [19.4129, 13.1219, 14.9907]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.6184, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.3200,  18.4955, -13.2588],
        [-17.3210,  18.6919, -12.9387],
        [-17.5356,  18.8871, -13.6926],
        [-17.3110,  18.7427, -13.3974],
        [-17.2367,  18.4191, -13.3687],
        [-17.0943,  18.4932, -13.5487]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.3009,  13.6110,  14.8808, -17.3200,  18.4955, -13.2588],
        [ 18.

epoch:0, loss:-2.811506986618042
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5666, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.2113, 13.3392, 14.9798],
        [19.2404, 13.9527, 15.0753],
        [19.2859, 13.5076, 15.3181],
        [19.2010, 13.4923, 14.6567],
        [19.5174, 13.7417, 15.0752],
        [19.0742, 14.0224, 15.2633]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.3836, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.0792,  18.9105, -13.5293],
        [-17.3041,  18.9781, -13.7054],
        [-17.0988,  18.5234, -13.4305],
        [-16.9755,  18.6795, -12.9259],
        [-17.5715,  18.9492, -13.6123],
        [-17.5718,  18.2590, -13.6120]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.2113,  13.3392,  14.9798, -17.0792,  18.9105, -13.5293],
        [ 19.2

epoch:0, loss:-2.8114261627197266
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0171, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.2674, 13.5473, 14.9181],
        [19.0378, 13.6042, 14.8515],
        [19.3597, 13.6056, 14.8656],
        [19.0819, 13.5807, 15.0838],
        [19.7252, 14.0107, 15.0566],
        [19.3999, 13.7561, 15.1906]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.6260, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.3405,  18.8258, -13.3019],
        [-17.0208,  18.7119, -13.5995],
        [-17.5803,  18.6396, -13.6284],
        [-17.0320,  18.6418, -13.4057],
        [-17.3517,  18.5403, -13.0495],
        [-17.6291,  18.6477, -13.7065]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.2674,  13.5473,  14.9181, -17.3405,  18.8258, -13.3019],
        [ 19.

epoch:0, loss:-2.8114662170410156
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9799, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.2906, 13.1699, 14.9212],
        [19.4112, 13.6379, 15.3531],
        [19.5648, 13.8909, 15.5855],
        [19.3690, 13.7386, 15.0446],
        [19.0926, 13.9275, 14.9605],
        [19.6015, 13.4374, 14.9206]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.2403, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.2266,  18.4861, -13.5635],
        [-17.4915,  19.0570, -13.6509],
        [-17.9169,  19.6025, -13.9852],
        [-17.6352,  18.7645, -13.7716],
        [-17.7864,  18.9947, -13.7251],
        [-17.7708,  19.0391, -14.0343]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.2906,  13.1699,  14.9212, -17.2266,  18.4861, -13.5635],
        [ 19.

epoch:0, loss:-2.8054988384246826
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9001, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.2943, 13.8302, 15.0259],
        [19.5518, 13.6674, 15.1101],
        [19.0037, 14.0640, 15.1966],
        [19.2888, 13.6961, 15.0754],
        [19.7176, 13.9773, 15.3633],
        [19.2710, 13.8709, 15.1231]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.4296, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.7046,  18.9953, -14.0905],
        [-17.0239,  18.5837, -13.6133],
        [-17.7601,  19.0449, -13.8790],
        [-17.8511,  19.1739, -13.8128],
        [-16.7523,  18.6930, -13.1007],
        [-17.5991,  19.3959, -13.8716]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.2943,  13.8302,  15.0259, -17.7046,  18.9953, -14.0905],
        [ 19.

epoch:0, loss:-2.864137649536133
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7332, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.3925, 13.8940, 15.1609],
        [19.0101, 13.9732, 15.1990],
        [19.4674, 13.7834, 15.0425],
        [19.6588, 14.1799, 15.6423],
        [19.2882, 13.4434, 14.9211],
        [19.1317, 14.0804, 15.3979]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(12.1341, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-16.9663,  18.4340, -13.3375],
        [-17.5669,  18.7535, -13.6331],
        [-17.2916,  18.7893, -13.4942],
        [-17.1975,  18.5839, -13.2035],
        [-17.6325,  19.0190, -14.1065],
        [-17.9406,  19.0824, -13.7704]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.3925,  13.8940,  15.1609, -16.9663,  18.4340, -13.3375],
        [ 19.0

epoch:0, loss:-2.876413583755493
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.6000, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.4956, 13.8484, 15.3490],
        [19.6870, 13.7141, 15.2028],
        [19.7932, 13.8519, 14.5638],
        [19.4514, 13.9934, 14.8685],
        [18.9169, 13.4805, 15.1292],
        [18.9260, 13.8292, 15.0221]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.9902, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.3366,  18.7386, -13.4669],
        [-17.5511,  18.5595, -13.9063],
        [-17.3575,  18.8416, -13.6833],
        [-17.6634,  19.1782, -13.8469],
        [-17.5786,  18.9908, -13.8355],
        [-18.2154,  19.6730, -14.0547]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.4956,  13.8484,  15.3490, -17.3366,  18.7386, -13.4669],
        [ 19.6

epoch:0, loss:-2.9616751670837402
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.8573, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.4023, 13.9142, 15.5420],
        [19.3910, 13.7342, 15.2286],
        [19.4741, 14.3794, 15.4523],
        [19.3070, 13.7562, 15.0609],
        [19.1279, 13.5890, 15.1772],
        [19.8689, 13.9903, 15.1890]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.0228, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.1321,  19.4329, -14.3026],
        [-17.3225,  19.0568, -13.8513],
        [-17.5436,  19.0079, -13.5781],
        [-17.4583,  18.7836, -13.6871],
        [-17.5841,  18.9350, -13.8705],
        [-17.8084,  19.1428, -13.7213]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.4023,  13.9142,  15.5420, -18.1321,  19.4329, -14.3026],
        [ 19.

epoch:0, loss:-2.8871803283691406
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.4379, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.5338, 13.8814, 15.4416],
        [19.6064, 14.0841, 15.0937],
        [20.0578, 14.3365, 15.4005],
        [19.7484, 14.0391, 15.3455],
        [19.3790, 13.9724, 15.1725],
        [19.7673, 14.0356, 15.1634]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.7519, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.5388,  19.0143, -13.9424],
        [-17.9850,  19.4254, -14.3518],
        [-18.0526,  18.9884, -13.8078],
        [-17.7479,  18.9908, -13.4874],
        [-17.9646,  18.9611, -13.7215],
        [-17.5896,  18.9920, -14.0857]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.5338,  13.8814,  15.4416, -17.5388,  19.0143, -13.9424],
        [ 19.

epoch:0, loss:-2.9297962188720703
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.0054, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.0472, 14.1573, 15.3701],
        [19.3656, 14.0668, 15.0682],
        [19.6826, 14.0411, 15.3320],
        [19.7574, 14.2288, 14.7535],
        [19.7275, 14.2244, 15.5525],
        [19.6248, 14.4821, 15.6954]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(29.9867, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.6689,  19.2381, -13.9317],
        [-17.7094,  19.4595, -14.0049],
        [-17.2856,  18.8222, -14.0436],
        [-17.3802,  19.2613, -13.9761],
        [-17.5103,  18.7031, -13.8680],
        [-17.8160,  18.9834, -13.6642]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.0472,  14.1573,  15.3701, -17.6689,  19.2381, -13.9317],
        [ 19.

epoch:0, loss:-2.977074146270752
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6807, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.4690, 14.0606, 15.0783],
        [19.9286, 14.0159, 15.4820],
        [19.8205, 14.3978, 15.3910],
        [19.9943, 14.2066, 15.5279],
        [19.9653, 14.4096, 15.6085],
        [19.4877, 14.0466, 15.3890]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.5892, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.5217,  19.2094, -14.0285],
        [-18.0594,  19.1477, -14.3470],
        [-17.6331,  19.1880, -14.1406],
        [-17.5702,  18.9620, -13.7046],
        [-17.4672,  18.7289, -13.8263],
        [-18.1979,  19.5888, -14.2056]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.4690,  14.0606,  15.0783, -17.5217,  19.2094, -14.0285],
        [ 19.9

epoch:0, loss:-2.9695184230804443
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9240, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.7102, 14.2751, 15.6994],
        [19.5710, 14.0320, 15.0482],
        [19.8810, 14.2692, 15.7102],
        [19.3366, 13.7192, 15.1502],
        [19.8308, 14.0370, 15.4209],
        [19.6033, 14.2678, 15.3815]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.1409, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.8352,  18.8388, -14.3209],
        [-17.8006,  19.5017, -14.1467],
        [-17.5109,  19.3245, -14.3912],
        [-17.4684,  18.6758, -13.6412],
        [-17.7678,  19.2552, -14.1223],
        [-17.4765,  19.1538, -13.7125]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.7102,  14.2751,  15.6994, -17.8352,  18.8388, -14.3209],
        [ 19.

epoch:0, loss:-3.0106008052825928
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1990, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.8325, 14.3057, 16.0878],
        [19.8772, 14.4936, 15.6021],
        [19.9989, 14.2531, 15.8124],
        [19.3591, 14.3858, 15.2714],
        [19.8729, 14.4059, 15.2631],
        [19.5479, 14.1808, 15.4564]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.3634, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.8340,  18.9060, -14.2112],
        [-17.5067,  18.9030, -13.9949],
        [-17.7826,  19.4379, -14.6759],
        [-17.5562,  18.7309, -13.7142],
        [-17.9296,  19.0334, -14.1521],
        [-18.2567,  19.4101, -14.2812]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.8325,  14.3057,  16.0878, -17.8340,  18.9060, -14.2112],
        [ 19.

epoch:0, loss:-3.035569667816162
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8595, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.5976, 14.4338, 15.6961],
        [20.0677, 14.3821, 15.7926],
        [20.1732, 14.5567, 15.7506],
        [19.6911, 14.2339, 15.3697],
        [19.3707, 14.0192, 15.3042],
        [19.8334, 14.2555, 15.8847]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.3525, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.7595,  19.2911, -14.3788],
        [-18.0443,  19.2783, -14.3641],
        [-18.1665,  19.3804, -14.1326],
        [-17.7418,  18.9858, -14.3598],
        [-17.6806,  19.1689, -14.0352],
        [-17.7624,  19.2512, -14.0557]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.5976,  14.4338,  15.6961, -17.7595,  19.2911, -14.3788],
        [ 20.0

epoch:0, loss:-2.962273359298706
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.6050, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.2042, 14.4542, 15.7437],
        [20.0521, 14.2855, 15.7645],
        [19.8036, 14.2712, 15.8861],
        [19.8798, 14.1373, 15.8003],
        [19.8227, 14.6387, 15.8288],
        [19.7773, 14.5271, 15.6263]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.9856, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-17.6449,  19.6029, -14.3668],
        [-18.1838,  19.7103, -14.4939],
        [-18.4812,  20.0523, -14.9537],
        [-17.9642,  19.3233, -13.8137],
        [-17.9219,  19.2426, -14.4394],
        [-18.1471,  19.1666, -14.3057]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.2042,  14.4542,  15.7437, -17.6449,  19.6029, -14.3668],
        [ 20.0

epoch:0, loss:-2.942486047744751
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4438, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.7012, 14.6914, 16.0968],
        [19.6922, 14.0231, 15.3189],
        [19.8628, 14.2470, 15.6222],
        [20.2539, 14.8969, 15.7119],
        [19.6050, 14.2912, 15.7928],
        [19.9791, 14.5382, 15.8102]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.0189, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.3834,  19.3301, -14.4054],
        [-18.4228,  19.7657, -14.6806],
        [-17.6408,  19.5516, -14.3636],
        [-18.3411,  19.1115, -14.4662],
        [-18.1604,  19.5714, -14.5087],
        [-18.0707,  19.5060, -14.6175]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.7012,  14.6914,  16.0968, -18.3834,  19.3301, -14.4054],
        [ 19.6

epoch:0, loss:-3.0405333042144775
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8980, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.9841, 14.7402, 16.0534],
        [19.9484, 14.8155, 16.2501],
        [20.3307, 14.7506, 15.7760],
        [19.7860, 14.5318, 15.7457],
        [20.2317, 14.2356, 15.8319],
        [19.8721, 14.5470, 16.0941]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.6589, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.1674,  19.8817, -14.5617],
        [-18.1798,  19.4717, -14.6963],
        [-18.0654,  19.4626, -14.3145],
        [-18.4218,  19.4371, -14.5469],
        [-17.6931,  19.2815, -14.1021],
        [-18.2367,  19.6407, -14.6814]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.9841,  14.7402,  16.0534, -18.1674,  19.8817, -14.5617],
        [ 19.

epoch:0, loss:-3.081613540649414
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9261, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.1178, 14.5241, 15.8779],
        [19.9274, 14.5244, 15.8105],
        [20.1643, 14.6297, 15.8151],
        [19.9783, 14.2318, 15.4049],
        [20.5503, 14.8385, 16.1558],
        [20.3611, 14.4302, 15.8703]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(31.5402, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.2200,  19.6240, -14.4907],
        [-18.2507,  19.7651, -14.3777],
        [-17.8262,  19.5996, -14.2981],
        [-18.0050,  19.9300, -14.9008],
        [-18.4453,  19.2918, -14.6853],
        [-18.0933,  19.6161, -14.5569]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.1178,  14.5241,  15.8779, -18.2200,  19.6240, -14.4907],
        [ 19.9

epoch:0, loss:-3.0750958919525146
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.7114, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.7079, 14.2308, 15.7561],
        [20.1023, 14.5965, 16.0438],
        [20.4161, 14.6703, 15.5467],
        [20.1468, 14.5189, 15.7116],
        [19.7377, 14.4495, 15.6247],
        [20.7024, 14.9342, 16.3452]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(31.0635, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.1024,  19.3089, -14.4679],
        [-18.2972,  19.7637, -14.8670],
        [-18.4718,  20.1758, -14.9628],
        [-18.2358,  19.4983, -14.5007],
        [-18.5713,  19.7954, -14.8962],
        [-18.3376,  19.9004, -14.5942]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.7079,  14.2308,  15.7561, -18.1024,  19.3089, -14.4679],
        [ 20.

epoch:0, loss:-3.0920677185058594
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3142, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.6945, 14.2090, 15.6472],
        [20.2312, 14.6780, 15.8660],
        [19.6775, 14.8231, 15.7464],
        [20.0282, 14.8435, 15.6667],
        [20.1890, 14.8536, 16.1124],
        [19.8410, 14.6455, 15.7834]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(29.5469, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.2961,  19.7693, -14.5711],
        [-17.9707,  19.8156, -14.4070],
        [-18.1649,  19.2926, -14.4742],
        [-18.4050,  19.6374, -14.9453],
        [-17.5531,  19.2479, -14.0992],
        [-18.6155,  19.4632, -14.6365]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.6945,  14.2090,  15.6472, -18.2961,  19.7693, -14.5711],
        [ 20.

epoch:0, loss:-3.1205036640167236
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9166, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.5865, 14.9169, 16.1666],
        [20.6934, 14.4865, 15.9215],
        [20.3165, 14.6342, 16.1316],
        [20.0406, 14.5752, 15.7718],
        [19.9637, 14.4448, 15.8580],
        [20.2485, 15.2116, 16.0650]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.8750, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.9432,  20.0083, -14.9764],
        [-18.5066,  19.3916, -14.7560],
        [-18.4456,  19.6207, -14.8100],
        [-18.4020,  19.5672, -14.4671],
        [-18.4814,  19.7608, -14.6503],
        [-18.5005,  20.0612, -14.5495]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.5865,  14.9169,  16.1666, -18.9432,  20.0083, -14.9764],
        [ 20.

epoch:0, loss:-3.1328017711639404
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4696, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.2750, 15.0266, 16.1676],
        [20.0638, 14.8975, 15.4636],
        [20.0997, 14.7951, 15.6840],
        [20.4483, 14.9417, 16.1538],
        [20.0822, 14.7053, 15.8702],
        [20.3756, 14.9618, 15.9814]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.4706, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.5050,  20.0494, -14.6072],
        [-18.2034,  19.9926, -14.7795],
        [-18.7906,  20.1296, -15.2551],
        [-17.9014,  19.5890, -14.4020],
        [-18.6731,  19.7127, -14.8322],
        [-18.3967,  19.9435, -14.6335]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.2750,  15.0266,  16.1676, -18.5050,  20.0494, -14.6072],
        [ 20.

epoch:0, loss:-3.1044037342071533
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9273, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.9613, 14.8516, 16.1345],
        [20.3396, 15.0378, 16.1358],
        [20.3422, 15.0479, 16.2232],
        [20.3854, 14.9481, 16.0779],
        [20.5731, 15.1308, 16.2474],
        [20.2434, 15.2087, 16.1608]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.0979, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.6527,  19.8265, -14.9893],
        [-18.2137,  19.6072, -14.8622],
        [-18.6005,  19.9550, -14.7908],
        [-18.7912,  19.9418, -14.7809],
        [-18.2189,  19.4858, -14.7374],
        [-18.4115,  20.0007, -14.7748]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.9613,  14.8516,  16.1345, -18.6527,  19.8265, -14.9893],
        [ 20.

epoch:0, loss:-3.112396478652954
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9034, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.5349, 14.8410, 16.2600],
        [20.2966, 14.9936, 16.1710],
        [20.2255, 15.0741, 16.1765],
        [20.1565, 14.5863, 15.9765],
        [20.5125, 15.1610, 16.2170],
        [20.3777, 14.9096, 16.1764]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.4405, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.0930,  19.6064, -14.5222],
        [-18.4411,  20.1903, -15.1355],
        [-18.3182,  20.0343, -14.5798],
        [-18.3316,  19.5751, -15.0076],
        [-18.6536,  19.9199, -15.0287],
        [-18.6620,  20.1860, -15.1773]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.5349,  14.8410,  16.2600, -18.0930,  19.6064, -14.5222],
        [ 20.2

epoch:0, loss:-3.2385170459747314
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.4398, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[19.9979, 15.0763, 16.1209],
        [20.9089, 15.7501, 16.2535],
        [19.8433, 14.6033, 15.9177],
        [20.2582, 14.7222, 16.5203],
        [20.3923, 15.1801, 16.4675],
        [20.3554, 14.9649, 15.8448]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(31.4530, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.1155,  19.4421, -14.7696],
        [-18.4880,  20.1829, -15.3930],
        [-18.0586,  19.5979, -14.7180],
        [-18.7290,  20.2273, -15.4106],
        [-19.2455,  20.5059, -15.2247],
        [-18.8267,  20.1452, -14.8888]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 19.9979,  15.0763,  16.1209, -18.1155,  19.4421, -14.7696],
        [ 20.

epoch:0, loss:-3.1705269813537598
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7183, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.3030, 14.7589, 16.1196],
        [20.1527, 15.3279, 16.1735],
        [20.6276, 15.2235, 16.3287],
        [20.1526, 15.0936, 16.6108],
        [20.3923, 14.8009, 15.7987],
        [20.6100, 14.9996, 16.1053]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(18.8005, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.5450,  20.0662, -15.0667],
        [-18.7655,  20.3533, -15.0802],
        [-18.2898,  19.9359, -14.7919],
        [-18.8896,  20.1456, -14.9579],
        [-18.9182,  19.8837, -14.8444],
        [-18.5020,  20.1332, -15.3134]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.3030,  14.7589,  16.1196, -18.5450,  20.0662, -15.0667],
        [ 20.

epoch:0, loss:-3.2629685401916504
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3746, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.5665, 15.0443, 16.1476],
        [20.5442, 14.7801, 16.1753],
        [20.7624, 14.7690, 16.1822],
        [20.2689, 14.9509, 16.1456],
        [20.9061, 15.1511, 16.4478],
        [20.3778, 14.8812, 16.3981]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.1491, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.8159,  19.6462, -15.0823],
        [-18.2898,  20.0027, -15.2353],
        [-18.4538,  19.9257, -14.7878],
        [-18.3516,  20.0148, -14.8387],
        [-19.0208,  20.2467, -14.6103],
        [-18.5384,  19.9511, -14.9300]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.5665,  15.0443,  16.1476, -18.8159,  19.6462, -15.0823],
        [ 20.

epoch:0, loss:-3.213564872741699
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9493, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.8757, 15.2834, 16.3238],
        [20.6509, 15.1938, 16.1498],
        [20.8024, 15.2895, 16.5486],
        [20.1737, 14.7248, 16.0156],
        [20.9560, 15.1960, 16.5297],
        [20.2319, 14.9066, 16.1538]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(29.9041, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.3640,  19.6618, -15.0087],
        [-19.0147,  20.2610, -14.9933],
        [-18.8548,  19.8277, -15.1999],
        [-18.5167,  20.2787, -15.0502],
        [-19.3543,  20.7134, -15.4012],
        [-18.3904,  19.7059, -15.1105]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.8757,  15.2834,  16.3238, -18.3640,  19.6618, -15.0087],
        [ 20.6

epoch:0, loss:-3.2828867435455322
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8570, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.4567, 15.0733, 16.2594],
        [20.6515, 15.0529, 16.4334],
        [21.3032, 15.6282, 16.5105],
        [20.7207, 15.1471, 16.0738],
        [20.4980, 14.9990, 16.5448],
        [20.4383, 14.9992, 16.1987]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.8560, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.1271,  19.7729, -14.9087],
        [-18.9759,  20.1032, -15.4807],
        [-18.1830,  19.4613, -14.9049],
        [-18.7645,  19.6013, -15.3550],
        [-18.5579,  20.1910, -15.1835],
        [-18.1128,  19.9322, -14.6049]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.4567,  15.0733,  16.2594, -18.1271,  19.7729, -14.9087],
        [ 20.

epoch:0, loss:-3.3202743530273438
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4997, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.6035, 15.3468, 16.3049],
        [20.4410, 15.0752, 16.1175],
        [21.0782, 15.6579, 16.6285],
        [20.8741, 15.1626, 16.6862],
        [20.4473, 15.2669, 16.0752],
        [20.5553, 15.2619, 16.1123]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.6136, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.7478,  20.2617, -15.1506],
        [-18.6618,  19.9770, -15.4079],
        [-19.0507,  20.1220, -15.1103],
        [-18.8119,  19.9583, -14.9027],
        [-18.7891,  20.3301, -14.9216],
        [-18.8454,  19.9023, -15.1330]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.6035,  15.3468,  16.3049, -18.7478,  20.2617, -15.1506],
        [ 20.

epoch:0, loss:-3.316230535507202
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7749, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.3481, 15.7554, 16.8098],
        [21.1564, 15.7026, 16.7742],
        [21.2601, 15.2190, 16.7172],
        [20.7422, 15.4113, 16.3599],
        [20.3930, 15.4152, 16.5929],
        [20.6049, 14.9371, 16.2762]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.0849, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.6296,  20.0774, -15.2903],
        [-18.7712,  20.2207, -15.2226],
        [-18.8238,  20.2822, -15.0306],
        [-19.3722,  20.4077, -15.4148],
        [-18.9303,  20.4059, -15.1616],
        [-18.5626,  20.0260, -14.9154]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.3481,  15.7554,  16.8098, -18.6296,  20.0774, -15.2903],
        [ 21.1

epoch:0, loss:-3.3335378170013428
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7717, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.5864, 15.1089, 16.4002],
        [21.1595, 15.6974, 16.6519],
        [20.9564, 15.5429, 16.7673],
        [20.2124, 15.2598, 16.3504],
        [20.7571, 15.3839, 16.7744],
        [20.9881, 15.4172, 16.7530]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.0554, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.4306,  20.6007, -15.4610],
        [-18.7467,  20.4886, -15.3559],
        [-18.7368,  20.3477, -14.8675],
        [-19.0349,  20.7320, -15.5415],
        [-18.8463,  20.4424, -15.3293],
        [-19.2742,  20.4851, -15.3440]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.5864,  15.1089,  16.4002, -19.4306,  20.6007, -15.4610],
        [ 21.

epoch:0, loss:-3.3823418617248535
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8058, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.5792, 15.3565, 16.7844],
        [21.0845, 15.5920, 16.3797],
        [21.0402, 15.3484, 16.8795],
        [21.1866, 15.6205, 16.8560],
        [20.4061, 15.2595, 16.1726],
        [20.6361, 15.1766, 16.6775]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(32.0133, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.9366,  20.5275, -15.7568],
        [-18.3077,  20.0331, -15.4015],
        [-18.6094,  19.9668, -15.0230],
        [-19.3739,  20.3623, -15.4461],
        [-19.0455,  20.3433, -15.4070],
        [-18.8242,  20.2537, -15.3589]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.5792,  15.3565,  16.7844, -18.9366,  20.5275, -15.7568],
        [ 21.

epoch:0, loss:-3.3559255599975586
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7156, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.1116, 15.9055, 16.9435],
        [21.2335, 15.6534, 17.1485],
        [20.8916, 15.6780, 16.7675],
        [19.9533, 14.8377, 16.2142],
        [21.0823, 15.7526, 16.7871],
        [20.8806, 15.8272, 16.1835]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.2790, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.9391,  20.4625, -15.4661],
        [-19.4904,  20.4066, -15.6778],
        [-18.5930,  19.9334, -14.9678],
        [-18.7025,  20.1859, -15.3203],
        [-19.0866,  20.2183, -15.6149],
        [-19.0860,  20.5700, -15.6561]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.1116,  15.9055,  16.9435, -18.9391,  20.4625, -15.4661],
        [ 21.

epoch:0, loss:-3.3653271198272705
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.5688, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.0995, 15.3982, 16.6125],
        [20.8920, 15.6318, 16.9758],
        [21.0077, 15.6413, 16.6678],
        [21.1734, 15.6585, 17.2332],
        [21.0502, 16.0416, 17.0413],
        [21.0332, 15.5287, 16.8916]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.2058, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.9923,  20.5630, -15.7554],
        [-19.3995,  20.6431, -15.6945],
        [-19.3659,  20.7209, -15.6254],
        [-18.7769,  20.3090, -15.2799],
        [-18.9996,  20.6702, -15.7920],
        [-19.3043,  20.6975, -15.7565]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.0995,  15.3982,  16.6125, -18.9923,  20.5630, -15.7554],
        [ 20.

epoch:0, loss:-3.394116163253784
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8689, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.6467, 15.5369, 16.2146],
        [21.5078, 15.8778, 16.8833],
        [20.8928, 15.4808, 16.9457],
        [21.1518, 15.0735, 17.0055],
        [21.0310, 15.4280, 16.5084],
        [20.5214, 15.1190, 16.1038]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(33.3508, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-18.8590,  20.7442, -15.5756],
        [-19.2815,  20.7589, -15.3701],
        [-19.0759,  20.9905, -15.5578],
        [-19.0595,  20.5607, -15.6971],
        [-19.4230,  21.0342, -15.5518],
        [-19.2251,  20.6955, -15.6395]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.6467,  15.5369,  16.2146, -18.8590,  20.7442, -15.5756],
        [ 21.5

epoch:0, loss:-3.452342987060547
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6085, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.0582, 15.8701, 16.5148],
        [20.9986, 15.6670, 16.5075],
        [20.4320, 15.4312, 17.0067],
        [20.5535, 15.4533, 16.2148],
        [21.8525, 15.9434, 17.0996],
        [20.7741, 15.9411, 16.6105]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.6323, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.0693,  20.4258, -15.6040],
        [-19.3668,  20.5698, -15.7787],
        [-19.0429,  20.6358, -15.7774],
        [-19.5036,  21.1800, -15.9849],
        [-19.0977,  20.6634, -15.6234],
        [-18.8759,  20.0804, -15.3479]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.0582,  15.8701,  16.5148, -19.0693,  20.4258, -15.6040],
        [ 20.9

epoch:0, loss:-3.4763267040252686
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6303, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.0532, 15.2594, 16.3951],
        [21.0128, 15.6943, 16.8111],
        [20.8863, 15.1425, 16.5714],
        [20.8813, 16.0975, 16.9632],
        [20.6247, 15.4978, 16.7112],
        [21.4470, 15.7058, 17.2352]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(32.8138, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.1385,  20.6491, -15.5838],
        [-18.7711,  20.9486, -15.8541],
        [-19.4846,  20.8972, -15.6756],
        [-18.8101,  20.4809, -15.5945],
        [-19.8042,  21.1365, -15.8311],
        [-19.0305,  20.4359, -15.5711]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.0532,  15.2594,  16.3951, -19.1385,  20.6491, -15.5838],
        [ 21.

epoch:0, loss:-3.467524766921997
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.4811, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.4524, 15.7588, 16.9880],
        [20.5252, 15.0735, 16.7208],
        [21.0526, 15.7047, 17.2744],
        [20.7803, 15.5856, 16.8526],
        [21.1088, 15.3821, 16.8523],
        [21.5246, 15.8761, 17.2120]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(32.8259, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.5127,  21.0352, -15.6962],
        [-19.1347,  20.6976, -15.5201],
        [-19.0537,  20.3640, -15.8539],
        [-19.4037,  21.0404, -15.8304],
        [-19.4102,  20.5030, -16.1472],
        [-19.2519,  20.6232, -15.8432]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.4524,  15.7588,  16.9880, -19.5127,  21.0352, -15.6962],
        [ 20.5

epoch:0, loss:-3.4637691974639893
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6543, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.4071, 15.9597, 17.2520],
        [21.3093, 15.6977, 17.5191],
        [21.0344, 15.7933, 16.9959],
        [20.9997, 15.5943, 17.0014],
        [21.1716, 15.7290, 17.1605],
        [21.0064, 16.1716, 16.7872]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(32.5062, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.1658,  20.5912, -15.7580],
        [-18.9472,  20.4659, -15.7032],
        [-18.3757,  20.0314, -15.2056],
        [-20.0013,  20.9550, -16.0598],
        [-19.2634,  20.8142, -15.7877],
        [-19.3543,  20.4906, -15.6922]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.4071,  15.9597,  17.2520, -19.1658,  20.5912, -15.7580],
        [ 21.

epoch:0, loss:-3.4571874141693115
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.2507, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.3917, 16.0582, 17.0304],
        [21.6831, 15.9913, 17.3725],
        [21.0821, 15.6802, 16.7839],
        [21.0386, 15.9528, 17.1092],
        [21.0355, 15.4439, 16.9868],
        [21.6803, 15.9335, 16.9515]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.1621, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.0958,  20.7425, -15.6335],
        [-19.6366,  20.6683, -15.6270],
        [-19.9712,  21.0535, -16.2411],
        [-19.6028,  21.1759, -16.0103],
        [-19.1231,  20.8262, -15.6274],
        [-19.8318,  20.8473, -15.8246]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.3917,  16.0582,  17.0304, -19.0958,  20.7425, -15.6335],
        [ 21.

epoch:0, loss:-3.4881088733673096
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1293, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.4069, 15.5948, 16.8416],
        [21.0041, 15.3841, 16.6281],
        [21.1402, 16.1498, 17.4151],
        [21.6056, 16.2169, 16.8380],
        [21.4442, 16.1173, 16.9942],
        [21.0282, 15.5931, 16.8909]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(40.3909, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.6171,  20.8496, -15.6024],
        [-20.2089,  21.0091, -16.3502],
        [-19.7427,  20.9233, -16.2181],
        [-19.5794,  20.4364, -16.0150],
        [-19.1917,  20.4160, -15.5660],
        [-19.5404,  20.8306, -16.1818]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.4069,  15.5948,  16.8416, -19.6171,  20.8496, -15.6024],
        [ 21.

epoch:0, loss:-3.573012113571167
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3923, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[20.8791, 15.8448, 16.9055],
        [21.2983, 15.8630, 16.9113],
        [21.2209, 15.9418, 17.3301],
        [20.7717, 15.5164, 16.5820],
        [21.5278, 16.3681, 17.6048],
        [21.0511, 15.9369, 17.2840]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.4435, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.8877,  20.8875, -16.1367],
        [-19.4030,  20.6007, -15.9744],
        [-19.9900,  21.3463, -16.8700],
        [-19.8638,  20.9848, -16.1030],
        [-19.0548,  20.6927, -15.5729],
        [-19.9455,  21.2125, -16.4885]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 20.8791,  15.8448,  16.9055, -19.8877,  20.8875, -16.1367],
        [ 21.2

epoch:0, loss:-3.565258264541626
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.6206, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.0239, 16.3714, 17.8054],
        [21.4077, 16.6536, 17.4675],
        [21.6649, 16.1102, 17.4483],
        [20.9012, 16.0847, 16.9794],
        [21.8618, 16.2489, 17.5818],
        [21.8049, 16.5018, 17.4150]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.9321, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.2498,  20.6565, -15.7069],
        [-19.5378,  21.2587, -16.2308],
        [-19.4861,  20.7698, -15.9526],
        [-19.7245,  21.4545, -16.6294],
        [-19.8471,  21.3684, -16.6095],
        [-19.3645,  20.7839, -15.2532]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.0239,  16.3714,  17.8054, -19.2498,  20.6565, -15.7069],
        [ 21.4

epoch:0, loss:-3.5538313388824463
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6570, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.5281, 16.2712, 17.0036],
        [21.2601, 16.2745, 17.5286],
        [21.4600, 16.0403, 17.4664],
        [21.2254, 16.3600, 17.4401],
        [21.4680, 16.1023, 17.6691],
        [21.3989, 16.0717, 17.0425]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(19.9845, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.9430,  21.4172, -16.5012],
        [-19.4873,  21.0673, -16.2014],
        [-20.0554,  20.8358, -16.3844],
        [-19.6110,  20.8340, -16.2978],
        [-19.8987,  21.3174, -16.5857],
        [-20.1596,  21.0732, -16.2577]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.5281,  16.2712,  17.0036, -19.9430,  21.4172, -16.5012],
        [ 21.

epoch:0, loss:-3.650567054748535
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9106, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.6382, 16.4020, 17.3117],
        [21.9916, 16.6624, 17.8238],
        [20.8089, 16.0176, 17.2363],
        [21.7521, 16.3020, 17.7797],
        [21.4579, 15.9123, 17.2636],
        [21.5217, 16.3943, 17.1613]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(32.7402, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.2048,  21.1166, -15.6420],
        [-19.6208,  21.0571, -16.2543],
        [-19.8261,  21.4440, -16.5947],
        [-19.4304,  20.5419, -15.6806],
        [-19.7889,  20.9318, -16.4325],
        [-19.6288,  21.0000, -16.0314]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.6382,  16.4020,  17.3117, -19.2048,  21.1166, -15.6420],
        [ 21.9

epoch:0, loss:-3.6209604740142822
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0210, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.7524, 16.2620, 17.7782],
        [21.5445, 16.2285, 17.5612],
        [21.7134, 16.5457, 17.2067],
        [21.9914, 16.4568, 17.4130],
        [21.1923, 16.4998, 17.1378],
        [21.8348, 16.8111, 17.7509]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.8539, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.7064,  20.3513, -15.7483],
        [-19.8633,  21.4025, -16.3409],
        [-19.8705,  21.0602, -16.6473],
        [-19.4632,  21.1882, -16.5101],
        [-19.6913,  21.5038, -16.2525],
        [-19.7621,  21.4918, -16.0873]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.7524,  16.2620,  17.7782, -19.7064,  20.3513, -15.7483],
        [ 21.

epoch:0, loss:-3.6465952396392822
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.6171, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.5578, 16.2414, 17.2381],
        [21.4583, 16.1100, 16.8497],
        [21.7914, 16.5056, 17.8176],
        [21.3800, 16.0534, 17.1964],
        [21.7753, 16.0525, 17.4720],
        [21.4731, 16.5484, 17.1371]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(27.1271, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.9799,  21.2507, -16.1820],
        [-19.7652,  20.9526, -16.4307],
        [-19.5823,  21.0657, -16.2334],
        [-19.9638,  21.4216, -16.6649],
        [-19.5537,  20.8978, -16.0448],
        [-19.7529,  21.1924, -16.3889]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.5578,  16.2414,  17.2381, -19.9799,  21.2507, -16.1820],
        [ 21.

epoch:1, loss:-3.6536009311676025
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5856, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.5682, 16.5071, 17.2744],
        [21.6116, 16.4594, 17.5756],
        [21.7194, 16.5364, 16.9638],
        [21.9738, 16.5871, 17.8582],
        [21.4053, 16.6165, 17.5030],
        [21.7252, 16.5132, 17.4893]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(40.5189, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.9107,  21.6378, -16.5507],
        [-20.1238,  21.3274, -16.5558],
        [-20.0626,  21.4031, -16.2631],
        [-19.7426,  20.6329, -15.8756],
        [-20.2352,  20.9023, -16.5126],
        [-19.6910,  20.8037, -16.2109]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.5682,  16.5071,  17.2744, -19.9107,  21.6378, -16.5507],
        [ 21.

epoch:1, loss:-3.6638739109039307
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6372, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.5886, 16.1255, 16.9930],
        [21.9733, 16.4421, 17.6535],
        [21.5854, 16.2035, 17.7454],
        [21.8109, 16.5514, 17.7753],
        [21.6459, 16.5533, 17.3831],
        [21.6436, 16.2949, 17.5807]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.9142, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.1554,  21.5647, -16.9849],
        [-20.0277,  21.1067, -16.1491],
        [-19.7219,  21.4836, -15.9653],
        [-19.6106,  21.2636, -16.8690],
        [-19.9881,  21.2340, -16.0851],
        [-19.8044,  21.4360, -16.3743]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.5886,  16.1255,  16.9930, -20.1554,  21.5647, -16.9849],
        [ 21.

epoch:1, loss:-3.6470632553100586
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.4466, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.7174, 16.4843, 17.4529],
        [21.5365, 16.3463, 17.2156],
        [21.7649, 16.6685, 17.5014],
        [21.2815, 16.0035, 16.9739],
        [21.4102, 16.3252, 17.3956],
        [21.5340, 16.5775, 17.6429]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.9360, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.6323,  21.9911, -17.0343],
        [-19.9164,  21.3351, -16.8989],
        [-20.3796,  21.5277, -16.9240],
        [-20.2082,  21.5331, -16.8496],
        [-20.4029,  21.7086, -17.1701],
        [-20.1077,  21.6441, -17.2081]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.7174,  16.4843,  17.4529, -20.6323,  21.9911, -17.0343],
        [ 21.

epoch:1, loss:-3.707339286804199
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9748, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.4099, 15.8731, 16.9137],
        [21.9983, 16.8234, 17.4995],
        [22.1157, 16.6126, 17.4702],
        [21.7320, 16.7049, 17.6669],
        [21.8120, 16.0645, 17.2902],
        [22.0847, 16.8158, 17.7865]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.5092, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.9356,  21.1497, -16.7287],
        [-19.9693,  21.3541, -16.4574],
        [-20.5274,  21.1997, -16.7818],
        [-20.5132,  22.0686, -17.2375],
        [-20.4986,  21.6886, -16.7430],
        [-19.9868,  21.2469, -16.7884]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.4099,  15.8731,  16.9137, -19.9356,  21.1497, -16.7287],
        [ 21.9

epoch:1, loss:-3.76430082321167
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.8932, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.2224, 16.3225, 17.9483],
        [21.8722, 16.8537, 18.2292],
        [22.0349, 16.7056, 17.9755],
        [21.8153, 16.5585, 17.7873],
        [21.7952, 16.2493, 17.7498],
        [21.9805, 16.3382, 17.5477]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.6904, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-19.8538,  21.2166, -16.6286],
        [-19.5422,  21.3895, -16.5547],
        [-20.3508,  21.0336, -16.8185],
        [-20.0793,  21.6084, -16.6261],
        [-20.3644,  21.5515, -16.9097],
        [-20.1802,  21.3348, -16.6012]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.2224,  16.3225,  17.9483, -19.8538,  21.2166, -16.6286],
        [ 21.87

epoch:1, loss:-3.791496753692627
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4864, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.1424, 16.8065, 18.0277],
        [22.2178, 16.6571, 17.6068],
        [22.0635, 16.4268, 17.6500],
        [21.8298, 16.9549, 18.0793],
        [22.3054, 16.5752, 18.0813],
        [21.4111, 16.2918, 17.4622]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.0652, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.1587,  21.3537, -16.8918],
        [-20.6245,  22.1682, -16.8997],
        [-20.3327,  21.3869, -17.0136],
        [-20.3471,  21.5428, -16.9203],
        [-20.2330,  21.6260, -16.8153],
        [-20.4414,  21.9252, -16.9605]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.1424,  16.8065,  18.0277, -20.1587,  21.3537, -16.8918],
        [ 22.2

epoch:1, loss:-3.802527666091919
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5939, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.9125, 16.7481, 18.0307],
        [21.8143, 16.8302, 18.1123],
        [22.3118, 17.3745, 18.1352],
        [21.9705, 16.4822, 17.9863],
        [21.8500, 16.7164, 17.7462],
        [22.3120, 16.9917, 17.7987]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(33.6805, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.7195,  21.7855, -16.8762],
        [-20.2044,  21.6524, -16.7165],
        [-20.4216,  21.5463, -16.3680],
        [-20.1687,  21.2602, -16.7481],
        [-20.3070,  21.6977, -16.6039],
        [-20.4184,  21.6551, -16.7941]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.9125,  16.7481,  18.0307, -20.7195,  21.7855, -16.8762],
        [ 21.8

epoch:1, loss:-3.8268468379974365
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7022, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.4342, 17.2778, 18.4588],
        [21.5881, 16.6064, 17.4904],
        [22.3330, 16.8661, 18.1641],
        [22.1225, 17.1994, 17.4176],
        [22.1558, 16.7025, 18.0752],
        [21.9993, 17.0828, 18.1540]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(27.4886, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.4744,  21.4568, -16.7194],
        [-20.4204,  21.8993, -16.9717],
        [-20.3267,  21.7546, -17.0259],
        [-20.3429,  22.1429, -16.9726],
        [-20.3662,  21.7586, -17.1646],
        [-20.2629,  21.4972, -17.0002]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.4342,  17.2778,  18.4588, -20.4744,  21.4568, -16.7194],
        [ 21.

epoch:1, loss:-3.8406624794006348
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0197, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.5337, 17.0919, 18.1823],
        [22.4549, 16.8409, 17.9307],
        [22.1682, 16.8371, 17.7879],
        [22.0149, 16.8402, 17.9948],
        [21.5687, 16.1324, 17.4716],
        [21.6584, 16.7621, 17.6505]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.1641, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.2205,  22.0042, -16.8717],
        [-20.4386,  21.9188, -17.6161],
        [-20.6465,  21.7783, -16.8587],
        [-20.5942,  21.8376, -17.0960],
        [-20.2630,  21.7278, -17.0763],
        [-20.0316,  21.5144, -16.6717]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.5337,  17.0919,  18.1823, -20.2205,  22.0042, -16.8717],
        [ 22.

epoch:1, loss:-3.829265832901001
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7124, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.2121, 17.2337, 18.3495],
        [22.2703, 16.9845, 18.1211],
        [22.0589, 16.5313, 18.0612],
        [22.3784, 16.8037, 18.0452],
        [22.0420, 16.6328, 18.1365],
        [22.1394, 16.7854, 17.4535]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.0968, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.7170,  21.8345, -17.2748],
        [-20.4021,  21.8030, -16.4568],
        [-20.4186,  21.8579, -17.2456],
        [-20.0145,  21.5960, -16.8939],
        [-20.0638,  21.4967, -16.6370],
        [-20.2802,  21.6574, -16.9192]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.2121,  17.2337,  18.3495, -20.7170,  21.8345, -17.2748],
        [ 22.2

epoch:1, loss:-3.8532912731170654
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5416, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.4704, 17.3218, 18.1223],
        [22.4116, 16.8363, 17.9655],
        [22.6615, 17.3305, 18.5010],
        [22.2103, 17.1491, 17.8876],
        [22.1256, 17.0850, 18.3323],
        [21.3341, 16.5737, 17.8595]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.2165, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.5043,  21.6268, -16.9269],
        [-20.7204,  21.6040, -16.7949],
        [-20.5095,  21.8663, -17.3261],
        [-20.5561,  21.6366, -16.9820],
        [-20.3852,  21.7281, -17.1789],
        [-20.6468,  21.8397, -17.2721]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.4704,  17.3218,  18.1223, -20.5043,  21.6268, -16.9269],
        [ 22.

epoch:1, loss:-3.8923745155334473
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8647, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[21.9959, 16.9195, 17.7819],
        [21.7787, 16.7321, 17.7438],
        [22.0014, 17.2496, 18.0256],
        [22.1827, 16.8324, 17.9035],
        [22.2584, 17.0065, 18.2884],
        [21.9082, 16.6412, 18.1383]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(41.4721, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.6212,  22.1393, -17.3296],
        [-20.8298,  21.9668, -17.1534],
        [-20.6081,  22.3204, -17.1387],
        [-20.5929,  21.8252, -17.1583],
        [-20.7110,  21.6305, -17.0927],
        [-20.3795,  22.1178, -17.6158]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 21.9959,  16.9195,  17.7819, -20.6212,  22.1393, -17.3296],
        [ 21.

epoch:1, loss:-3.913505792617798
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3064, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.1470, 17.0352, 18.0458],
        [22.7819, 17.5077, 18.2378],
        [22.5632, 17.1612, 18.5086],
        [21.9783, 17.0303, 17.7136],
        [22.7607, 16.8987, 18.4490],
        [21.4351, 16.3696, 17.6683]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(34.8930, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.7408,  21.7886, -17.1006],
        [-20.3497,  21.8320, -17.3393],
        [-20.5313,  22.0923, -17.4126],
        [-20.5753,  22.1066, -17.0697],
        [-20.5928,  21.8093, -17.4520],
        [-20.4750,  21.8412, -17.2796]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.1470,  17.0352,  18.0458, -20.7408,  21.7886, -17.1006],
        [ 22.7

epoch:1, loss:-3.9142279624938965
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6710, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.4562, 17.0564, 17.9221],
        [22.2569, 17.2941, 18.4238],
        [22.4698, 17.4452, 18.4900],
        [21.8040, 17.2505, 17.8919],
        [22.3285, 17.5974, 18.6300],
        [22.4095, 17.2643, 18.0572]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.1904, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.2022,  21.8754, -17.0343],
        [-21.0054,  22.7270, -17.5438],
        [-20.9782,  22.3492, -17.7077],
        [-20.8082,  22.2461, -17.2433],
        [-20.9015,  21.8560, -17.0496],
        [-20.7390,  21.9598, -16.9926]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.4562,  17.0564,  17.9221, -20.2022,  21.8754, -17.0343],
        [ 22.

epoch:1, loss:-3.957214832305908
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7443, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.6914, 17.1671, 18.3460],
        [22.2611, 17.2705, 17.8016],
        [22.9540, 17.2027, 18.3526],
        [22.4225, 17.2496, 18.1908],
        [21.7096, 16.8217, 17.9993],
        [22.1818, 17.0830, 17.9736]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(27.6545, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.8181,  22.2216, -17.2372],
        [-21.4574,  22.6808, -17.9832],
        [-20.5986,  22.1871, -17.1093],
        [-21.4436,  22.4059, -18.0749],
        [-20.8871,  22.1144, -17.6246],
        [-20.1787,  21.5638, -16.9323]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.6914,  17.1671,  18.3460, -20.8181,  22.2216, -17.2372],
        [ 22.2

epoch:1, loss:-4.009000301361084
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4829, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.6050, 17.4389, 18.5936],
        [22.4155, 17.3387, 18.3745],
        [22.8977, 16.9496, 18.2586],
        [22.3808, 17.4816, 18.6827],
        [22.7143, 17.0445, 18.1793],
        [22.9707, 17.5487, 18.6373]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(20.9392, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.2417,  22.3550, -17.5806],
        [-20.6270,  22.1922, -17.1104],
        [-20.9734,  22.2557, -17.6437],
        [-20.9994,  22.0058, -17.1241],
        [-20.7889,  21.6516, -17.0507],
        [-20.6675,  22.1931, -17.4049]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.6050,  17.4389,  18.5936, -21.2417,  22.3550, -17.5806],
        [ 22.4

epoch:1, loss:-3.970778465270996
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6407, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.0925, 17.6230, 18.9577],
        [22.5633, 17.2525, 18.6686],
        [22.6778, 17.5091, 18.6331],
        [22.6331, 17.0497, 18.2622],
        [22.3271, 17.5066, 18.5267],
        [22.6106, 17.2703, 18.3471]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.6028, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.8297,  21.9749, -17.5015],
        [-20.9185,  22.1242, -17.6908],
        [-20.9398,  22.3006, -17.5550],
        [-20.9448,  22.3687, -17.3085],
        [-21.0309,  21.9239, -17.6105],
        [-21.2316,  22.4493, -17.7346]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.0925,  17.6230,  18.9577, -20.8297,  21.9749, -17.5015],
        [ 22.5

epoch:1, loss:-4.0129570960998535
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0225, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.6306, 17.7130, 18.3845],
        [22.5393, 17.2048, 18.1126],
        [22.1773, 17.0093, 18.4335],
        [22.0967, 17.5257, 18.2710],
        [22.1601, 16.6673, 18.2124],
        [22.4941, 17.1893, 17.7192]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.8928, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.0005,  22.2841, -17.3364],
        [-20.4794,  21.7325, -17.2710],
        [-20.7066,  22.1898, -17.7463],
        [-20.7117,  22.2659, -17.5310],
        [-21.0574,  22.2057, -17.4495],
        [-21.2322,  22.4684, -17.5348]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.6306,  17.7130,  18.3845, -21.0005,  22.2841, -17.3364],
        [ 22.

epoch:1, loss:-4.004147052764893
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5719, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.5235, 17.2291, 17.9918],
        [22.6031, 17.4152, 18.2814],
        [22.7254, 17.7815, 18.7496],
        [22.3174, 16.9915, 18.2590],
        [22.6177, 17.4218, 18.5077],
        [22.6112, 17.5000, 18.7105]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.9244, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.3070,  22.5815, -17.7246],
        [-21.2941,  22.4072, -17.3241],
        [-21.1343,  22.4945, -18.0220],
        [-20.3987,  22.0378, -17.3957],
        [-20.9768,  22.2160, -17.7521],
        [-21.0460,  22.4744, -17.8715]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.5235,  17.2291,  17.9918, -21.3070,  22.5815, -17.7246],
        [ 22.6

epoch:1, loss:-4.0481486320495605
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7432, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.2683, 17.3424, 18.5646],
        [22.5639, 17.4333, 18.3614],
        [23.3157, 17.9319, 18.8566],
        [22.6657, 17.5187, 18.8842],
        [22.5873, 17.5779, 18.9323],
        [22.4806, 17.1593, 18.3695]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.3759, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.0082,  22.3792, -17.8992],
        [-21.1131,  22.6795, -17.5484],
        [-21.0907,  22.6388, -17.9512],
        [-21.3872,  22.5216, -17.9900],
        [-21.3487,  22.7232, -17.5397],
        [-21.6255,  23.1050, -17.8966]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.2683,  17.3424,  18.5646, -21.0082,  22.3792, -17.8992],
        [ 22.

epoch:1, loss:-4.077288627624512
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9256, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.7001, 17.8893, 18.7522],
        [22.6658, 17.2427, 18.6633],
        [23.0499, 17.8303, 18.4532],
        [22.9686, 17.6732, 18.0531],
        [23.0861, 17.8003, 18.8845],
        [22.0769, 17.4796, 18.3381]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.5334, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.5034,  22.6603, -18.0027],
        [-21.3794,  22.4931, -18.1915],
        [-21.3135,  22.3372, -17.7699],
        [-21.1757,  22.4062, -17.6986],
        [-20.3798,  22.0174, -17.3406],
        [-21.0101,  22.1653, -17.7704]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.7001,  17.8893,  18.7522, -21.5034,  22.6603, -18.0027],
        [ 22.6

epoch:1, loss:-4.125563621520996
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.2302, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.4604, 17.8096, 18.6873],
        [23.2737, 18.0087, 18.7915],
        [22.4961, 17.3387, 17.9545],
        [22.3311, 17.4314, 18.5911],
        [22.4559, 17.4291, 18.4607],
        [22.3804, 17.3049, 18.4618]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.3644, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.8103,  22.0872, -17.0749],
        [-20.8671,  22.4220, -17.2138],
        [-21.5517,  23.0145, -18.3969],
        [-21.5740,  22.6656, -18.1427],
        [-21.4369,  22.5458, -17.7167],
        [-20.8515,  22.1952, -17.4778]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.4604,  17.8096,  18.6873, -20.8103,  22.0872, -17.0749],
        [ 23.2

epoch:1, loss:-4.103748798370361
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4023, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.8638, 17.6612, 18.6743],
        [23.0347, 17.6361, 19.0978],
        [22.5015, 17.2862, 18.3832],
        [22.5817, 17.6820, 18.7027],
        [22.8054, 18.1650, 19.1338],
        [22.7782, 17.8192, 19.1261]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.8451, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.2427,  22.7292, -18.2242],
        [-20.5395,  22.0503, -17.6827],
        [-21.2530,  22.5322, -17.8010],
        [-21.1683,  22.5243, -17.7538],
        [-21.1961,  22.6919, -17.8254],
        [-21.7264,  23.1265, -18.4617]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.8638,  17.6612,  18.6743, -21.2427,  22.7292, -18.2242],
        [ 23.0

epoch:1, loss:-4.087610244750977
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7617, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.5321, 17.2700, 18.4727],
        [23.0148, 17.9914, 18.6361],
        [23.0331, 17.7194, 18.8500],
        [22.8638, 18.1196, 18.7493],
        [23.0942, 17.9494, 18.8636],
        [22.9957, 17.8120, 18.6524]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(43.4640, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.2164,  22.3963, -17.7193],
        [-21.4159,  22.7977, -17.7062],
        [-21.2246,  22.6854, -17.9867],
        [-21.0292,  22.3443, -17.8816],
        [-21.6836,  22.8793, -18.3332],
        [-21.3934,  22.8657, -18.1824]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.5321,  17.2700,  18.4727, -21.2164,  22.3963, -17.7193],
        [ 23.0

epoch:1, loss:-4.154701232910156
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1750, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.7045, 17.5478, 18.7008],
        [23.0465, 17.8245, 19.0949],
        [23.4805, 18.0123, 19.0169],
        [23.1900, 18.1117, 19.0515],
        [22.5874, 17.2585, 18.5188],
        [22.8747, 18.0439, 18.6168]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.6337, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.7016,  22.3065, -17.6422],
        [-21.5966,  22.8978, -18.0311],
        [-22.0498,  23.0353, -18.4052],
        [-20.8458,  21.9342, -17.5713],
        [-20.8739,  22.2042, -17.9208],
        [-21.0511,  22.5558, -18.1149]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.7045,  17.5478,  18.7008, -20.7016,  22.3065, -17.6422],
        [ 23.0

epoch:1, loss:-4.221515655517578
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7087, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.5252, 18.0050, 19.2166],
        [23.1700, 17.7396, 19.0810],
        [22.5625, 17.5378, 18.9450],
        [23.0659, 17.9909, 19.0700],
        [22.7210, 17.5546, 18.4564],
        [22.7744, 17.6013, 18.5730]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(43.1048, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.3645,  22.4693, -17.9245],
        [-21.4049,  22.2489, -17.8572],
        [-21.5182,  22.4770, -17.7117],
        [-20.9155,  22.6512, -17.7544],
        [-20.7974,  22.4619, -18.0997],
        [-21.3899,  22.5975, -17.7225]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.5252,  18.0050,  19.2166, -21.3645,  22.4693, -17.9245],
        [ 23.1

epoch:1, loss:-4.179627895355225
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2979, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.6674, 17.6470, 18.9683],
        [22.6426, 17.8831, 18.9037],
        [22.9475, 17.9636, 18.9270],
        [23.2551, 18.0417, 18.9059],
        [22.7725, 17.9675, 19.0193],
        [23.0743, 18.2241, 19.1333]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(43.5891, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.9854,  22.4731, -17.9463],
        [-21.2626,  22.7632, -18.1690],
        [-20.9774,  22.6667, -17.7612],
        [-21.4181,  22.9560, -18.0395],
        [-21.8996,  23.2565, -18.1693],
        [-21.3234,  22.5920, -17.8321]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.6674,  17.6470,  18.9683, -20.9854,  22.4731, -17.9463],
        [ 22.6

epoch:1, loss:-4.20905876159668
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6244, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.8804, 17.9753, 18.8247],
        [23.2584, 18.1650, 19.0863],
        [23.3765, 18.1400, 19.2820],
        [22.6495, 18.2972, 19.2540],
        [23.1664, 18.0614, 18.8469],
        [22.6055, 17.7349, 18.6064]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.8923, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.9079,  23.3205, -18.9742],
        [-21.5941,  22.8820, -18.0881],
        [-21.6745,  22.6675, -17.8847],
        [-21.1546,  22.5891, -18.2890],
        [-21.0627,  22.7170, -18.4696],
        [-21.3069,  22.8192, -18.0101]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.8804,  17.9753,  18.8247, -21.9079,  23.3205, -18.9742],
        [ 23.25

epoch:1, loss:-4.217188835144043
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.3676, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.0810, 18.1491, 18.7586],
        [22.9875, 18.0124, 19.3618],
        [23.1940, 17.7914, 18.6004],
        [22.9616, 17.5009, 18.8100],
        [22.8673, 18.2617, 18.9708],
        [23.0300, 17.9011, 18.7101]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.8384, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-20.9789,  22.9595, -18.4402],
        [-21.0467,  23.0485, -17.8033],
        [-21.0858,  22.3831, -17.6665],
        [-21.6033,  22.8945, -18.0516],
        [-21.5843,  22.8228, -18.4222],
        [-21.8273,  23.2031, -18.5369]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.0810,  18.1491,  18.7586, -20.9789,  22.9595, -18.4402],
        [ 22.9

epoch:1, loss:-4.22088623046875
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5015, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.7882, 18.5418, 19.6535],
        [23.4692, 18.2653, 19.2565],
        [23.9157, 18.3702, 19.5318],
        [23.0121, 17.8569, 18.6529],
        [22.8885, 17.8501, 18.7955],
        [23.0405, 17.7420, 19.1511]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.8341, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.2595,  23.6897, -18.9066],
        [-21.4365,  22.8711, -18.2158],
        [-21.9607,  23.4082, -18.6778],
        [-21.7628,  22.8506, -17.8229],
        [-21.7052,  23.0028, -18.5658],
        [-21.5625,  22.9095, -18.2740]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.7882,  18.5418,  19.6535, -22.2595,  23.6897, -18.9066],
        [ 23.46

epoch:1, loss:-4.2027268409729
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7132, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.9406, 18.0435, 19.2403],
        [23.3982, 18.2037, 19.3331],
        [23.3389, 18.2479, 18.7798],
        [23.3736, 18.2840, 19.2264],
        [23.2918, 18.2366, 19.0797],
        [23.4055, 18.0688, 19.2817]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.7421, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.6449,  23.4431, -18.6232],
        [-21.8082,  22.9727, -18.4396],
        [-21.2727,  22.6794, -18.3101],
        [-21.7236,  23.2554, -18.6581],
        [-21.5765,  23.3102, -18.3930],
        [-22.2769,  23.5315, -19.0675]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.9406,  18.0435,  19.2403, -21.6449,  23.4431, -18.6232],
        [ 23.398

epoch:1, loss:-4.291661739349365
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3762, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[22.8662, 18.1086, 19.1241],
        [23.6432, 18.2432, 19.3566],
        [23.7520, 18.7284, 18.9927],
        [23.7547, 18.1522, 19.3370],
        [23.7611, 18.2527, 19.3997],
        [23.7632, 18.4418, 19.3998]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.5667, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.8376,  23.2770, -18.1732],
        [-21.6842,  23.2916, -18.6815],
        [-21.9610,  23.4733, -18.8079],
        [-21.2403,  22.3251, -17.9459],
        [-21.3682,  22.6068, -18.1944],
        [-22.2884,  23.5296, -18.5899]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 22.8662,  18.1086,  19.1241, -21.8376,  23.2770, -18.1732],
        [ 23.6

epoch:1, loss:-4.263555526733398
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5390, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.0570, 18.1420, 18.9526],
        [23.4892, 18.2868, 19.2790],
        [23.6097, 18.3649, 19.3924],
        [23.4534, 18.5927, 19.5633],
        [23.6445, 18.2898, 19.4024],
        [23.3649, 18.1976, 19.1863]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.1911, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.5421,  22.5379, -18.2651],
        [-21.6584,  22.6817, -18.2491],
        [-21.7547,  22.5547, -18.5422],
        [-21.9193,  23.2935, -18.3464],
        [-21.8913,  23.3133, -18.5080],
        [-21.6969,  22.9763, -18.4864]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.0570,  18.1420,  18.9526, -21.5421,  22.5379, -18.2651],
        [ 23.4

epoch:1, loss:-4.310044288635254
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.9286, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.6733, 18.2388, 19.1937],
        [22.7814, 17.6653, 18.7356],
        [23.4173, 18.2624, 18.9840],
        [22.9050, 17.9010, 18.9675],
        [23.2079, 18.3774, 19.6023],
        [22.9960, 18.0727, 18.7024]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(14.8047, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.3711,  23.5489, -19.0182],
        [-21.0101,  23.2603, -18.1602],
        [-21.6762,  22.9330, -18.4929],
        [-22.5342,  23.1944, -19.0526],
        [-21.6141,  22.6047, -18.0545],
        [-22.2285,  23.0515, -18.4066]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.6733,  18.2388,  19.1937, -22.3711,  23.5489, -19.0182],
        [ 22.7

epoch:1, loss:-4.387251377105713
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8428, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.1954, 18.2747, 18.9137],
        [23.1231, 18.1535, 19.1178],
        [23.1067, 17.9159, 19.0725],
        [23.5079, 18.1653, 19.1938],
        [23.0183, 18.1683, 19.1350],
        [23.3255, 18.3583, 19.0918]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.2426, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.9635,  23.4271, -18.2581],
        [-21.8895,  23.0994, -18.8043],
        [-21.5904,  23.1554, -18.5433],
        [-21.5957,  22.8075, -18.5934],
        [-21.5730,  23.1447, -18.4223],
        [-22.1880,  23.5796, -18.8523]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.1954,  18.2747,  18.9137, -21.9635,  23.4271, -18.2581],
        [ 23.1

epoch:1, loss:-4.3568034172058105
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.9415, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.9584, 18.4024, 19.2583],
        [23.7625, 18.8618, 19.7780],
        [23.5079, 18.2922, 18.9144],
        [23.1438, 18.1841, 19.7171],
        [24.0563, 18.6610, 19.6788],
        [23.3051, 18.2600, 19.1997]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.0803, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.3302,  23.3924, -19.0668],
        [-21.5184,  22.7638, -18.3777],
        [-22.0443,  23.0816, -18.9985],
        [-21.8810,  23.2084, -18.6325],
        [-21.5319,  23.0121, -18.6825],
        [-21.9127,  23.0903, -18.9098]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.9584,  18.4024,  19.2583, -22.3302,  23.3924, -19.0668],
        [ 23.

epoch:1, loss:-4.459785461425781
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.8297, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.5461, 18.5265, 19.8564],
        [22.7283, 18.2819, 19.0621],
        [23.6502, 18.6781, 19.7202],
        [23.2940, 17.8599, 19.1523],
        [23.5659, 18.6502, 19.7422],
        [23.7331, 18.5989, 19.7519]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.1419, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.3145,  23.5056, -18.7652],
        [-21.7082,  23.1642, -18.6021],
        [-21.9663,  23.3294, -18.5330],
        [-22.2569,  23.5829, -18.7748],
        [-21.9388,  23.0172, -18.6664],
        [-21.8940,  23.3688, -18.9168]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.5461,  18.5265,  19.8564, -22.3145,  23.5056, -18.7652],
        [ 22.7

epoch:1, loss:-4.398132801055908
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5005, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.7453, 18.1180, 19.1906],
        [23.8095, 18.5345, 19.4722],
        [23.9671, 18.4586, 19.8313],
        [23.3340, 18.2238, 19.3511],
        [23.8311, 18.7307, 19.6897],
        [23.4270, 18.5972, 19.4285]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.2302, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.1068,  23.2738, -18.3222],
        [-22.1337,  23.2700, -18.7870],
        [-22.2046,  23.0245, -18.5151],
        [-21.9500,  23.4514, -19.0301],
        [-22.0475,  23.3226, -18.8330],
        [-22.3182,  23.6524, -18.5556]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.7453,  18.1180,  19.1906, -22.1068,  23.2738, -18.3222],
        [ 23.8

epoch:1, loss:-4.470249176025391
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4578, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.1079, 18.8410, 19.6715],
        [23.9082, 18.7774, 19.6948],
        [23.6931, 18.5414, 19.4297],
        [23.2052, 18.1676, 18.8794],
        [23.9114, 18.6347, 19.7640],
        [23.3203, 18.1602, 18.9155]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(29.4378, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.1795,  23.1899, -18.6045],
        [-22.1153,  23.3538, -19.0509],
        [-22.7349,  24.2279, -19.4109],
        [-22.1046,  23.4441, -19.2019],
        [-21.8068,  22.6038, -18.7689],
        [-22.0223,  23.7019, -19.1992]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.1079,  18.8410,  19.6715, -22.1795,  23.1899, -18.6045],
        [ 23.9

epoch:1, loss:-4.453127384185791
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5624, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.1935, 18.9048, 20.0162],
        [23.5189, 18.7304, 19.4767],
        [24.1067, 19.1598, 19.8169],
        [23.6973, 18.6842, 19.5955],
        [23.9419, 18.4813, 19.1990],
        [23.5476, 18.7479, 19.3647]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(38.0161, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-21.7812,  23.3955, -18.6365],
        [-22.3636,  23.7891, -18.9877],
        [-21.9720,  23.7269, -19.0009],
        [-21.8416,  23.2291, -18.9671],
        [-22.4242,  23.5732, -19.0829],
        [-22.4565,  23.8055, -19.3038]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.1935,  18.9048,  20.0162, -21.7812,  23.3955, -18.6365],
        [ 23.5

epoch:1, loss:-4.53164005279541
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.1091, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.4809, 18.7338, 19.2847],
        [23.6580, 18.9505, 19.4832],
        [23.2700, 18.1550, 19.4497],
        [22.8555, 18.0476, 18.6494],
        [23.6337, 18.4774, 19.3044],
        [23.4969, 18.2129, 19.3589]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(45.4425, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.1815,  23.7084, -19.0849],
        [-22.1801,  23.2572, -19.0606],
        [-22.7163,  23.8753, -19.3834],
        [-22.1187,  23.7377, -18.8872],
        [-22.3526,  24.0126, -19.3611],
        [-22.1410,  23.4931, -18.8331]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.4809,  18.7338,  19.2847, -22.1815,  23.7084, -19.0849],
        [ 23.65

epoch:1, loss:-4.527385711669922
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1055, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.5855, 18.8401, 19.4277],
        [23.6944, 18.3563, 19.4279],
        [23.3473, 18.4534, 19.2995],
        [23.9848, 18.7225, 19.4094],
        [23.5187, 18.3567, 19.5102],
        [24.1104, 19.2631, 19.9209]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.7575, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.1592,  23.8935, -19.2230],
        [-22.3679,  23.4916, -19.4082],
        [-22.1830,  23.4355, -19.0468],
        [-22.3031,  23.6181, -18.8703],
        [-22.7493,  24.0453, -19.6343],
        [-22.3974,  23.6819, -19.1722]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.5855,  18.8401,  19.4277, -22.1592,  23.8935, -19.2230],
        [ 23.6

epoch:1, loss:-4.512554168701172
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6784, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.6381, 19.4243, 20.4060],
        [23.5617, 18.2284, 19.4376],
        [23.9917, 18.6895, 19.5383],
        [23.2913, 18.5861, 19.7555],
        [23.6507, 18.2538, 19.7562],
        [23.6270, 18.9415, 19.8794]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.2976, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.8735,  23.9733, -19.3724],
        [-23.1093,  24.5391, -19.7670],
        [-21.7332,  23.0920, -18.8770],
        [-22.2097,  23.9661, -19.0490],
        [-22.3745,  23.9097, -19.6771],
        [-22.4891,  23.5034, -19.1010]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.6381,  19.4243,  20.4060, -22.8735,  23.9733, -19.3724],
        [ 23.5

epoch:1, loss:-4.5062127113342285
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6459, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.2024, 18.9476, 20.2765],
        [23.3853, 18.7783, 19.5605],
        [23.7089, 18.6053, 20.0252],
        [23.8075, 19.0600, 20.0302],
        [24.0723, 18.9709, 19.8728],
        [23.4450, 18.4804, 19.4134]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(38.0949, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.5693,  24.1812, -19.4014],
        [-22.4112,  23.7515, -19.1261],
        [-23.2597,  24.5350, -20.1537],
        [-22.4411,  23.6096, -19.1635],
        [-21.8727,  23.6560, -19.1103],
        [-22.2632,  23.1625, -18.6484]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.2024,  18.9476,  20.2765, -22.5693,  24.1812, -19.4014],
        [ 23.

epoch:1, loss:-4.553940296173096
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4924, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.6628, 18.7816, 19.5838],
        [24.2714, 18.8811, 20.1361],
        [22.9180, 18.1882, 18.9538],
        [23.8424, 19.1642, 19.7958],
        [23.8267, 19.0954, 19.3735],
        [24.0531, 18.9373, 19.9505]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(15.3322, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.1100,  23.5541, -19.1774],
        [-22.3417,  23.6871, -19.1090],
        [-22.2030,  22.9815, -18.9780],
        [-22.0157,  23.7553, -19.1999],
        [-22.6418,  23.6875, -18.9964],
        [-22.7019,  23.4219, -19.1955]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.6628,  18.7816,  19.5838, -22.1100,  23.5541, -19.1774],
        [ 24.2

epoch:1, loss:-4.657135009765625
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3605, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.6402, 18.6818, 19.8560],
        [24.0039, 18.8209, 19.4255],
        [23.9406, 19.2353, 20.0131],
        [24.1381, 19.1137, 20.1624],
        [23.6942, 18.8814, 19.8885],
        [24.0355, 19.0362, 19.8475]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.0520, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.2430,  23.7573, -19.5529],
        [-22.9557,  24.3470, -19.4109],
        [-22.6569,  23.9211, -19.3570],
        [-22.4205,  23.8439, -19.5912],
        [-22.4453,  24.1735, -19.1278],
        [-22.8564,  23.7346, -19.2628]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.6402,  18.6818,  19.8560, -22.2430,  23.7573, -19.5529],
        [ 24.0

epoch:1, loss:-4.6255388259887695
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4281, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.9802, 18.8928, 19.8539],
        [23.7894, 18.8268, 19.9009],
        [23.5719, 18.8200, 19.9085],
        [24.3601, 19.1942, 20.2161],
        [24.3343, 18.9533, 19.9137],
        [23.9370, 19.2246, 19.5223]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.5239, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.4495,  23.7031, -19.7784],
        [-22.8121,  24.0278, -19.8833],
        [-22.0855,  23.0896, -19.0009],
        [-22.4062,  23.5679, -19.2457],
        [-22.6495,  23.9229, -19.3673],
        [-23.2987,  24.3478, -19.6029]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.9802,  18.8928,  19.8539, -22.4495,  23.7031, -19.7784],
        [ 23.

epoch:1, loss:-4.634825229644775
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(4.6059, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.8856, 18.5373, 19.3790],
        [23.6297, 18.6958, 19.7479],
        [24.4062, 19.7928, 20.4717],
        [24.1098, 19.2325, 20.0327],
        [24.6673, 19.6014, 20.3144],
        [24.0796, 19.2314, 20.0429]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(30.8237, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.6870,  23.9118, -19.7102],
        [-22.1084,  23.2822, -19.0080],
        [-22.4397,  24.2586, -19.0160],
        [-22.7517,  23.6334, -19.5121],
        [-22.5776,  23.8907, -19.5526],
        [-22.5771,  23.8829, -19.3078]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.8856,  18.5373,  19.3790, -22.6870,  23.9118, -19.7102],
        [ 23.6

epoch:1, loss:-4.665884494781494
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6194, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[23.7313, 18.9715, 20.1693],
        [24.4457, 19.5452, 20.1660],
        [23.9875, 19.0080, 20.0239],
        [23.5638, 18.7678, 19.7408],
        [24.0268, 19.0836, 19.9818],
        [24.1157, 19.0424, 20.0796]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(38.8440, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.2421,  23.7650, -19.3452],
        [-23.2863,  24.5369, -20.1851],
        [-23.2590,  24.4482, -19.6206],
        [-22.2834,  23.5424, -19.0019],
        [-22.8716,  23.7654, -19.2155],
        [-22.8576,  24.0292, -19.6102]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 23.7313,  18.9715,  20.1693, -22.2421,  23.7650, -19.3452],
        [ 24.4

epoch:1, loss:-4.613236904144287
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2127, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.1758, 19.3695, 20.2402],
        [24.5831, 19.4008, 20.6865],
        [24.0178, 19.2389, 19.8392],
        [24.1203, 19.0792, 20.1344],
        [23.1998, 18.7683, 19.4540],
        [23.7846, 18.9243, 20.1379]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.3052, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.0505,  23.4154, -19.1001],
        [-22.8568,  24.0118, -19.9638],
        [-22.8699,  24.2951, -19.8228],
        [-23.0030,  23.9210, -19.6703],
        [-22.8101,  23.8703, -19.2277],
        [-22.7518,  24.2228, -19.4523]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.1758,  19.3695,  20.2402, -22.0505,  23.4154, -19.1001],
        [ 24.5

epoch:1, loss:-4.699347972869873
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.8069, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.4244, 19.3556, 20.3927],
        [24.4328, 19.4274, 20.1353],
        [24.5991, 19.5037, 20.2713],
        [23.8390, 19.2173, 19.7996],
        [23.9492, 19.1619, 20.3618],
        [24.6142, 19.3918, 20.3987]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(38.1753, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.6538,  24.1233, -19.6632],
        [-22.9004,  24.3461, -20.1556],
        [-22.7490,  23.9335, -19.5623],
        [-22.8884,  24.0705, -19.6438],
        [-23.0128,  24.1091, -19.8715],
        [-23.2497,  24.4747, -19.7995]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.4244,  19.3556,  20.3927, -22.6538,  24.1233, -19.6632],
        [ 24.4

epoch:1, loss:-4.6890788078308105
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5435, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.3466, 19.4681, 20.5464],
        [24.4040, 19.2413, 20.0396],
        [24.4750, 19.2737, 20.2871],
        [24.3088, 19.3783, 20.1896],
        [24.3468, 19.2526, 20.4602],
        [24.3543, 19.2100, 20.3982]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.1857, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.3243,  24.0410, -19.3106],
        [-22.5808,  23.6508, -19.2446],
        [-23.5388,  24.5277, -19.8192],
        [-23.0300,  24.2483, -20.0368],
        [-22.9734,  24.1995, -19.4177],
        [-23.1264,  23.9772, -19.8783]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.3466,  19.4681,  20.5464, -22.3243,  24.0410, -19.3106],
        [ 24.

epoch:1, loss:-4.757044315338135
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5104, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.1890, 19.1044, 20.7384],
        [24.4493, 19.3349, 20.3497],
        [24.3260, 19.4931, 19.9340],
        [23.8439, 19.1534, 20.0291],
        [24.1324, 19.3248, 19.8199],
        [24.3378, 19.1420, 19.7350]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(38.1139, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.2753,  24.4591, -20.2300],
        [-22.7939,  24.1950, -19.5250],
        [-22.8075,  24.1331, -19.7170],
        [-22.6071,  24.4106, -19.8699],
        [-22.8060,  24.3290, -20.1543],
        [-22.3550,  23.6352, -19.0998]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.1890,  19.1044,  20.7384, -23.2753,  24.4591, -20.2300],
        [ 24.4

epoch:1, loss:-4.709874153137207
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3781, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.5611, 19.5634, 20.3057],
        [23.7730, 18.7663, 19.9921],
        [24.9941, 19.6674, 21.3908],
        [24.5064, 19.5207, 20.3856],
        [24.3000, 19.6401, 20.2543],
        [24.0710, 19.3250, 20.3486]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.4074, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.3757,  24.5670, -20.0715],
        [-23.1832,  24.3463, -19.6521],
        [-23.0780,  24.4099, -20.1428],
        [-23.4969,  24.6564, -20.2423],
        [-22.7470,  23.9983, -19.7437],
        [-22.6152,  24.1235, -19.8006]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.5611,  19.5634,  20.3057, -23.3757,  24.5670, -20.0715],
        [ 23.7

epoch:1, loss:-4.7470316886901855
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5225, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.2521, 19.7378, 20.4553],
        [23.9056, 19.0053, 20.1494],
        [24.1634, 19.6643, 20.3447],
        [24.2838, 19.3444, 20.2810],
        [24.8483, 19.8803, 20.8404],
        [24.8642, 19.6829, 20.5368]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(38.7599, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.6292,  24.2739, -19.6989],
        [-23.0336,  24.2617, -19.8574],
        [-22.7389,  24.0591, -19.7924],
        [-22.6238,  24.2751, -19.8753],
        [-22.5900,  24.5089, -20.0258],
        [-22.6301,  24.2975, -19.7298]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.2521,  19.7378,  20.4553, -22.6292,  24.2739, -19.6989],
        [ 23.

epoch:1, loss:-4.761230945587158
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5578, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.7334, 19.7708, 20.5714],
        [24.9059, 20.0250, 20.9000],
        [24.8862, 19.4091, 20.7295],
        [24.9454, 19.8363, 20.6939],
        [24.2579, 19.3399, 20.3773],
        [25.5047, 20.1198, 21.1786]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.7606, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.1355,  24.4103, -20.1756],
        [-23.5130,  24.7521, -19.8913],
        [-23.4341,  24.7317, -20.2738],
        [-23.3851,  24.8037, -20.2090],
        [-23.2226,  24.1588, -19.9328],
        [-22.7193,  24.4864, -19.8290]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.7334,  19.7708,  20.5714, -23.1355,  24.4103, -20.1756],
        [ 24.9

epoch:1, loss:-4.812996864318848
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5886, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.8657, 19.8485, 20.8723],
        [24.6877, 19.7835, 20.5191],
        [24.8811, 19.7081, 20.7589],
        [24.2963, 19.6697, 20.2889],
        [24.4164, 19.5820, 20.4040],
        [24.8753, 19.6055, 20.4794]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(38.5096, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.7578,  24.3060, -19.9824],
        [-23.0897,  24.5448, -20.4038],
        [-23.2915,  24.9062, -20.0767],
        [-23.2933,  24.3002, -19.7466],
        [-22.9415,  23.6007, -19.5073],
        [-23.3815,  24.3327, -20.5767]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.8657,  19.8485,  20.8723, -22.7578,  24.3060, -19.9824],
        [ 24.6

epoch:1, loss:-4.865992546081543
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8329, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.1927, 19.3334, 20.2810],
        [24.7371, 19.7256, 20.7361],
        [24.8953, 19.8764, 20.3557],
        [24.1685, 19.3811, 20.1365],
        [24.4972, 19.5464, 20.3000],
        [24.6311, 19.7366, 20.3833]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(31.3538, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-22.9387,  24.2831, -20.0122],
        [-23.6192,  24.5952, -20.1685],
        [-22.9163,  24.0261, -19.6168],
        [-23.8414,  24.9216, -20.2901],
        [-23.1197,  24.4565, -19.9796],
        [-23.4254,  24.5559, -20.3422]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.1927,  19.3334,  20.2810, -22.9387,  24.2831, -20.0122],
        [ 24.7

epoch:1, loss:-4.933612823486328
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.1016, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.6855, 19.8912, 20.2591],
        [24.2755, 19.5673, 20.8761],
        [24.3128, 19.3540, 20.2794],
        [24.3863, 19.2833, 20.1013],
        [25.0250, 19.6024, 20.6451],
        [24.6171, 19.7826, 21.0765]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.1849, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.5044,  24.6151, -20.4028],
        [-23.5104,  24.2950, -20.0905],
        [-23.4361,  24.5486, -20.2791],
        [-23.6387,  24.9175, -20.4583],
        [-23.2358,  24.3437, -19.6249],
        [-23.5994,  24.9687, -20.6485]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.6855,  19.8912,  20.2591, -23.5044,  24.6151, -20.4028],
        [ 24.2

epoch:1, loss:-4.872290134429932
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5347, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.5550, 19.5006, 20.6553],
        [24.8256, 19.6830, 20.1937],
        [24.8263, 19.9302, 20.6994],
        [24.6524, 19.5256, 20.5987],
        [25.3419, 20.3235, 21.0665],
        [24.7667, 19.3333, 20.7849]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(38.5637, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.6899,  25.0089, -20.4595],
        [-23.3256,  24.6693, -20.0858],
        [-23.4154,  25.2361, -20.3266],
        [-23.2386,  24.5985, -20.1335],
        [-22.9019,  24.4061, -19.9358],
        [-23.4284,  24.5956, -20.7790]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.5550,  19.5006,  20.6553, -23.6899,  25.0089, -20.4595],
        [ 24.8

epoch:1, loss:-4.976897716522217
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6964, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.5074, 19.0552, 20.4829],
        [24.7652, 19.7978, 20.5329],
        [24.6394, 20.0195, 20.6171],
        [24.7392, 19.5770, 20.5945],
        [25.0156, 19.5096, 20.8305],
        [24.1345, 19.6066, 20.3638]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.5789, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.2782,  24.3843, -20.0039],
        [-23.8053,  24.7940, -20.3970],
        [-23.6522,  24.6732, -20.2140],
        [-23.1434,  24.5924, -20.2683],
        [-23.4987,  25.0707, -20.8273],
        [-23.1989,  24.8144, -19.7855]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.5074,  19.0552,  20.4829, -23.2782,  24.3843, -20.0039],
        [ 24.7

epoch:1, loss:-4.98072624206543
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5318, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.8489, 20.0639, 20.7029],
        [24.7728, 20.1162, 20.4400],
        [25.0466, 19.8524, 20.6324],
        [25.4137, 20.0666, 20.7940],
        [24.5925, 19.8813, 21.0726],
        [24.5905, 19.8886, 20.7848]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.7724, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.8767,  24.8823, -20.7051],
        [-23.7092,  25.0694, -20.4998],
        [-23.1252,  24.7620, -20.6015],
        [-23.3798,  24.5039, -20.3234],
        [-23.7682,  24.7878, -20.5681],
        [-23.4753,  25.0599, -20.5162]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.8489,  20.0639,  20.7029, -23.8767,  24.8823, -20.7051],
        [ 24.77

epoch:1, loss:-4.99041223526001
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5540, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.1388, 20.2036, 20.8848],
        [24.6573, 20.2088, 20.8953],
        [25.0796, 19.8595, 20.7927],
        [25.2136, 20.1592, 20.7044],
        [25.2775, 20.3499, 20.9951],
        [24.9273, 20.0911, 21.1385]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(31.2616, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.2903,  24.6700, -20.1676],
        [-23.9495,  24.9341, -20.7824],
        [-23.9321,  25.0128, -20.6516],
        [-23.5732,  24.7633, -20.5290],
        [-23.7773,  24.7028, -20.2448],
        [-23.0175,  24.7090, -19.8626]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.1388,  20.2036,  20.8848, -23.2903,  24.6700, -20.1676],
        [ 24.65

epoch:1, loss:-5.073183059692383
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.0903, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.6620, 19.8531, 20.8791],
        [24.6461, 19.8565, 20.7742],
        [24.4693, 19.8281, 21.1683],
        [25.3588, 20.2462, 20.7157],
        [24.3845, 19.4903, 20.5711],
        [24.6004, 19.6397, 20.7284]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(23.8845, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.1450,  24.2991, -20.0072],
        [-22.5522,  23.9322, -19.8302],
        [-23.8410,  25.0257, -20.6424],
        [-24.2322,  25.3674, -20.5174],
        [-23.6224,  25.0918, -20.6053],
        [-22.9678,  24.1808, -20.0588]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.6620,  19.8531,  20.8791, -23.1450,  24.2991, -20.0072],
        [ 24.6

epoch:1, loss:-5.037075519561768
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2396, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.9663, 20.3720, 21.1874],
        [25.1097, 20.0514, 20.6864],
        [25.2581, 20.5262, 20.9184],
        [24.7972, 20.0700, 20.9513],
        [24.7827, 19.8381, 20.8923],
        [24.8838, 19.7227, 20.6373]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(39.6807, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.6605,  24.9896, -20.4039],
        [-23.1095,  24.8226, -20.3702],
        [-23.6621,  24.9121, -20.3407],
        [-23.8158,  25.2000, -20.6841],
        [-23.6069,  24.9350, -20.0331],
        [-23.2297,  24.4775, -20.2559]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.9663,  20.3720,  21.1874, -23.6605,  24.9896, -20.4039],
        [ 25.1

epoch:1, loss:-5.022544860839844
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7887, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.6066, 19.6105, 20.7277],
        [24.9272, 20.3202, 21.1542],
        [24.4429, 20.2379, 20.9380],
        [24.6609, 20.0982, 20.5945],
        [25.0256, 19.8965, 20.7507],
        [25.5202, 20.4469, 21.1815]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(40.2823, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.0690,  25.5186, -21.0508],
        [-23.2454,  24.5222, -20.3573],
        [-23.1405,  24.5688, -20.2172],
        [-22.8489,  24.5162, -20.3565],
        [-23.8205,  25.2009, -20.9379],
        [-23.8215,  25.4428, -20.7930]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.6066,  19.6105,  20.7277, -24.0690,  25.5186, -21.0508],
        [ 24.9

epoch:1, loss:-5.08638334274292
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3676, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.4905, 20.0784, 21.3897],
        [24.9919, 20.1407, 20.8042],
        [24.5158, 19.3232, 20.5481],
        [25.4986, 20.2051, 21.4058],
        [25.0243, 20.4182, 21.2847],
        [24.5951, 20.2032, 20.5855]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(40.5515, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.3615,  24.2373, -20.1123],
        [-23.3770,  24.6427, -20.4425],
        [-23.9461,  25.3826, -20.7329],
        [-23.6109,  24.5067, -20.4122],
        [-23.7550,  24.8907, -20.8062],
        [-24.0443,  25.5736, -21.1423]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.4905,  20.0784,  21.3897, -23.3615,  24.2373, -20.1123],
        [ 24.99

epoch:1, loss:-5.000710487365723
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6605, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.2097, 20.2532, 21.1490],
        [25.1111, 20.3308, 20.7715],
        [25.7170, 20.6990, 21.5827],
        [25.2976, 20.2662, 21.0392],
        [24.9117, 20.1452, 21.0870],
        [25.5998, 20.7021, 21.2537]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(48.5913, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.2739,  25.6212, -21.3280],
        [-23.6945,  25.0530, -20.4710],
        [-23.4336,  24.8301, -20.8774],
        [-23.5892,  25.0536, -20.7026],
        [-23.2955,  24.8610, -20.5274],
        [-23.2548,  24.5878, -20.0198]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.2097,  20.2532,  21.1490, -24.2739,  25.6212, -21.3280],
        [ 25.1

epoch:1, loss:-5.1151556968688965
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0230, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.7979, 20.6907, 21.0155],
        [25.2318, 20.2339, 21.3175],
        [25.2493, 20.3620, 21.4206],
        [25.0279, 20.0852, 21.0989],
        [25.5935, 20.5479, 21.3761],
        [25.8335, 20.8785, 21.5680]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(40.2799, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.5486,  25.2951, -20.8756],
        [-23.9547,  25.3606, -20.8431],
        [-23.8110,  25.1526, -20.3753],
        [-24.0594,  24.6466, -20.6022],
        [-24.1520,  25.3603, -21.1580],
        [-23.7767,  24.8712, -20.3476]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.7979,  20.6907,  21.0155, -23.5486,  25.2951, -20.8756],
        [ 25.

epoch:1, loss:-5.178369998931885
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5286, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.9927, 20.1335, 21.1135],
        [25.5123, 20.6585, 21.3272],
        [25.0033, 20.1752, 20.7235],
        [24.7121, 20.0531, 21.0919],
        [25.3737, 20.5343, 21.3455],
        [25.7722, 20.9974, 21.6835]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(40.4240, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.4570,  24.7294, -20.3056],
        [-23.7938,  25.0483, -20.7183],
        [-24.4861,  25.7004, -21.2655],
        [-24.2661,  25.3697, -21.1735],
        [-23.7753,  24.9445, -20.9192],
        [-23.5592,  25.2853, -20.8422]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.9927,  20.1335,  21.1135, -23.4570,  24.7294, -20.3056],
        [ 25.5

epoch:1, loss:-5.187885284423828
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6268, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.2861, 20.6909, 21.9060],
        [24.8657, 20.2475, 21.1539],
        [25.7238, 20.8414, 21.6655],
        [24.7432, 20.2182, 20.7353],
        [25.3557, 20.2477, 21.3745],
        [25.2466, 20.1862, 21.0468]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(40.7912, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.0625,  24.8596, -20.6708],
        [-23.4325,  24.8079, -20.5179],
        [-24.1309,  25.2192, -21.0129],
        [-23.9673,  25.5792, -21.0879],
        [-23.9077,  25.2636, -21.0319],
        [-23.3317,  24.4254, -20.2884]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.2861,  20.6909,  21.9060, -24.0625,  24.8596, -20.6708],
        [ 24.8

epoch:1, loss:-5.136941909790039
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5455, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.0496, 20.3212, 21.1001],
        [25.2778, 20.7709, 21.4015],
        [25.2696, 20.2058, 21.1606],
        [25.6289, 20.9028, 21.8922],
        [25.2473, 20.6280, 21.2359],
        [25.7038, 20.4628, 21.6584]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(31.8064, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.1769,  25.1961, -20.8226],
        [-23.9481,  25.7108, -21.0835],
        [-24.0998,  25.6438, -20.9479],
        [-23.7534,  25.4212, -20.8551],
        [-23.1383,  24.6817, -20.4767],
        [-23.6657,  25.1339, -21.0667]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.0496,  20.3212,  21.1001, -24.1769,  25.1961, -20.8226],
        [ 25.2

epoch:1, loss:-5.222722053527832
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2140, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.7928, 20.2318, 20.8202],
        [24.8183, 20.4203, 21.1322],
        [25.4366, 20.4314, 21.6192],
        [25.3126, 20.6265, 21.6730],
        [25.3315, 20.5957, 21.6152],
        [25.4217, 20.5734, 21.4976]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(40.7503, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.2528,  25.4212, -21.4272],
        [-24.0594,  25.1820, -20.9031],
        [-23.4784,  25.3897, -21.0780],
        [-23.9494,  25.7498, -21.0485],
        [-24.1421,  25.7028, -20.9769],
        [-24.5321,  25.5613, -21.0557]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.7928,  20.2318,  20.8202, -24.2528,  25.4212, -21.4272],
        [ 24.8

epoch:1, loss:-5.203471660614014
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5075, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[24.4820, 19.9397, 21.0921],
        [25.1354, 19.9369, 21.5084],
        [25.1182, 19.9861, 20.9685],
        [25.3010, 20.9063, 21.3781],
        [25.6899, 20.1784, 21.1268],
        [25.4266, 20.7827, 21.5190]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(32.6376, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.4075,  25.2653, -21.2258],
        [-24.0986,  25.1401, -21.0831],
        [-23.9413,  25.4445, -20.9299],
        [-24.4490,  25.7042, -21.2713],
        [-24.6682,  25.8781, -21.5561],
        [-24.1302,  25.3497, -20.8990]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 24.4820,  19.9397,  21.0921, -24.4075,  25.2653, -21.2258],
        [ 25.1

epoch:1, loss:-5.256747245788574
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8471, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.9995, 20.6362, 21.5996],
        [25.1442, 20.1889, 21.0154],
        [25.5740, 20.5604, 21.1209],
        [25.7398, 20.6198, 21.6050],
        [25.7126, 20.4637, 21.6908],
        [25.3885, 20.5985, 21.3143]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.4925, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.3170,  25.5357, -21.0910],
        [-24.3523,  25.6227, -21.3441],
        [-24.1448,  25.4716, -21.1474],
        [-24.3995,  25.7362, -21.5085],
        [-24.6570,  25.5411, -21.4025],
        [-23.9900,  25.2941, -21.0833]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.9995,  20.6362,  21.5996, -24.3170,  25.5357, -21.0910],
        [ 25.1

epoch:1, loss:-5.326473712921143
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.1275, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.8980, 20.6955, 21.5888],
        [25.2300, 20.4041, 21.4358],
        [25.6656, 20.7468, 21.9278],
        [25.9625, 20.9580, 21.8218],
        [25.8910, 20.7661, 21.8958],
        [25.3005, 20.6100, 21.4570]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(33.1351, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.8175,  25.0747, -20.9731],
        [-24.3354,  25.8544, -21.5750],
        [-24.7782,  26.2145, -21.8772],
        [-24.7015,  26.0725, -22.0967],
        [-24.0843,  25.3001, -20.9733],
        [-23.8642,  24.8720, -20.9512]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.8980,  20.6955,  21.5888, -23.8175,  25.0747, -20.9731],
        [ 25.2

epoch:1, loss:-5.3221611976623535
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5496, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.4183, 20.8255, 21.4475],
        [25.4938, 20.8407, 21.2759],
        [24.3241, 19.8114, 20.8583],
        [25.6856, 20.6454, 21.6162],
        [25.2847, 20.6661, 21.7150],
        [25.1958, 20.4403, 21.1179]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(48.9605, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.7879,  26.0338, -21.6088],
        [-24.1698,  25.3788, -21.4321],
        [-23.9437,  25.3371, -20.9260],
        [-24.5597,  25.8613, -21.3381],
        [-24.5149,  25.5831, -21.2346],
        [-24.4660,  25.5279, -20.9198]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.4183,  20.8255,  21.4475, -24.7879,  26.0338, -21.6088],
        [ 25.

epoch:1, loss:-5.344089508056641
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.2446, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.2886, 20.0359, 21.1588],
        [25.1136, 20.2472, 21.0369],
        [25.9095, 20.8404, 22.1321],
        [25.0647, 20.4748, 21.2409],
        [25.7187, 20.2967, 21.4344],
        [25.9320, 21.1836, 21.8550]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(41.1935, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.2468,  26.0235, -21.6075],
        [-24.2673,  25.7263, -21.4581],
        [-23.5845,  24.7844, -20.7556],
        [-24.5668,  25.8870, -21.7190],
        [-24.9048,  25.8861, -21.9349],
        [-24.4765,  26.2267, -21.5108]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.2886,  20.0359,  21.1588, -24.2468,  26.0235, -21.6075],
        [ 25.1

epoch:1, loss:-5.356222152709961
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8573, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.7031, 20.8083, 21.7983],
        [25.9117, 21.0202, 21.7852],
        [25.0556, 20.5672, 21.1643],
        [25.8097, 20.7347, 21.9918],
        [26.2566, 21.2655, 22.1985],
        [25.7070, 20.4666, 21.4398]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.1435, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.5865,  25.9872, -21.5452],
        [-25.3588,  26.7299, -22.0284],
        [-24.3443,  25.4757, -21.3267],
        [-24.6801,  25.7064, -21.4151],
        [-24.7738,  26.0380, -21.5126],
        [-24.4688,  25.5233, -21.1500]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.7031,  20.8083,  21.7983, -24.5865,  25.9872, -21.5452],
        [ 25.9

epoch:1, loss:-5.3729729652404785
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4437, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.6374, 20.7355, 21.5430],
        [25.8303, 21.2678, 22.0803],
        [25.1886, 20.7188, 21.5971],
        [25.8071, 20.9437, 21.4495],
        [26.0585, 20.9421, 22.0145],
        [25.5948, 20.8441, 21.3342]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(49.4880, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.9059,  25.3175, -21.2097],
        [-24.6665,  25.6084, -21.4437],
        [-24.0924,  25.4194, -21.1148],
        [-24.5365,  25.8720, -21.1231],
        [-24.8893,  25.9390, -21.6491],
        [-23.9574,  25.4197, -21.3360]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.6374,  20.7355,  21.5430, -23.9059,  25.3175, -21.2097],
        [ 25.

epoch:1, loss:-5.369766712188721
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1777, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.9554, 21.4977, 22.2025],
        [25.5459, 20.9748, 21.8422],
        [26.0619, 21.1969, 21.8056],
        [25.8795, 20.9396, 22.0164],
        [25.3389, 20.7558, 21.6494],
        [25.7058, 20.7451, 21.4783]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(24.8896, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-23.9607,  25.5588, -20.9488],
        [-24.2384,  25.7386, -21.3724],
        [-24.2248,  25.0857, -21.1276],
        [-24.3402,  25.7585, -21.3787],
        [-24.4239,  25.6265, -21.1295],
        [-24.4749,  25.9834, -21.2190]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.9554,  21.4977,  22.2025, -23.9607,  25.5588, -20.9488],
        [ 25.5

epoch:1, loss:-5.576909065246582
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2820, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.0785, 21.7161, 22.1356],
        [24.8780, 20.7654, 20.9598],
        [25.8997, 21.3220, 21.9823],
        [25.5618, 20.5031, 21.5744],
        [26.1776, 21.6093, 22.1603],
        [25.5221, 20.8213, 21.6359]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(33.3145, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.4578,  25.6637, -21.2093],
        [-24.9124,  26.1274, -21.8360],
        [-24.3431,  25.8101, -21.6022],
        [-24.1528,  25.2249, -20.9821],
        [-25.2185,  26.1720, -22.1924],
        [-24.9731,  26.3074, -21.9145]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.0785,  21.7161,  22.1356, -24.4578,  25.6637, -21.2093],
        [ 24.8

epoch:1, loss:-5.393967151641846
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8388, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.1585, 21.1556, 22.2626],
        [26.4302, 21.5010, 22.5075],
        [26.0022, 21.2582, 21.9424],
        [25.9896, 21.1081, 21.8853],
        [25.8960, 21.1230, 21.4175],
        [26.3099, 21.5010, 22.4512]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(50.1679, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.1478,  24.8142, -20.7919],
        [-24.5772,  26.2524, -21.7618],
        [-24.8615,  25.8906, -21.4134],
        [-24.3762,  25.5275, -21.6595],
        [-24.8958,  26.2371, -21.5736],
        [-24.2176,  25.2099, -21.2072]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.1585,  21.1556,  22.2626, -24.1478,  24.8142, -20.7919],
        [ 26.4

epoch:1, loss:-5.467453956604004
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.6958, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.8709, 20.7085, 21.5925],
        [26.3487, 21.5117, 22.3037],
        [26.2252, 21.0366, 21.9598],
        [26.0748, 21.3193, 22.2101],
        [26.1658, 21.2253, 21.9429],
        [25.9679, 21.2450, 22.3299]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(50.3285, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.8950,  25.9151, -21.8411],
        [-24.3412,  25.6821, -21.3004],
        [-24.8019,  26.3519, -21.7581],
        [-25.1556,  26.0846, -22.0237],
        [-24.8349,  26.0633, -21.6545],
        [-24.8782,  26.0994, -21.5537]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.8709,  20.7085,  21.5925, -24.8950,  25.9151, -21.8411],
        [ 26.3

epoch:1, loss:-5.420872211456299
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7502, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.1023, 21.2486, 21.7767],
        [25.6018, 21.2431, 21.7064],
        [25.7997, 20.5661, 21.9028],
        [26.2884, 21.5313, 21.9491],
        [25.7015, 21.0144, 22.0793],
        [26.1132, 21.1760, 22.1962]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.2413, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.4972,  25.6962, -21.9958],
        [-24.3740,  25.5736, -21.4037],
        [-24.4756,  26.0543, -21.9987],
        [-25.2786,  26.5264, -22.3039],
        [-25.0773,  26.2698, -21.9970],
        [-24.9204,  25.8041, -21.8182]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.1023,  21.2486,  21.7767, -24.4972,  25.6962, -21.9958],
        [ 25.6

epoch:1, loss:-5.427718162536621
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8145, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.0318, 21.7364, 22.0858],
        [25.4954, 20.7774, 21.7424],
        [26.4928, 21.2779, 22.5453],
        [26.4092, 21.4572, 22.2755],
        [25.9260, 21.2154, 22.2631],
        [25.8043, 20.8987, 22.0703]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.5420, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.2900,  26.5042, -22.2311],
        [-24.9083,  25.8232, -21.9865],
        [-24.2736,  26.3824, -22.0042],
        [-25.0885,  26.3657, -21.9378],
        [-25.0463,  25.9103, -21.8636],
        [-24.5463,  25.7691, -21.6152]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.0318,  21.7364,  22.0858, -25.2900,  26.5042, -22.2311],
        [ 25.4

epoch:1, loss:-5.562824249267578
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4985, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[25.6374, 21.4917, 22.4730],
        [26.0553, 21.3090, 22.0189],
        [25.6671, 21.1701, 21.6242],
        [25.5820, 20.4681, 21.2327],
        [26.2987, 21.5284, 22.3137],
        [26.0509, 21.3715, 22.1141]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(49.7523, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.6968,  25.6666, -21.5995],
        [-24.8269,  25.9121, -21.8930],
        [-24.9094,  26.0671, -22.1908],
        [-24.3105,  25.9606, -21.6949],
        [-25.0571,  26.2524, -21.7664],
        [-24.6150,  25.7888, -21.5315]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 25.6374,  21.4917,  22.4730, -24.6968,  25.6666, -21.5995],
        [ 26.0

epoch:1, loss:-5.572372913360596
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9504, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.0575, 21.5404, 22.2240],
        [26.0557, 21.2737, 22.3228],
        [26.6332, 21.6935, 21.8718],
        [25.6174, 21.0876, 21.9170],
        [26.3343, 21.4638, 21.7344],
        [26.2803, 21.8458, 22.2949]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.5069, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.0461,  26.4443, -22.0573],
        [-24.8536,  26.2056, -21.8447],
        [-24.4643,  25.9118, -21.3850],
        [-25.1177,  26.6270, -22.1968],
        [-24.7870,  26.3738, -22.0996],
        [-24.6072,  26.3480, -22.0428]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.0575,  21.5404,  22.2240, -25.0461,  26.4443, -22.0573],
        [ 26.0

epoch:1, loss:-5.591063499450684
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.1222, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.1783, 21.2778, 22.1622],
        [26.0306, 20.7932, 21.9328],
        [26.0382, 21.4232, 22.1255],
        [26.0421, 20.9853, 21.5817],
        [26.4254, 21.4467, 22.2051],
        [26.7624, 22.1061, 22.8158]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(34.0396, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.9994,  26.0377, -22.0045],
        [-24.9699,  26.2634, -22.1388],
        [-24.9710,  25.6978, -21.9565],
        [-24.7189,  25.8997, -21.9723],
        [-24.8153,  26.3926, -22.1028],
        [-24.8401,  26.2873, -21.9303]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.1783,  21.2778,  22.1622, -24.9994,  26.0377, -22.0045],
        [ 26.0

epoch:1, loss:-5.643794059753418
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9047, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.4822, 21.5720, 22.1755],
        [26.4741, 21.3178, 22.3020],
        [26.0578, 21.3566, 21.8957],
        [25.6494, 20.8081, 21.8845],
        [26.1380, 21.8680, 22.5272],
        [26.4209, 21.4762, 22.3329]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.2607, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.1905,  26.0392, -21.9916],
        [-25.1711,  26.3644, -22.1364],
        [-25.3699,  26.0576, -22.2934],
        [-25.2944,  26.7141, -22.1541],
        [-24.9886,  26.6312, -22.4399],
        [-24.3289,  26.1106, -21.2626]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.4822,  21.5720,  22.1755, -25.1905,  26.0392, -21.9916],
        [ 26.4

epoch:1, loss:-5.6447954177856445
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5642, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.1529, 21.5614, 22.5909],
        [26.0784, 21.1788, 22.4066],
        [26.3117, 21.4187, 22.4195],
        [26.2592, 21.6779, 22.5289],
        [26.4898, 21.1383, 22.1655],
        [26.7129, 21.6470, 22.3468]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(33.5427, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.6767,  26.1015, -21.5567],
        [-24.8890,  25.8828, -21.9106],
        [-25.3472,  26.5933, -22.2887],
        [-25.4169,  26.9991, -22.4482],
        [-24.9485,  26.1826, -21.7030],
        [-24.8414,  26.1395, -21.6261]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.1529,  21.5614,  22.5909, -24.6767,  26.1015, -21.5567],
        [ 26.

epoch:1, loss:-5.742901802062988
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6135, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.0697, 21.3456, 22.3606],
        [26.0183, 21.4336, 22.1934],
        [27.0151, 22.0413, 22.9515],
        [26.6297, 21.7022, 22.2698],
        [26.2281, 21.7836, 22.7301],
        [26.3565, 21.6069, 22.6895]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.8449, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.2603,  26.8111, -22.4095],
        [-25.0846,  26.6234, -21.7515],
        [-25.0891,  26.9183, -22.3859],
        [-25.4305,  26.7167, -22.2816],
        [-25.5028,  26.7814, -22.6918],
        [-25.2040,  26.9967, -22.5970]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.0697,  21.3456,  22.3606, -25.2603,  26.8111, -22.4095],
        [ 26.0

epoch:1, loss:-5.71702241897583
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5775, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.1448, 21.7061, 21.9683],
        [26.6462, 22.0912, 22.2555],
        [26.4583, 21.4754, 22.5513],
        [26.8080, 22.0665, 22.3946],
        [26.5685, 21.6624, 22.3727],
        [26.0502, 21.3612, 22.0219]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.7322, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.4114,  26.8397, -22.2122],
        [-24.5464,  26.1013, -21.8446],
        [-24.9076,  25.9598, -22.0535],
        [-24.6045,  25.8140, -21.5688],
        [-25.2313,  26.4923, -22.1438],
        [-25.5979,  26.7908, -22.4961]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.1448,  21.7061,  21.9683, -25.4114,  26.8397, -22.2122],
        [ 26.64

epoch:1, loss:-5.728199005126953
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3782, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.9127, 22.0173, 22.7281],
        [26.2702, 21.2594, 21.9266],
        [26.6775, 21.9010, 22.5453],
        [26.4197, 21.6973, 22.3499],
        [26.9100, 21.9362, 22.8997],
        [25.7694, 21.2626, 21.7225]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(17.2690, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.4861,  26.7687, -22.7796],
        [-25.4218,  26.6696, -22.0592],
        [-24.8411,  26.4034, -21.9896],
        [-25.2482,  26.5687, -22.3477],
        [-25.3948,  26.4024, -22.2905],
        [-25.5803,  26.7272, -22.7990]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.9127,  22.0173,  22.7281, -25.4861,  26.7687, -22.7796],
        [ 26.2

epoch:1, loss:-5.713675498962402
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8075, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.1667, 21.4798, 22.0233],
        [26.3153, 21.7256, 22.6741],
        [26.6747, 22.2984, 22.8451],
        [26.9580, 21.7452, 22.8600],
        [26.4111, 21.4202, 22.1187],
        [26.3736, 21.7280, 22.0024]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.9938, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-24.6360,  25.8875, -22.4328],
        [-25.3063,  26.7599, -22.3110],
        [-25.3284,  26.9627, -22.4022],
        [-25.3983,  26.5718, -22.3646],
        [-25.6915,  27.1410, -22.5711],
        [-25.3153,  26.4135, -22.5224]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.1667,  21.4798,  22.0233, -24.6360,  25.8875, -22.4328],
        [ 26.3

epoch:1, loss:-5.696831703186035
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7049, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.6537, 21.8098, 22.4707],
        [26.5567, 21.6960, 22.2441],
        [27.0636, 22.4099, 22.9970],
        [26.5407, 21.6798, 22.3303],
        [27.1667, 22.1394, 23.0052],
        [26.2961, 21.8569, 23.0188]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(25.7065, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.1929,  26.0322, -21.9624],
        [-25.7392,  27.1891, -22.6336],
        [-25.3025,  26.0979, -22.4911],
        [-25.5819,  26.7669, -22.8385],
        [-24.7077,  25.9573, -21.6423],
        [-24.9734,  25.7787, -21.6693]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.6537,  21.8098,  22.4707, -25.1929,  26.0322, -21.9624],
        [ 26.5

epoch:1, loss:-5.77553129196167
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.0282, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.4522, 21.5079, 22.0982],
        [26.9841, 22.4335, 23.1242],
        [27.0760, 22.0454, 23.0410],
        [26.8451, 22.0195, 22.6693],
        [26.5838, 21.8994, 22.4176],
        [26.3088, 21.6997, 22.2476]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.7699, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.5438,  26.9732, -22.6193],
        [-25.6959,  26.7168, -22.6100],
        [-25.5049,  26.1603, -22.5459],
        [-25.7790,  26.7076, -22.6098],
        [-25.2346,  26.4562, -21.9391],
        [-25.1227,  26.4327, -21.8713]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.4522,  21.5079,  22.0982, -25.5438,  26.9732, -22.6193],
        [ 26.98

epoch:1, loss:-5.799091815948486
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.1351, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.6240, 21.9418, 22.5109],
        [26.9591, 22.2581, 23.1049],
        [26.8153, 21.9365, 22.7067],
        [26.1996, 21.8451, 22.4624],
        [27.0863, 22.3384, 23.0776],
        [27.0317, 22.1948, 23.2015]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(43.6383, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.6416,  26.6986, -22.6107],
        [-25.3912,  26.6343, -22.4065],
        [-25.9274,  27.5361, -22.9541],
        [-25.6504,  26.7568, -22.2604],
        [-25.3428,  26.7258, -22.3697],
        [-25.2997,  26.2504, -22.1867]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.6240,  21.9418,  22.5109, -25.6416,  26.6986, -22.6107],
        [ 26.9

epoch:1, loss:-5.88944673538208
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4486, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.0016, 22.4434, 23.2018],
        [26.4356, 21.7765, 22.4387],
        [27.4673, 22.3116, 22.9510],
        [26.6747, 21.3200, 22.4413],
        [26.9849, 21.7823, 22.9428],
        [27.0207, 21.7642, 22.6895]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(34.9558, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.5778,  26.9344, -22.8379],
        [-25.2998,  26.6491, -22.5207],
        [-25.1095,  26.9543, -22.2765],
        [-25.6717,  27.0074, -23.0627],
        [-26.0534,  27.1564, -22.8499],
        [-25.4253,  26.7207, -22.4709]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.0016,  22.4434,  23.2018, -25.5778,  26.9344, -22.8379],
        [ 26.43

epoch:1, loss:-5.847139835357666
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4490, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.4454, 22.1244, 22.4168],
        [26.5581, 21.5189, 22.9895],
        [27.1188, 22.4360, 23.2488],
        [26.4708, 22.1289, 23.1044],
        [27.1423, 22.7015, 23.3694],
        [26.9481, 22.0212, 22.8334]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.3135, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.1500,  26.5984, -22.5942],
        [-25.4999,  26.7505, -22.7180],
        [-25.1390,  26.6453, -22.5583],
        [-26.0253,  27.1389, -23.1031],
        [-25.9937,  27.2246, -22.9838],
        [-25.0786,  26.5945, -22.0618]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.4454,  22.1244,  22.4168, -25.1500,  26.5984, -22.5942],
        [ 26.5

epoch:1, loss:-5.896975517272949
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4408, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.7767, 22.2641, 22.7350],
        [26.1276, 21.5513, 22.1852],
        [27.1913, 22.2924, 23.1871],
        [26.8889, 22.5738, 23.2725],
        [26.5501, 21.7351, 22.6956],
        [26.6309, 21.7850, 22.9159]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.0579, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.8378,  27.0502, -22.8177],
        [-26.1208,  27.2482, -23.1359],
        [-25.4536,  26.6678, -22.8648],
        [-25.8234,  26.7681, -22.8136],
        [-24.6479,  26.5850, -22.1194],
        [-25.4957,  27.0273, -22.5282]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.7767,  22.2641,  22.7350, -25.8378,  27.0502, -22.8177],
        [ 26.1

epoch:1, loss:-5.906905174255371
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8549, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.5765, 22.0217, 22.6474],
        [27.1926, 22.1490, 23.0657],
        [26.7527, 22.4469, 22.9803],
        [26.9122, 21.9805, 22.5856],
        [26.8373, 21.8534, 22.4156],
        [26.7727, 21.7849, 22.3714]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(43.3716, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.0510,  27.2528, -22.7212],
        [-25.6274,  26.8426, -23.0970],
        [-26.0836,  27.2734, -23.2317],
        [-25.3254,  26.3977, -22.6833],
        [-25.4561,  26.9261, -22.4935],
        [-25.6585,  27.1693, -23.0987]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.5765,  22.0217,  22.6474, -26.0510,  27.2528, -22.7212],
        [ 27.1

epoch:1, loss:-5.779446125030518
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.7045, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.5977, 21.9221, 22.8333],
        [26.0596, 21.5636, 22.6791],
        [26.7817, 22.2835, 22.9773],
        [26.3009, 22.1336, 22.8575],
        [27.4848, 22.1933, 23.3702],
        [27.3496, 22.6215, 23.5702]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(42.6499, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.9544,  27.0106, -22.9667],
        [-25.9558,  27.0662, -23.0799],
        [-25.0464,  26.6639, -22.3693],
        [-25.3368,  26.6545, -22.7167],
        [-25.2481,  26.5936, -22.5789],
        [-25.5739,  27.3463, -22.7138]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.5977,  21.9221,  22.8333, -25.9544,  27.0106, -22.9667],
        [ 26.0

epoch:1, loss:-5.943390846252441
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6100, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.8781, 22.1015, 22.7051],
        [27.1017, 22.4243, 22.8632],
        [27.0828, 22.5943, 23.4583],
        [27.3036, 22.4155, 23.2180],
        [26.8574, 22.1684, 22.6683],
        [26.0161, 21.5989, 22.5302]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(43.2118, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.4303,  26.7464, -22.5799],
        [-25.8415,  26.9288, -23.1304],
        [-26.0049,  27.0294, -22.8565],
        [-25.4228,  26.8426, -22.7536],
        [-25.3128,  26.8377, -22.3156],
        [-25.1510,  26.6102, -22.4192]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.8781,  22.1015,  22.7051, -25.4303,  26.7464, -22.5799],
        [ 27.1

epoch:1, loss:-5.989816188812256
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3982, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.9801, 22.2068, 22.9631],
        [26.8752, 22.2538, 23.1366],
        [27.1362, 22.6328, 23.0382],
        [27.2031, 22.3556, 23.3532],
        [26.8848, 22.0122, 22.8080],
        [26.9729, 22.3779, 23.2509]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(52.1305, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.2943,  27.5681, -23.1030],
        [-25.7513,  27.0542, -23.3034],
        [-25.6923,  27.1944, -22.5772],
        [-26.2484,  26.9775, -23.0290],
        [-25.8447,  27.0796, -22.6510],
        [-25.7850,  26.9347, -22.8618]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.9801,  22.2068,  22.9631, -26.2943,  27.5681, -23.1030],
        [ 26.8

epoch:1, loss:-5.955976963043213
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7360, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.1173, 22.1202, 22.8602],
        [27.0034, 22.1088, 22.9624],
        [27.5096, 22.6960, 23.0120],
        [26.8037, 21.9716, 22.8679],
        [27.1741, 22.1933, 22.8967],
        [26.9799, 22.2472, 22.9442]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(51.6926, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.8900,  26.9736, -22.9544],
        [-25.9567,  27.0578, -22.4300],
        [-26.1457,  27.0051, -22.9098],
        [-26.0725,  27.7613, -23.3185],
        [-25.6568,  27.1111, -22.8391],
        [-26.2782,  27.0499, -23.2285]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.1173,  22.1202,  22.8602, -25.8900,  26.9736, -22.9544],
        [ 27.0

epoch:1, loss:-5.977970600128174
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.4409, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.3007, 22.4974, 23.1833],
        [27.1114, 22.5019, 23.6089],
        [27.1791, 22.2340, 23.0737],
        [26.8294, 22.4098, 22.9085],
        [27.5536, 22.4882, 23.1034],
        [27.2212, 22.5904, 23.3991]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(52.4565, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.8398,  27.3126, -23.1776],
        [-25.6393,  26.3916, -22.7164],
        [-25.9404,  27.3445, -23.0802],
        [-26.1168,  27.6141, -23.4450],
        [-25.5650,  26.5928, -22.5342],
        [-25.8935,  27.3485, -23.2719]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.3007,  22.4974,  23.1833, -25.8398,  27.3126, -23.1776],
        [ 27.1

epoch:1, loss:-6.137003421783447
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7887, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.2212, 22.6176, 23.3516],
        [27.4089, 22.3826, 23.2144],
        [27.1675, 22.4545, 23.3276],
        [27.4020, 22.2882, 23.4797],
        [27.9072, 22.7425, 23.5223],
        [27.5082, 22.9004, 23.3763]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(43.6925, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.1780,  27.3185, -23.3045],
        [-26.2279,  27.3319, -22.8857],
        [-25.5787,  26.5376, -22.7806],
        [-26.0284,  27.4959, -23.2012],
        [-26.1611,  27.5755, -23.4026],
        [-25.7793,  26.7213, -22.5599]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.2212,  22.6176,  23.3516, -26.1780,  27.3185, -23.3045],
        [ 27.4

epoch:1, loss:-6.160897254943848
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7467, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.6833, 22.5791, 23.4564],
        [26.8082, 22.3966, 22.7580],
        [27.9635, 23.0534, 23.8348],
        [27.4175, 22.9815, 23.7461],
        [26.6738, 22.2437, 22.9066],
        [27.2739, 22.9747, 23.5604]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(52.8339, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.7602,  27.4132, -23.0832],
        [-26.5626,  27.7185, -23.5690],
        [-26.7734,  27.8150, -23.7456],
        [-26.5661,  27.8265, -23.4809],
        [-25.9504,  27.3694, -22.9902],
        [-26.0781,  27.2829, -23.3168]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.6833,  22.5791,  23.4564, -25.7602,  27.4132, -23.0832],
        [ 26.8

epoch:1, loss:-6.139070987701416
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.3896, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[26.6288, 21.7726, 22.7912],
        [27.4519, 22.7678, 23.2428],
        [27.3960, 22.7825, 23.2894],
        [27.1854, 22.4988, 23.2121],
        [27.2471, 23.0605, 23.6644],
        [27.4310, 22.8086, 23.5162]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.4695, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.0732,  27.1949, -22.8186],
        [-25.2299,  26.6941, -22.8382],
        [-26.1781,  27.8100, -23.3935],
        [-26.3765,  27.4314, -23.4696],
        [-26.8955,  28.1123, -23.7912],
        [-25.1677,  26.6059, -22.6953]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 26.6288,  21.7726,  22.7912, -26.0732,  27.1949, -22.8186],
        [ 27.4

epoch:1, loss:-6.1176886558532715
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1104, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.8569, 22.8852, 23.6256],
        [27.4095, 22.7197, 23.5244],
        [27.3186, 22.7403, 23.0547],
        [27.2905, 22.9260, 23.7234],
        [27.4122, 23.0713, 23.4060],
        [27.0778, 22.6856, 23.3657]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(45.1699, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.3537,  27.6124, -23.4313],
        [-26.3613,  27.9795, -23.4318],
        [-26.6876,  27.9808, -23.9181],
        [-26.1586,  27.6559, -23.3932],
        [-26.4279,  27.6116, -23.3715],
        [-26.6013,  27.5551, -23.4399]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.8569,  22.8852,  23.6256, -26.3537,  27.6124, -23.4313],
        [ 27.

epoch:1, loss:-6.194766044616699
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3126, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.6356, 22.3978, 23.6778],
        [27.9983, 23.1302, 23.5076],
        [27.4272, 22.6662, 23.1100],
        [27.4112, 22.6844, 23.1870],
        [28.1685, 23.0941, 23.6827],
        [27.1155, 22.6402, 23.3068]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(26.3598, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.3630,  27.4774, -23.5106],
        [-26.0251,  27.1577, -23.1435],
        [-25.8222,  27.2988, -23.4302],
        [-26.0114,  27.5778, -23.4194],
        [-25.6115,  26.6392, -22.1571],
        [-26.1538,  27.5639, -23.2101]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.6356,  22.3978,  23.6778, -26.3630,  27.4774, -23.5106],
        [ 27.9

epoch:1, loss:-6.165699481964111
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2350, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.0576, 22.4629, 23.1129],
        [27.6817, 22.8219, 23.2925],
        [27.1766, 22.1570, 23.4333],
        [27.5064, 22.6434, 23.5274],
        [27.3430, 22.6924, 23.1359],
        [27.7011, 23.0167, 23.5109]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(53.7352, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.8670,  27.6970, -23.2366],
        [-26.3772,  27.4803, -23.3044],
        [-26.2057,  27.3553, -23.0991],
        [-26.5107,  27.7430, -23.6002],
        [-26.2169,  27.6252, -23.3377],
        [-25.8108,  27.5221, -23.3429]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.0576,  22.4629,  23.1129, -25.8670,  27.6970, -23.2366],
        [ 27.6

epoch:1, loss:-6.27678918838501
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3519, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.9866, 22.9679, 24.2114],
        [26.8639, 22.5546, 23.0234],
        [27.6726, 22.9712, 23.5104],
        [27.2913, 22.7419, 23.2410],
        [27.2009, 22.7613, 22.9837],
        [27.5481, 22.8430, 23.0568]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(34.9145, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.1199,  27.1100, -23.6551],
        [-25.9378,  27.3025, -23.0866],
        [-26.4051,  27.5418, -23.6690],
        [-26.6998,  28.2411, -23.5978],
        [-26.5902,  27.6803, -23.7582],
        [-26.2426,  27.1515, -23.3881]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.9866,  22.9679,  24.2114, -26.1199,  27.1100, -23.6551],
        [ 26.86

epoch:1, loss:-6.181157112121582
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.0275, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.2009, 23.5237, 23.9993],
        [27.2376, 22.8187, 23.0168],
        [28.0028, 23.3801, 23.7404],
        [27.6002, 22.9206, 23.8901],
        [26.9191, 22.4510, 23.7071],
        [27.7268, 23.2437, 23.8806]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.8492, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.0616,  27.4559, -23.2092],
        [-26.4098,  27.8207, -23.7245],
        [-26.0471,  27.5044, -23.6424],
        [-26.5867,  27.5912, -23.8562],
        [-26.1155,  27.2829, -23.3496],
        [-26.2226,  27.7449, -23.7227]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.2009,  23.5237,  23.9993, -26.0616,  27.4559, -23.2092],
        [ 27.2

epoch:1, loss:-6.286011695861816
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5462, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.8601, 23.1791, 23.3179],
        [27.5948, 22.9829, 23.5426],
        [27.2436, 22.6807, 23.2102],
        [28.4412, 23.1058, 24.4873],
        [27.8812, 23.3441, 23.9136],
        [26.9809, 22.5534, 23.4707]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(27.2061, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.2335,  27.6081, -23.2450],
        [-26.6217,  27.8664, -23.5349],
        [-26.1770,  27.1161, -23.1890],
        [-26.9235,  27.8865, -24.1393],
        [-26.5423,  27.3962, -23.5124],
        [-26.3176,  27.3199, -23.3430]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.8601,  23.1791,  23.3179, -26.2335,  27.6081, -23.2450],
        [ 27.5

epoch:1, loss:-6.293581008911133
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.3561, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.3697, 22.9071, 23.3814],
        [27.3086, 22.3815, 23.3001],
        [28.0370, 23.4511, 23.9263],
        [27.6757, 23.1359, 23.4633],
        [27.3128, 22.9131, 23.4579],
        [27.1190, 22.0099, 23.2412]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.2157, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-25.4650,  26.6816, -22.8317],
        [-25.7794,  27.3215, -23.0117],
        [-26.2348,  27.5352, -23.4545],
        [-26.4714,  27.6919, -23.5688],
        [-26.2115,  27.7500, -23.4936],
        [-27.0635,  28.3361, -24.6462]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.3697,  22.9071,  23.3814, -25.4650,  26.6816, -22.8317],
        [ 27.3

epoch:1, loss:-6.348987579345703
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.6327, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.0004, 23.5169, 23.8785],
        [27.6622, 23.1564, 23.7944],
        [27.8986, 23.1338, 24.1102],
        [27.7041, 23.2448, 24.2775],
        [27.3482, 22.7084, 23.7505],
        [28.1206, 23.6601, 24.1310]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(35.8147, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.3922,  27.5013, -23.6397],
        [-26.8369,  28.1178, -24.0854],
        [-26.6339,  27.7720, -23.5421],
        [-27.0048,  28.3250, -24.2349],
        [-26.0900,  27.0384, -22.9352],
        [-26.4937,  27.7332, -23.5661]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.0004,  23.5169,  23.8785, -26.3922,  27.5013, -23.6397],
        [ 27.6

epoch:1, loss:-6.32328462600708
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7337, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.2728, 22.4385, 23.0376],
        [26.3427, 21.5494, 22.9072],
        [27.4013, 22.5859, 23.5259],
        [27.8461, 23.3724, 24.2008],
        [27.9330, 22.8937, 23.7041],
        [27.9622, 23.1664, 23.4328]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(45.7785, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.7825,  28.3101, -23.9639],
        [-26.8652,  28.1538, -23.8846],
        [-26.6818,  28.2167, -23.6835],
        [-26.6491,  27.9371, -23.4433],
        [-26.7756,  28.2993, -23.8299],
        [-26.8085,  27.6509, -23.5624]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.2728,  22.4385,  23.0376, -26.7825,  28.3101, -23.9639],
        [ 26.34

epoch:1, loss:-6.402603626251221
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.0679, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.9903, 23.3575, 24.0491],
        [28.3902, 23.6003, 23.9021],
        [27.9881, 23.1915, 23.5826],
        [27.9414, 23.3430, 23.7760],
        [28.4203, 23.3783, 24.3785],
        [27.5836, 22.8391, 23.6974]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(54.6817, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.9743,  27.8313, -23.9728],
        [-26.5538,  27.4392, -23.1802],
        [-26.6276,  27.9236, -23.8249],
        [-26.7384,  28.2156, -24.1872],
        [-26.7722,  27.7739, -23.9225],
        [-26.8775,  28.3628, -23.9929]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.9903,  23.3575,  24.0491, -26.9743,  27.8313, -23.9728],
        [ 28.3

epoch:1, loss:-6.3447184562683105
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0319, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.9123, 23.1067, 23.7076],
        [27.4634, 23.4858, 24.0785],
        [27.8909, 23.5112, 24.1092],
        [27.7278, 23.1966, 23.8438],
        [27.9416, 23.2925, 23.6350],
        [28.0559, 23.6310, 23.9871]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.4018, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.0427,  28.3102, -24.3204],
        [-27.1780,  28.0638, -23.8636],
        [-26.9857,  28.0672, -23.8509],
        [-27.3153,  28.3130, -24.2526],
        [-26.4810,  27.8779, -23.5648],
        [-27.0232,  28.4347, -24.3284]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.9123,  23.1067,  23.7076, -27.0427,  28.3102, -24.3204],
        [ 27.

epoch:1, loss:-6.379024028778076
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.0877, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.6259, 23.0627, 23.8806],
        [27.3922, 22.9061, 23.7562],
        [27.4547, 23.1297, 23.8775],
        [28.2143, 23.3180, 24.3609],
        [27.7897, 22.9097, 23.6585],
        [28.3242, 23.9418, 24.0568]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.5617, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.2098,  28.4084, -24.0773],
        [-27.0783,  27.6874, -24.2465],
        [-26.3768,  27.8119, -23.3510],
        [-26.7777,  28.0197, -23.8860],
        [-26.7251,  28.0505, -23.5335],
        [-26.4595,  27.5146, -23.6139]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.6259,  23.0627,  23.8806, -27.2098,  28.4084, -24.0773],
        [ 27.3

epoch:1, loss:-6.487983703613281
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5986, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.3498, 23.1720, 23.8949],
        [27.1852, 23.0887, 23.9264],
        [28.1742, 23.8242, 24.0296],
        [27.6052, 23.3555, 23.8087],
        [27.8946, 23.6528, 23.9660],
        [26.9128, 22.9274, 22.9936]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(45.8877, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.6140,  28.0007, -24.0521],
        [-26.5311,  27.6093, -23.5204],
        [-27.0633,  28.4877, -24.3538],
        [-26.5508,  28.3194, -24.0074],
        [-27.4079,  28.9168, -24.5834],
        [-26.8063,  27.8892, -24.0380]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.3498,  23.1720,  23.8949, -26.6140,  28.0007, -24.0521],
        [ 27.1

epoch:1, loss:-6.351722717285156
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.0225, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.3688, 23.3084, 24.1416],
        [28.1601, 23.6080, 24.2680],
        [28.2183, 23.8971, 24.7502],
        [27.3001, 22.7542, 23.5103],
        [28.2680, 23.3881, 24.3378],
        [28.2574, 23.8082, 24.4468]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.0221, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.9515,  28.0245, -24.4173],
        [-26.8958,  28.1175, -23.7809],
        [-26.7885,  28.1627, -23.7992],
        [-26.9404,  27.7169, -24.2039],
        [-26.9119,  28.0922, -24.0430],
        [-26.4503,  27.6321, -23.6842]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.3688,  23.3084,  24.1416, -26.9515,  28.0245, -24.4173],
        [ 28.1

epoch:1, loss:-6.581818580627441
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7612, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.1620, 22.9303, 24.3376],
        [28.1812, 23.8117, 24.4561],
        [27.9049, 23.6807, 24.2307],
        [27.3817, 23.2632, 23.9387],
        [28.8794, 24.1037, 24.7240],
        [28.2786, 23.3762, 24.2480]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.4478, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.2898,  28.7682, -24.2856],
        [-27.4307,  28.9014, -24.4512],
        [-26.2264,  27.7248, -23.9330],
        [-27.1354,  28.1896, -24.4147],
        [-26.9900,  28.2972, -24.1669],
        [-27.5559,  28.6527, -24.6774]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.1620,  22.9303,  24.3376, -27.2898,  28.7682, -24.2856],
        [ 28.1

epoch:1, loss:-6.507805824279785
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9188, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.3757, 23.5895, 24.4471],
        [28.2397, 23.6116, 24.2057],
        [28.0223, 23.4993, 23.9575],
        [28.8804, 23.9359, 24.7463],
        [27.7241, 23.4711, 23.9117],
        [27.9689, 23.5835, 24.4313]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.9515, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.9264,  29.0066, -24.4576],
        [-27.3620,  28.1944, -24.1450],
        [-27.6631,  28.9557, -24.4215],
        [-27.1786,  28.6639, -24.7141],
        [-26.1298,  27.4535, -23.7151],
        [-26.8216,  28.0703, -23.8309]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.3757,  23.5895,  24.4471, -27.9264,  29.0066, -24.4576],
        [ 28.2

epoch:1, loss:-6.502106189727783
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.2460, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.1853, 23.2538, 23.7732],
        [28.6810, 24.1090, 24.2772],
        [28.3249, 23.7604, 24.4607],
        [28.0911, 23.2742, 23.8293],
        [28.4636, 23.9204, 24.4303],
        [27.7466, 23.8153, 24.1203]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(54.0201, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.8176,  28.3456, -24.1491],
        [-27.3191,  28.4880, -24.2774],
        [-26.1473,  27.5297, -23.6288],
        [-26.9036,  28.5434, -24.6431],
        [-27.3629,  28.3785, -24.3833],
        [-26.6298,  27.6764, -23.9945]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.1853,  23.2538,  23.7732, -26.8176,  28.3456, -24.1491],
        [ 28.6

epoch:1, loss:-6.5511980056762695
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4123, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[27.9763, 23.5186, 24.5238],
        [28.5519, 24.0857, 24.2710],
        [28.0331, 23.9382, 24.2312],
        [28.5632, 23.7208, 24.7223],
        [28.2711, 23.8735, 24.5478],
        [28.3065, 23.7751, 24.6349]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(18.4656, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.4196,  28.6148, -24.3014],
        [-27.0032,  27.6406, -23.7654],
        [-26.9708,  27.7881, -23.8854],
        [-26.7284,  27.9432, -24.3008],
        [-27.4368,  28.6924, -24.4249],
        [-27.5942,  28.7323, -24.7811]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 27.9763,  23.5186,  24.5238, -27.4196,  28.6148, -24.3014],
        [ 28.

epoch:1, loss:-6.476741790771484
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1280, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.3813, 23.6939, 24.4627],
        [28.1375, 24.2853, 24.1994],
        [28.0196, 23.5973, 23.9895],
        [28.4952, 24.3098, 24.5401],
        [28.2646, 23.1160, 24.3623],
        [28.9404, 24.2487, 24.8652]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.7538, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.0752,  28.4947, -24.7030],
        [-27.0527,  28.0674, -24.0985],
        [-27.5967,  28.7053, -24.4510],
        [-27.4272,  28.4668, -24.7307],
        [-27.8716,  29.1914, -25.0177],
        [-27.4754,  28.6855, -24.5974]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.3813,  23.6939,  24.4627, -27.0752,  28.4947, -24.7030],
        [ 28.1

epoch:1, loss:-6.650601863861084
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.5230, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.4439, 23.5578, 24.4115],
        [28.2647, 23.7715, 24.1011],
        [27.9493, 23.7193, 24.0912],
        [28.2695, 23.7757, 24.7487],
        [28.8292, 24.5091, 24.7291],
        [28.0951, 23.4633, 24.2428]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(55.3441, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.2918,  27.6174, -23.5303],
        [-27.9688,  28.8845, -24.8519],
        [-26.9327,  28.3814, -24.1270],
        [-27.2322,  28.5394, -24.5369],
        [-27.0673,  28.5699, -24.3070],
        [-26.4630,  28.1166, -23.8147]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.4439,  23.5578,  24.4115, -26.2918,  27.6174, -23.5303],
        [ 28.2

epoch:1, loss:-6.655755996704102
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4327, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.8696, 24.0670, 24.8168],
        [28.6293, 24.0289, 24.5517],
        [28.7814, 24.3439, 24.4303],
        [28.2663, 23.9237, 24.3790],
        [28.1984, 23.9556, 24.1079],
        [28.5558, 23.6531, 24.4203]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.6219, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.2707,  28.5727, -24.7572],
        [-26.7386,  28.1646, -24.2948],
        [-27.6334,  28.7272, -24.7589],
        [-27.5021,  28.6824, -24.8329],
        [-27.1299,  28.1756, -24.3179],
        [-27.1607,  28.1139, -24.2840]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.8696,  24.0670,  24.8168, -27.2707,  28.5727, -24.7572],
        [ 28.6

epoch:1, loss:-6.6888227462768555
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2026, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.2640, 23.5783, 24.3064],
        [28.0944, 23.7433, 24.4146],
        [29.1663, 24.3714, 24.8518],
        [27.8425, 23.4932, 23.8420],
        [27.6997, 23.0045, 23.3013],
        [28.6064, 23.6305, 24.4945]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(36.4243, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.8293,  28.4517, -24.1137],
        [-27.8477,  29.0337, -24.8870],
        [-27.2057,  28.1795, -24.3835],
        [-27.6111,  28.9744, -24.5940],
        [-27.1221,  28.3546, -24.2924],
        [-28.0108,  29.0552, -24.8303]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.2640,  23.5783,  24.3064, -26.8293,  28.4517, -24.1137],
        [ 28.

epoch:1, loss:-6.785391807556152
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1799, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.8271, 24.3105, 24.4385],
        [28.4070, 23.6787, 24.6087],
        [28.3797, 23.9212, 24.2169],
        [28.6330, 24.0337, 24.4043],
        [28.1874, 24.0243, 24.1596],
        [28.1354, 23.5787, 24.3660]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(45.8501, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.4164,  27.8631, -24.1158],
        [-27.0219,  28.0603, -23.8675],
        [-27.7463,  28.6944, -24.5601],
        [-28.1150,  29.6461, -25.4389],
        [-27.5443,  28.9852, -24.4180],
        [-27.0499,  28.5178, -24.3853]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.8271,  24.3105,  24.4385, -26.4164,  27.8631, -24.1158],
        [ 28.4

epoch:1, loss:-6.675902366638184
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.0301, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.1015, 23.2287, 24.4297],
        [28.0692, 23.6141, 24.4823],
        [28.6204, 24.0309, 24.5903],
        [28.8733, 24.3036, 24.7167],
        [28.7791, 24.5503, 24.8523],
        [29.1054, 24.1795, 24.9287]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.0083, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-26.7851,  28.4199, -24.3745],
        [-27.3650,  28.5397, -24.3513],
        [-27.4679,  28.8174, -24.4566],
        [-27.0509,  28.4978, -24.7143],
        [-27.0290,  28.2287, -24.7451],
        [-27.9896,  28.9505, -25.2594]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.1015,  23.2287,  24.4297, -26.7851,  28.4199, -24.3745],
        [ 28.0

epoch:1, loss:-6.691075325012207
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.2064, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.8839, 24.3037, 25.3449],
        [28.3076, 23.6273, 24.5396],
        [28.1630, 24.2077, 24.8460],
        [28.5015, 24.0266, 24.1898],
        [28.1341, 23.5223, 24.3674],
        [28.5889, 24.1870, 24.5740]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.2191, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.7642,  28.9590, -24.9529],
        [-27.4923,  28.7664, -25.0596],
        [-27.9285,  28.9824, -25.0186],
        [-27.5434,  28.7880, -24.9284],
        [-28.3320,  29.6595, -25.3857],
        [-27.3797,  28.5714, -24.6932]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.8839,  24.3037,  25.3449, -27.7642,  28.9590, -24.9529],
        [ 28.3

epoch:1, loss:-6.752771854400635
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1853, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.5865, 24.2456, 24.5310],
        [28.2522, 23.8983, 24.3044],
        [28.3275, 23.9938, 24.6413],
        [28.5516, 24.0830, 24.7340],
        [29.1059, 24.3203, 25.0499],
        [29.0955, 24.6467, 24.9010]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(27.5522, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.5143,  29.0609, -24.6810],
        [-27.5478,  28.9052, -25.0728],
        [-28.1943,  29.9302, -25.1822],
        [-27.3676,  28.8837, -24.9377],
        [-27.9299,  29.1174, -25.0516],
        [-27.8520,  29.0288, -24.9208]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.5865,  24.2456,  24.5310, -27.5143,  29.0609, -24.6810],
        [ 28.2

epoch:1, loss:-6.815430164337158
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.8391, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.6650, 24.1684, 24.3300],
        [28.4316, 24.5688, 24.7321],
        [29.1875, 24.5914, 25.1072],
        [28.2450, 24.0360, 24.3060],
        [28.4780, 24.4242, 24.8527],
        [29.0690, 24.3893, 25.2428]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(46.8255, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.7623,  28.4834, -24.6136],
        [-27.2475,  28.5587, -24.4991],
        [-28.1480,  28.9712, -25.1760],
        [-27.5783,  28.2401, -24.3377],
        [-27.7496,  28.9912, -25.1348],
        [-28.1356,  29.4402, -25.6006]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.6650,  24.1684,  24.3300, -27.7623,  28.4834, -24.6136],
        [ 28.4

epoch:1, loss:-6.807179927825928
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.9304, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.9925, 24.5458, 24.7520],
        [28.4244, 24.1870, 24.4412],
        [28.2790, 23.7193, 24.2257],
        [28.4006, 24.2767, 24.2364],
        [28.9550, 24.2221, 25.3378],
        [29.2741, 24.7495, 25.5055]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.2483, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-28.1773,  29.3259, -24.7257],
        [-27.9065,  28.8691, -24.9671],
        [-28.1287,  29.2524, -25.4432],
        [-26.8342,  28.4897, -24.5274],
        [-27.6348,  29.0519, -24.9795],
        [-28.2050,  29.1336, -25.1059]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.9925,  24.5458,  24.7520, -28.1773,  29.3259, -24.7257],
        [ 28.4

epoch:1, loss:-6.923034191131592
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.8800, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.8861, 24.3791, 24.8511],
        [28.8050, 24.6467, 24.9980],
        [28.9251, 24.3780, 24.8620],
        [28.8514, 24.6390, 25.1881],
        [28.9224, 24.2303, 24.8858],
        [28.6832, 24.4808, 25.0735]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.4622, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.8584,  29.2936, -24.9166],
        [-27.7019,  29.0697, -25.0456],
        [-28.5579,  29.2828, -25.6126],
        [-27.3119,  28.6099, -24.8273],
        [-27.6042,  28.8561, -24.6981],
        [-28.0272,  28.9834, -25.1734]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.8861,  24.3791,  24.8511, -27.8584,  29.2936, -24.9166],
        [ 28.8

epoch:1, loss:-6.86877965927124
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7395, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.3831, 25.0751, 25.2127],
        [28.6363, 24.2158, 24.9012],
        [28.9629, 24.7570, 24.7122],
        [28.6749, 24.4232, 24.5988],
        [29.2426, 24.9384, 25.1695],
        [28.6234, 24.3261, 24.6631]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(56.5848, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.9041,  28.8455, -24.7690],
        [-27.8230,  29.1663, -24.7260],
        [-27.8891,  29.0391, -25.4450],
        [-28.9000,  30.1225, -26.0588],
        [-27.3158,  27.9562, -24.1438],
        [-27.8446,  29.1470, -25.1756]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.3831,  25.0751,  25.2127, -27.9041,  28.8455, -24.7690],
        [ 28.63

epoch:1, loss:-6.846578598022461
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4893, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.1800, 25.2423, 25.2967],
        [29.2167, 24.3188, 25.2898],
        [29.1330, 24.2646, 25.2959],
        [28.9499, 24.6145, 25.1880],
        [29.2484, 24.6192, 25.2757],
        [28.9741, 24.1560, 24.6508]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(56.2592, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-28.3090,  29.3434, -25.3302],
        [-27.7414,  29.3478, -25.1230],
        [-28.4543,  29.8049, -25.7133],
        [-27.8726,  28.9709, -24.9947],
        [-27.9369,  28.9306, -25.0131],
        [-27.6997,  29.0465, -24.8307]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.1800,  25.2423,  25.2967, -28.3090,  29.3434, -25.3302],
        [ 29.2

epoch:1, loss:-6.870548248291016
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.0255, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.3292, 24.7398, 25.4794],
        [28.7180, 24.3312, 24.9829],
        [29.2516, 24.7468, 25.4320],
        [28.8123, 24.3788, 25.0581],
        [29.0440, 24.4293, 24.8046],
        [28.9111, 24.5024, 25.0158]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(47.5117, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-28.2878,  29.2797, -25.0440],
        [-27.8265,  28.7296, -24.8668],
        [-28.2054,  29.4638, -25.1382],
        [-27.3318,  28.8370, -24.5542],
        [-27.7612,  29.0486, -25.0519],
        [-27.5904,  29.2644, -25.3757]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.3292,  24.7398,  25.4794, -28.2878,  29.2797, -25.0440],
        [ 28.7

epoch:1, loss:-7.05684232711792
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.8306, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.0347, 24.6432, 25.4828],
        [29.0213, 24.2458, 25.0930],
        [29.0153, 24.7043, 25.4779],
        [29.3101, 24.6077, 25.3815],
        [28.8916, 24.4406, 24.7018],
        [28.5494, 24.6259, 24.8570]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.8800, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.7003,  28.6791, -24.6093],
        [-27.8253,  29.1704, -24.8020],
        [-28.4186,  29.2557, -25.1960],
        [-28.3142,  29.2242, -24.9720],
        [-28.1427,  28.9470, -25.3869],
        [-28.7243,  29.5953, -25.7349]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.0347,  24.6432,  25.4828, -27.7003,  28.6791, -24.6093],
        [ 29.02

epoch:1, loss:-6.9681854248046875
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.0083, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.9881, 24.6298, 25.4579],
        [29.0461, 24.5625, 24.9674],
        [28.7841, 24.3145, 24.9134],
        [29.3871, 24.6144, 25.0351],
        [27.6581, 23.4234, 23.9294],
        [28.5672, 24.7329, 24.7217]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(47.1928, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-27.3294,  28.5418, -24.7794],
        [-27.9271,  29.4018, -24.7753],
        [-27.7606,  29.0684, -25.1720],
        [-28.0112,  29.5637, -25.4110],
        [-27.9015,  29.0200, -24.7948],
        [-27.4190,  28.0412, -24.9170]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.9881,  24.6298,  25.4579, -27.3294,  28.5418, -24.7794],
        [ 29.

epoch:1, loss:-6.888181686401367
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.5832, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[28.0546, 24.1325, 24.7257],
        [28.5139, 24.5397, 25.0517],
        [29.1702, 24.3354, 25.0676],
        [29.2473, 24.5996, 25.3201],
        [29.0783, 24.4420, 25.1511],
        [29.1282, 24.4687, 25.1655]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(28.1432, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-28.1065,  29.5732, -25.2615],
        [-28.5614,  30.0053, -25.5807],
        [-28.2984,  29.7364, -25.4462],
        [-27.9470,  29.1298, -25.2540],
        [-27.8757,  29.1116, -25.3483],
        [-28.0688,  29.6144, -25.1804]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 28.0546,  24.1325,  24.7257, -28.1065,  29.5732, -25.2615],
        [ 28.5

epoch:1, loss:-6.945169925689697
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(1.4420, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.2411, 24.8110, 25.5171],
        [29.4199, 24.9597, 25.3056],
        [29.1564, 24.5833, 25.0230],
        [29.0708, 24.0781, 25.1043],
        [29.2223, 24.2101, 25.1963],
        [29.3516, 24.8301, 25.1297]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.6816, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-28.0467,  28.8259, -25.1737],
        [-27.8857,  29.1269, -25.1813],
        [-28.1808,  29.4093, -25.2836],
        [-28.2489,  29.2264, -25.3878],
        [-28.2459,  28.9682, -25.0221],
        [-28.3327,  29.5144, -25.6248]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.2411,  24.8110,  25.5171, -28.0467,  28.8259, -25.1737],
        [ 29.4

epoch:1, loss:-7.078799724578857
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.0957, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.1389, 25.0924, 25.4812],
        [29.9612, 25.3887, 25.8288],
        [29.4235, 24.6151, 25.5871],
        [29.9501, 25.1239, 25.8026],
        [29.0501, 24.8015, 24.6698],
        [28.3030, 24.5369, 24.8897]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.8535, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-28.6461,  29.5862, -25.8756],
        [-27.4698,  28.3816, -24.7392],
        [-27.3086,  28.8581, -24.9542],
        [-27.5476,  28.9909, -25.3361],
        [-28.4639,  29.9058, -25.7755],
        [-28.7169,  29.8472, -26.0239]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.1389,  25.0924,  25.4812, -28.6461,  29.5862, -25.8756],
        [ 29.9

epoch:1, loss:-7.105643272399902
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.1595, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.2380, 24.4481, 24.9965],
        [28.9519, 24.4928, 24.9252],
        [28.9652, 24.8390, 24.8576],
        [28.8047, 24.6266, 24.8083],
        [29.4326, 24.8706, 25.4465],
        [28.9974, 24.9736, 25.5488]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(37.8696, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-28.3137,  29.6519, -25.7875],
        [-27.9180,  29.0865, -24.7624],
        [-28.2312,  29.4118, -25.4500],
        [-28.8607,  30.0371, -26.1793],
        [-28.8568,  29.7937, -25.9742],
        [-28.6203,  29.6455, -25.7726]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.2380,  24.4481,  24.9965, -28.3137,  29.6519, -25.7875],
        [ 28.9

epoch:1, loss:-7.0097737312316895
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(3.5400, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.5678, 25.4506, 25.8476],
        [29.4981, 24.9818, 25.3344],
        [29.2939, 24.7423, 25.2374],
        [29.0944, 24.8791, 25.2120],
        [29.1667, 24.7923, 24.9321],
        [29.1830, 24.7473, 24.9332]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(38.8359, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-28.4461,  29.6262, -25.5398],
        [-28.7506,  30.0912, -25.8797],
        [-28.9748,  30.2406, -26.1667],
        [-28.8056,  29.8653, -25.8321],
        [-27.9794,  29.4511, -25.3959],
        [-28.0513,  29.0221, -25.4265]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.5678,  25.4506,  25.8476, -28.4461,  29.6262, -25.5398],
        [ 29.

epoch:1, loss:-7.213237762451172
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(2.7546, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.3492, 24.9581, 25.4175],
        [29.8264, 25.1416, 25.7614],
        [29.1119, 24.5910, 25.3049],
        [29.8289, 25.6082, 25.7583],
        [28.7590, 24.3599, 25.1665],
        [28.9254, 24.4041, 25.3075]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(58.2148, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-28.3916,  29.4825, -25.5230],
        [-28.7848,  29.8467, -25.9438],
        [-28.7472,  29.8749, -26.3172],
        [-28.5388,  29.7117, -25.9667],
        [-28.3454,  29.6961, -25.7714],
        [-28.2814,  29.5874, -25.8925]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.3492,  24.9581,  25.4175, -28.3916,  29.4825, -25.5230],
        [ 29.8

epoch:1, loss:-7.25870418548584
<class 'dict'>
[SequenceClassifierOutput(loss=tensor(0.7402, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[29.6216, 24.4576, 25.5923],
        [29.8774, 25.6431, 26.0495],
        [29.2977, 24.9809, 25.5186],
        [29.7391, 25.3073, 26.1094],
        [29.4141, 25.1447, 25.4114],
        [29.2952, 25.0679, 25.9913]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=tensor(58.5700, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-29.1873,  30.1541, -26.4645],
        [-28.8337,  29.6196, -26.0694],
        [-28.5826,  30.0485, -25.8523],
        [-28.0860,  28.9091, -24.8004],
        [-28.5168,  30.0130, -25.9532],
        [-29.3297,  30.1394, -26.3611]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]
Printing last hidden states
tensor([[ 29.6216,  24.4576,  25.5923, -29.1873,  30.1541, -26.4645],
        [ 29.87

# Test Code

In [10]:
# standard pytorch way of doing things
# 1. create a custom Dataset 
# 2. pass the dataset to a dataloader
# 3. iterate the dataloader and pass the inputs to the model

max_len = 256
batch_size = 6
grad_step = 1
initialization_input = (max_len, batch_size)

#Reading datasets and initializing data loaders
dataset_location = '../2022.07.07_task5/'

#Gives us tweet_id, sentence, and label for each dataset.
train_data = read_task5(dataset_location , split = 'dev')
#test_data = read_task5(dataset_location , split = 'dev')#load test set
labels_to_ids = task5_labels_to_ids
#input_data = (train_data, dev_data, labels_to_ids)

dataloader_m1 = initialize_data(model1_tokenizer, initialization_input, train_data, labels_to_ids, shuffle = False)
dataloader_m2 = initialize_data(model2_tokenizer, initialization_input, train_data, labels_to_ids, shuffle = False)


complete_outputs, complete_label_ids = [], []

# iterate the QA and the AQ inputs simultaneously
for step, combined_batch in enumerate(zip(dataloader_m1, dataloader_m2)):
  # only forward pass so no dropout
  model.eval()
  batch_1, batch_2 = combined_batch


  # no back pass so no need to track variables for differentiation
  with torch.no_grad():
    inputs = {
        "input_ids": [batch_1[0].to(device, dtype = torch.long), batch_2[0].to(device, dtype = torch.long)],
        "attention_mask": [batch_1[1].to(device, dtype = torch.long), batch_2[1].to(device, dtype = torch.long)],
        "labels": [batch_1[2].to(device, dtype = torch.long), batch_2[2].to(device, dtype = torch.long)]
    }
    outputs = model(**inputs)
    tmp_eval_loss, logits = outputs[:2]
    logits = logits.detach().cpu().numpy()
    outputs = np.argmax(logits, axis=1)
    label_ids = inputs["labels"].detach().cpu().numpy()
  complete_outputs.extend(outputs)
  complete_label_ids.extend(label_ids)

print(complete_outputs, complete_label_ids)

NameError: name 'prepare_data' is not defined