## Importing BERT model

In [1]:
from transformers import BertTokenizer, BertModel

In [2]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3090


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
#checking tokenizer
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
}

## Importing CoLA dataset

### grammatically correct or not

In [5]:
import wget
import os

print('Downloading dataset...')

# The URL for the dataset zip file.
url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

# Download the file (if we haven't already)
if not os.path.exists('/scratch/paneah/cola_public_1.1.zip'):
    wget.download(url, '/scratch/paneah/cola_public_1.1.zip')

# Unzip the dataset (if we haven't already)
if not os.path.exists('/scratch/paneah/cola_public'):
    !unzip /scratch/paneah/cola_public_1.1.zip -d /scratch/paneah/

Downloading dataset...


In [6]:
!nvidia-smi

Thu Dec 14 22:58:45 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090         On | 00000000:81:00.0 Off |                  N/A |
| 37%   28C    P8               23W / 350W|      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090         On | 00000000:A1:00.0 Off |  

In [7]:
import pandas as pd

#loading dataset with pandas
df = pd.read_csv("/scratch/paneah/cola_public/raw/in_domain_train.tsv", \
                 delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])
#printing number of sentences
print('Number of training sentences: {:,}\n'.format(df.shape[0]))
#display random 10 rows 
df.sample(10)

Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
5071,ks08,0,*,I believe that the world is round strongly.
2294,l-93,0,*,The witch turned him from a prince.
979,bc01,1,,John bought the picture of himself that Bill saw.
8366,ad03,1,,Aphrodite may quickly free the animals.
5857,c_13,1,,The Brazilians pumped the oil across the river.
5367,b_73,1,,Susan doesn't eat her vegetables enough.
4560,ks08,0,*,It has rain every day for the last week.
6403,d_98,1,,Every woman standing under that tree is Mary's...
4356,ks08,1,,There are believed to be sheep in the park.
6299,c_13,1,,Frank will eat an apple and Morgan will too.


In [8]:
# Get the lists of sentences and their labels.
sentences = df.sentence.values
labels = df.label.values

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'

                        # This function also supports truncation and conversion
                        # to pytorch tensors, but we need to do padding, so we
                        # can't use these features :( .
                        #max_length = 128,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  Our friends won't buy this analysis, let alone the next one we propose.
Token IDs: [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102]


In [9]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  Our friends won't buy this analysis, let alone the next one we propose.
Token IDs: tensor([  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
         2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [10]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

7,695 training samples
  856 validation samples


In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [12]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [13]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [14]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [16]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [27]:
from tqdm import tqdm

seed_val = 42

torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

for epoch_i in tqdm(range(0, epochs), desc = "epoch", mininterval=0.01):
    total_train_loss = 0

    model.train()

    for step, batch in tqdm(enumerate(train_dataloader), desc = "step", mininterval=0.01):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
    
        


epoch:   0%|                                                                                      | 0/4 [00:00<?, ?it/s]

SequenceClassifierOutput(loss=tensor(0.8277, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6166,  0.1010],
        [ 0.7037,  0.0917],
        [ 0.5382,  0.2110],
        [ 0.3998,  0.0816],
        [ 0.7558,  0.1214],
        [ 0.5495,  0.0447],
        [ 0.5952,  0.0849],
        [ 0.6200, -0.1042],
        [ 0.8252,  0.0635],
        [ 0.5924,  0.1349],
        [ 0.7005,  0.0552],
        [ 0.9809,  0.1697],
        [ 0.7981,  0.1985],
        [ 0.6155,  0.1968],
        [ 0.9245,  0.2495],
        [ 1.0990,  0.1884],
        [ 0.5896,  0.1726],
        [ 0.7901,  0.0817],
        [ 0.5910,  0.1578],
        [ 0.7196,  0.1819],
        [ 0.7065, -0.0479],
        [ 0.8457, -0.1434],
        [ 0.9219,  0.1112],
        [ 1.0231,  0.0543],
        [ 0.1673,  0.0660],
        [ 0.8025,  0.3730],
        [ 0.5688, -0.0068],
        [ 0.8737,  0.4722],
        [ 0.6920,  0.3135],
        [ 0.5603,  0.0166],
        [ 0.7782, -0.0880],
        [ 0.9404,  0.3009]], devic



SequenceClassifierOutput(loss=tensor(0.8913, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6741,  0.1703],
        [ 0.9368,  0.3134],
        [ 0.8631,  0.0445],
        [ 0.9039,  0.1512],
        [ 1.0093,  0.0530],
        [ 0.5826,  0.2729],
        [ 0.8901,  0.1489],
        [ 0.4644, -0.0527],
        [ 0.8479,  0.3709],
        [ 0.5641, -0.2604],
        [ 0.7914,  0.2472],
        [ 0.7523,  0.2347],
        [ 0.6239, -0.0688],
        [ 0.7732,  0.0906],
        [ 0.6484, -0.0154],
        [ 1.0744,  0.0494],
        [ 0.8737,  0.2988],
        [ 0.7966,  0.0043],
        [ 0.3217, -0.2110],
        [ 0.7336,  0.0112],
        [ 1.1123,  0.0492],
        [ 0.9147,  0.0979],
        [ 0.7654, -0.1320],
        [ 0.5138, -0.0419],
        [ 0.7282, -0.0145],
        [ 0.6101,  0.1462],
        [ 0.6644, -0.1601],
        [ 0.8492,  0.0910],
        [ 0.5035, -0.0565],
        [ 0.7657, -0.1354],
        [ 0.6726, -0.0154],
        [ 0.6577,  0.0623]], devic



SequenceClassifierOutput(loss=tensor(0.8718, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8013,  0.0165],
        [ 0.7721,  0.1568],
        [ 0.5262,  0.1656],
        [ 0.6242,  0.1874],
        [ 0.6123,  0.1490],
        [ 0.6285, -0.0356],
        [ 0.7219, -0.1199],
        [ 0.4162,  0.0638],
        [ 0.7555, -0.1570],
        [ 0.7468,  0.1609],
        [ 0.7483,  0.1573],
        [ 0.8780,  0.0436],
        [ 0.5386,  0.1808],
        [ 0.8515, -0.0087],
        [ 0.9322,  0.0745],
        [ 0.8046,  0.1786],
        [ 0.4342,  0.3201],
        [ 0.5046,  0.1219],
        [ 0.6050, -0.0177],
        [ 0.5709,  0.0630],
        [ 0.6432, -0.1031],
        [ 0.6741,  0.3958],
        [ 0.7630,  0.1847],
        [ 0.6751,  0.2565],
        [ 0.6180,  0.1092],
        [ 0.8566,  0.0625],
        [ 0.8524,  0.1339],
        [ 0.5476, -0.0033],
        [ 0.6616,  0.1793],
        [ 0.7525,  0.0500],
        [ 0.6805,  0.2232],
        [ 0.7609, -0.0425]], devic



SequenceClassifierOutput(loss=tensor(0.9334, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.1114, -0.1638],
        [ 0.6663, -0.0262],
        [ 0.8716, -0.1542],
        [ 0.8704,  0.4767],
        [ 0.7988,  0.0932],
        [ 0.5793,  0.2470],
        [ 0.5803,  0.1878],
        [ 0.9828,  0.0132],
        [ 0.4216,  0.1455],
        [ 0.5369,  0.2185],
        [ 0.9622,  0.3689],
        [ 0.7027,  0.0021],
        [ 0.8307,  0.0089],
        [ 0.4864, -0.0713],
        [ 1.0567,  0.2167],
        [ 0.2881,  0.0990],
        [ 0.7652,  0.3147],
        [ 0.1572,  0.1062],
        [ 0.7299, -0.0083],
        [ 0.7482, -0.0043],
        [ 0.4070,  0.1844],
        [ 0.5493,  0.0803],
        [ 0.3532, -0.0496],
        [ 0.5104, -0.1920],
        [ 0.2599,  0.1624],
        [ 0.7489,  0.1132],
        [ 0.7353,  0.0075],
        [ 0.9183,  0.1722],
        [ 0.8054,  0.2692],
        [ 0.5702, -0.0021],
        [ 0.5001,  0.2045],
        [ 0.6884,  0.1129]], devic



SequenceClassifierOutput(loss=tensor(0.9883, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0154,  0.0540],
        [ 1.1272, -0.2768],
        [ 0.8259,  0.2002],
        [ 1.0778,  0.1666],
        [ 0.5451,  0.0104],
        [ 0.5281, -0.0246],
        [ 0.8180,  0.0858],
        [ 0.4402, -0.1584],
        [ 0.5353, -0.1856],
        [ 0.5377, -0.0891],
        [ 0.5019, -0.0181],
        [ 0.6739,  0.3389],
        [ 0.8877,  0.3220],
        [ 0.6936,  0.0425],
        [ 0.5720,  0.0857],
        [ 0.9082, -0.0809],
        [ 0.6806,  0.1451],
        [ 0.7265,  0.0259],
        [ 0.8149,  0.1764],
        [ 0.8266,  0.2165],
        [ 1.0856, -0.0125],
        [ 0.7491,  0.2409],
        [ 0.9826, -0.2565],
        [ 0.4873,  0.0335],
        [ 0.5607, -0.2586],
        [ 0.6946,  0.0841],
        [ 0.6564,  0.1702],
        [ 0.4824,  0.1498],
        [ 0.6044,  0.0845],
        [ 0.8497,  0.0137],
        [ 0.8875,  0.0169],
        [ 0.4369,  0.0393]], devic



SequenceClassifierOutput(loss=tensor(0.8996, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8819, -0.0782],
        [ 0.8120,  0.2015],
        [ 0.8280,  0.1018],
        [ 0.5224,  0.2624],
        [ 0.6668,  0.0711],
        [ 0.9448,  0.0769],
        [ 0.8869,  0.0027],
        [ 0.7694, -0.0320],
        [ 0.7398,  0.2512],
        [ 0.9987,  0.0129],
        [ 0.5731, -0.1579],
        [ 0.6236, -0.1410],
        [ 0.3795, -0.0044],
        [ 0.5862, -0.1329],
        [ 0.5953,  0.1445],
        [ 0.7215, -0.0208],
        [ 0.8022,  0.0858],
        [ 0.8111,  0.1654],
        [ 0.8531,  0.1633],
        [ 0.8908,  0.0270],
        [ 0.6283,  0.1683],
        [ 0.9584,  0.1934],
        [ 0.3836, -0.2876],
        [ 0.7911, -0.0098],
        [ 0.8683,  0.1271],
        [ 0.7334, -0.0914],
        [ 0.7618,  0.2448],
        [ 0.7044,  0.0102],
        [ 0.7280, -0.0203],
        [ 0.8964, -0.0135],
        [ 0.6112,  0.2077],
        [ 0.6518,  0.0358]], devic



SequenceClassifierOutput(loss=tensor(0.8977, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7326,  0.0645],
        [ 0.5904, -0.0730],
        [ 0.9431,  0.0394],
        [ 0.7694,  0.1983],
        [ 0.6283,  0.0110],
        [ 0.9245,  0.1878],
        [ 0.7841,  0.3142],
        [ 0.9582,  0.1728],
        [ 0.8566,  0.2043],
        [ 0.7538,  0.0821],
        [ 0.7804,  0.1129],
        [ 0.9259,  0.2114],
        [ 0.6083,  0.1056],
        [ 0.6603,  0.1815],
        [ 0.7111,  0.1846],
        [ 0.7097, -0.0682],
        [ 0.9133,  0.2671],
        [ 0.9168, -0.0834],
        [ 0.9165,  0.0192],
        [ 0.9025,  0.3314],
        [ 0.9935,  0.1144],
        [ 0.3444,  0.0916],
        [ 0.7070,  0.0920],
        [ 0.4855, -0.0426],
        [ 0.4975,  0.1174],
        [ 0.8218,  0.2108],
        [ 0.6838,  0.2916],
        [ 0.8603, -0.0818],
        [ 0.8318, -0.1513],
        [ 0.6936, -0.0753],
        [ 0.5416, -0.1034],
        [ 0.6496, -0.0216]], devic



SequenceClassifierOutput(loss=tensor(0.9047, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.1344, -0.0391],
        [ 0.8676, -0.0169],
        [ 0.4475, -0.0532],
        [ 0.6975, -0.0323],
        [ 1.1248,  0.1675],
        [ 0.9191,  0.0775],
        [ 0.8485, -0.1859],
        [ 0.5359, -0.1683],
        [ 1.0996,  0.0960],
        [ 0.8187,  0.2764],
        [ 0.5448,  0.1096],
        [ 0.5970, -0.2303],
        [ 0.9162,  0.2128],
        [ 0.6160, -0.0662],
        [ 0.8021,  0.0839],
        [ 0.7971,  0.2322],
        [ 0.6144, -0.0072],
        [ 0.5029,  0.1405],
        [ 0.8243, -0.0319],
        [ 0.8529,  0.0143],
        [ 0.4625,  0.1268],
        [ 1.0049,  0.2263],
        [ 0.5204,  0.0750],
        [ 0.6829, -0.0407],
        [ 0.9577,  0.0677],
        [ 0.5276,  0.1717],
        [ 0.3977, -0.1486],
        [ 0.6895,  0.1089],
        [ 1.0385,  0.1962],
        [ 0.8625,  0.0373],
        [ 0.8402,  0.0744],
        [ 1.0444,  0.0672]], devic



SequenceClassifierOutput(loss=tensor(0.8769, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 1.0215,  0.1907],
        [ 0.7611,  0.0351],
        [ 0.6846,  0.3695],
        [ 0.5846,  0.3336],
        [ 0.5906,  0.1491],
        [ 0.9230,  0.1697],
        [ 0.9080,  0.3431],
        [ 0.7049,  0.1708],
        [ 0.5078,  0.0797],
        [ 0.7085, -0.0323],
        [ 0.3937,  0.0588],
        [ 0.5708,  0.1493],
        [ 0.5406,  0.1333],
        [ 0.5496, -0.0269],
        [ 0.4111, -0.2115],
        [ 0.7018,  0.0912],
        [ 0.6919,  0.0884],
        [ 0.7805, -0.0269],
        [ 0.5058, -0.0838],
        [ 0.6228,  0.1619],
        [ 0.7654, -0.1163],
        [ 0.6120,  0.0850],
        [ 0.6854, -0.2262],
        [ 0.8543, -0.1871],
        [ 0.6340, -0.1378],
        [ 0.7754, -0.0482],
        [ 0.8516,  0.1337],
        [ 0.5423, -0.0164],
        [ 0.8214,  0.0528],
        [ 0.6096,  0.0728],
        [ 0.5461, -0.0691],
        [ 1.0150,  0.1890]], devic



SequenceClassifierOutput(loss=tensor(0.9327, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5114,  0.0834],
        [ 0.7375,  0.0838],
        [ 0.7553,  0.1489],
        [ 1.1158,  0.2019],
        [ 0.6253, -0.0620],
        [ 0.7816,  0.0367],
        [ 0.6240, -0.1125],
        [ 0.7310,  0.0408],
        [ 0.6864, -0.0400],
        [ 0.7638, -0.0423],
        [ 0.8992,  0.2960],
        [ 0.6337,  0.0794],
        [ 0.6217,  0.0470],
        [ 0.6262,  0.4199],
        [ 0.8973,  0.1524],
        [ 0.5611, -0.0112],
        [ 0.6637,  0.1645],
        [ 0.3126, -0.0500],
        [ 0.8113,  0.1330],
        [ 0.6679,  0.1955],
        [ 0.8766, -0.0538],
        [ 0.5556,  0.2334],
        [ 0.7177,  0.1736],
        [ 0.5080,  0.0032],
        [ 0.5215,  0.0721],
        [ 1.0290, -0.0191],
        [ 0.6493, -0.0429],
        [ 0.8098, -0.1136],
        [ 0.6378,  0.0082],
        [ 0.4444, -0.0251],
        [ 0.7029,  0.0144],
        [ 0.6549,  0.2135]], devic



SequenceClassifierOutput(loss=tensor(0.8177, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6116, -0.0603],
        [ 0.9753,  0.1734],
        [ 0.6653, -0.0380],
        [ 0.6374, -0.0123],
        [ 0.8856, -0.0035],
        [ 0.6688, -0.0446],
        [ 0.5746, -0.1724],
        [ 0.8455,  0.2885],
        [ 1.0047, -0.0814],
        [ 0.6539,  0.3154],
        [ 0.4657,  0.0521],
        [ 0.6903,  0.1643],
        [ 0.8759,  0.0350],
        [ 0.5756,  0.1649],
        [ 0.4734,  0.0949],
        [ 0.5324,  0.1271],
        [ 0.6762,  0.1318],
        [ 0.6295,  0.2842],
        [ 0.7637, -0.0720],
        [ 0.6279, -0.1423],
        [ 0.9809, -0.0191],
        [ 0.9209,  0.2414],
        [ 0.7446,  0.1233],
        [ 0.8373,  0.3265],
        [ 0.7542,  0.2114],
        [ 0.8230, -0.2252],
        [ 0.6766,  0.1068],
        [ 0.7402,  0.0147],
        [ 0.7733,  0.0660],
        [ 0.6913,  0.0741],
        [ 0.5876,  0.0830],
        [ 0.6409,  0.0867]], devic



SequenceClassifierOutput(loss=tensor(0.8912, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 5.2901e-01, -3.8281e-02],
        [ 9.4804e-01, -1.1985e-01],
        [ 6.4083e-01,  4.0773e-02],
        [ 5.8466e-01,  2.4103e-01],
        [ 8.1452e-01,  6.1113e-02],
        [ 8.2432e-01,  1.3295e-01],
        [ 5.8847e-01, -1.0444e-01],
        [ 6.7361e-01,  1.9297e-01],
        [ 6.4619e-01, -7.4764e-02],
        [ 5.8571e-01,  1.6353e-01],
        [ 6.4824e-01,  2.1187e-01],
        [ 9.3620e-01,  2.9705e-02],
        [ 6.6083e-01, -4.1256e-01],
        [ 8.2464e-01,  1.9091e-02],
        [ 6.5119e-01,  1.6299e-02],
        [ 5.4662e-01,  1.3118e-01],
        [ 5.6673e-01,  2.8193e-05],
        [ 1.0668e+00,  1.6578e-01],
        [ 2.6687e-01, -5.0496e-04],
        [ 8.2637e-01,  1.2314e-01],
        [ 5.9668e-01,  1.8491e-01],
        [ 8.9041e-01,  1.1976e-01],
        [ 8.6318e-01,  1.3094e-01],
        [ 6.6053e-01,  3.9777e-01],
        [ 6.7943e-01, -6.2069e-02],
  



SequenceClassifierOutput(loss=tensor(0.8386, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5965,  0.0724],
        [ 0.7626, -0.0381],
        [ 0.3541,  0.0017],
        [ 0.4979, -0.0878],
        [ 0.5124,  0.3211],
        [ 0.7916,  0.0479],
        [ 0.3828, -0.0311],
        [ 0.6076,  0.1449],
        [ 0.8373,  0.0624],
        [ 0.8096, -0.0344],
        [ 0.5645,  0.0729],
        [ 0.7931,  0.1882],
        [ 0.6361,  0.0873],
        [ 0.7741,  0.1555],
        [ 0.4029,  0.2069],
        [ 0.6594,  0.1453],
        [ 0.5113, -0.0449],
        [ 1.0216, -0.1808],
        [ 0.6500,  0.0605],
        [ 0.9580,  0.0056],
        [ 0.7847,  0.2056],
        [ 0.8339, -0.0236],
        [ 0.6600,  0.1297],
        [ 0.8205, -0.1580],
        [ 0.7702, -0.0371],
        [ 0.7879,  0.1325],
        [ 0.4576,  0.1350],
        [ 1.0805, -0.0348],
        [ 0.8066,  0.1085],
        [ 0.6103, -0.0037],
        [ 0.7293,  0.0515],
        [ 1.1807, -0.2248]], devic



SequenceClassifierOutput(loss=tensor(0.8829, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 6.1622e-01,  7.1860e-02],
        [ 6.2543e-01,  3.9688e-02],
        [ 7.3078e-01,  4.1522e-02],
        [ 7.7229e-01,  4.2979e-02],
        [ 7.3689e-01, -8.2457e-02],
        [ 7.5863e-01,  5.3932e-04],
        [ 9.3950e-01, -3.6670e-02],
        [ 7.0246e-01,  1.2101e-01],
        [ 9.4806e-01,  3.1183e-01],
        [ 7.7320e-01,  2.0410e-01],
        [ 9.6442e-01, -5.9754e-02],
        [ 6.7515e-01,  1.3025e-01],
        [ 7.0456e-01,  1.5444e-01],
        [ 5.7400e-01,  2.3947e-01],
        [ 5.4267e-01,  1.1250e-01],
        [ 9.4476e-01, -7.8788e-02],
        [ 8.7332e-01,  2.3671e-01],
        [ 7.8376e-01, -5.4763e-02],
        [ 8.7265e-01,  2.0808e-01],
        [ 5.5896e-01,  1.1233e-01],
        [ 8.7515e-01,  2.7334e-01],
        [ 6.4312e-01, -1.3958e-01],
        [ 5.3403e-01,  4.7582e-02],
        [ 5.9638e-01,  3.3218e-02],
        [ 6.5045e-01, -1.4873e-02],
  



SequenceClassifierOutput(loss=tensor(0.7599, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9366,  0.1067],
        [ 0.5960,  0.1079],
        [ 0.8220,  0.2125],
        [ 0.7687,  0.0600],
        [ 0.6518,  0.0759],
        [ 0.5056,  0.0161],
        [ 0.7914,  0.1558],
        [ 0.6380,  0.1288],
        [ 0.6533,  0.3592],
        [ 0.7486,  0.0855],
        [ 0.6113, -0.1430],
        [ 0.8705,  0.0397],
        [ 0.4788,  0.1087],
        [ 0.6497,  0.0516],
        [ 0.8489,  0.1140],
        [ 0.8160,  0.3392],
        [ 0.6135,  0.2408],
        [ 0.7742, -0.2305],
        [ 0.0132, -0.0359],
        [ 0.8455,  0.1080],
        [ 0.7084,  0.2726],
        [ 0.6957,  0.3174],
        [ 0.7527,  0.2822],
        [ 0.7878,  0.1622],
        [ 0.5887, -0.1545],
        [ 0.6041, -0.1610],
        [ 0.4747,  0.0190],
        [ 0.9993,  0.0300],
        [ 0.7494,  0.0574],
        [ 0.8390, -0.1532],
        [ 0.6234,  0.0372],
        [ 0.5245, -0.0634]], devic



SequenceClassifierOutput(loss=tensor(0.8839, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6052,  0.2560],
        [-0.1816,  0.0584],
        [ 0.9210,  0.1115],
        [ 0.8470,  0.1695],
        [ 0.9512, -0.0689],
        [ 0.6517,  0.0858],
        [ 0.9311,  0.2094],
        [ 0.6737,  0.1011],
        [ 0.7310, -0.1573],
        [ 0.8060, -0.0033],
        [ 0.8001,  0.0120],
        [ 0.3018, -0.0560],
        [ 0.7838,  0.1903],
        [ 0.5968, -0.1626],
        [ 0.7351,  0.0056],
        [ 0.5495,  0.0115],
        [ 0.5690, -0.0255],
        [ 0.4864,  0.1328],
        [ 0.7277,  0.2019],
        [ 0.5175,  0.1230],
        [ 0.5134, -0.1681],
        [ 1.1329,  0.3993],
        [ 0.8706,  0.2564],
        [ 0.8894,  0.1608],
        [ 0.8078, -0.1859],
        [ 0.6792,  0.1293],
        [ 0.1181,  0.1197],
        [ 0.6195,  0.2526],
        [ 0.8654,  0.1693],
        [ 0.3607,  0.0096],
        [ 0.7099,  0.0807],
        [ 0.4322,  0.2280]], devic



SequenceClassifierOutput(loss=tensor(0.9124, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9959,  0.0241],
        [ 0.8831, -0.0512],
        [ 0.9899,  0.0997],
        [ 0.6591,  0.0202],
        [ 0.8138,  0.1608],
        [ 0.4531,  0.1875],
        [ 0.1102,  0.2400],
        [ 0.7016,  0.2253],
        [ 0.9105,  0.0098],
        [ 0.6411,  0.0532],
        [ 0.8010, -0.0635],
        [ 0.8293,  0.2114],
        [ 0.8926,  0.0171],
        [ 0.7414,  0.0314],
        [ 0.7664,  0.1326],
        [ 0.5008, -0.0437],
        [ 0.9503,  0.1950],
        [ 0.8697, -0.1388],
        [ 0.7331,  0.2952],
        [ 0.4313,  0.3388],
        [ 0.8643,  0.0910],
        [ 0.6689, -0.0481],
        [ 0.7654, -0.0457],
        [ 0.7896,  0.1005],
        [ 0.7959,  0.0503],
        [ 0.9284,  0.0047],
        [ 0.8666,  0.1644],
        [ 0.9412,  0.0837],
        [ 0.7122,  0.1881],
        [ 0.8329,  0.2377],
        [ 0.7671, -0.0079],
        [ 0.3047,  0.1701]], devic



SequenceClassifierOutput(loss=tensor(0.7725, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 4.8593e-01, -1.2171e-01],
        [ 6.0558e-01, -1.2421e-01],
        [ 9.9449e-01,  1.3980e-01],
        [ 3.5634e-01,  8.5191e-02],
        [ 6.5631e-01, -1.5156e-01],
        [ 6.9280e-01,  2.9282e-01],
        [ 5.1887e-01,  4.0215e-03],
        [ 7.4318e-01,  1.0884e-01],
        [ 6.5842e-01,  6.0759e-02],
        [ 7.7404e-01,  2.9138e-01],
        [ 6.4089e-01,  1.8385e-01],
        [ 8.6076e-01,  2.8918e-01],
        [ 5.5186e-01, -2.4098e-01],
        [ 6.8491e-01, -1.1396e-01],
        [ 4.5696e-01,  2.0664e-01],
        [ 7.1125e-01,  4.1054e-01],
        [ 8.9868e-01,  4.7923e-02],
        [ 5.0533e-01,  3.5782e-02],
        [ 5.1677e-01,  1.6987e-01],
        [ 8.9699e-01,  8.5952e-03],
        [ 5.0033e-01, -5.0210e-03],
        [ 8.9762e-01,  2.6074e-01],
        [ 6.0424e-01, -7.4439e-02],
        [ 8.3287e-01, -2.3243e-02],
        [ 3.4152e-01,  2.2907e-01],
  



SequenceClassifierOutput(loss=tensor(0.8152, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7592,  0.0077],
        [ 0.5636, -0.0299],
        [ 0.5927,  0.0562],
        [ 0.7210,  0.1014],
        [ 0.4998,  0.1281],
        [ 0.6073,  0.0436],
        [ 0.8387,  0.1962],
        [ 0.7453,  0.0065],
        [ 0.5471,  0.0111],
        [ 0.5895,  0.1185],
        [ 0.5834, -0.1092],
        [ 0.9185,  0.3783],
        [ 0.7943,  0.3428],
        [ 0.6078,  0.0441],
        [ 0.4745,  0.1000],
        [ 0.8485,  0.3832],
        [ 0.4043, -0.0209],
        [ 0.3500, -0.0725],
        [ 0.5673,  0.3154],
        [ 0.8272,  0.1185],
        [ 0.6915,  0.0461],
        [ 0.9447,  0.0393],
        [ 0.7710,  0.2677],
        [ 0.7315,  0.0744],
        [ 0.7355,  0.0576],
        [ 0.4813,  0.1265],
        [ 0.5256, -0.2428],
        [ 0.6504, -0.0775],
        [ 0.8394,  0.1790],
        [ 0.6614,  0.1932],
        [ 0.8525,  0.1314],
        [ 0.5539,  0.0235]], devic



SequenceClassifierOutput(loss=tensor(0.8781, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7307,  0.0201],
        [ 0.6001, -0.3164],
        [ 0.8186,  0.3299],
        [ 0.7981,  0.1039],
        [ 0.8707,  0.0424],
        [ 0.4210,  0.1113],
        [ 0.8835,  0.0055],
        [ 0.7022,  0.1369],
        [ 0.6870,  0.3434],
        [ 0.2432,  0.2077],
        [ 0.5204, -0.0848],
        [ 0.9229,  0.3924],
        [ 0.6960, -0.1730],
        [ 0.6109, -0.0808],
        [ 0.6201,  0.0344],
        [ 0.8712,  0.1909],
        [ 0.5532,  0.0992],
        [ 0.6744,  0.1111],
        [ 0.9221,  0.2364],
        [ 1.0020, -0.3108],
        [ 0.5752,  0.0042],
        [ 0.3539, -0.0199],
        [ 0.7491,  0.0343],
        [ 0.8523,  0.1052],
        [ 0.5503, -0.0411],
        [ 0.5172,  0.0574],
        [ 0.8682,  0.0566],
        [ 0.7776,  0.2747],
        [ 0.4920, -0.2421],
        [ 0.8437,  0.3826],
        [ 0.5411, -0.0483],
        [ 0.8392,  0.1196]], devic



SequenceClassifierOutput(loss=tensor(0.9224, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8538,  0.3536],
        [ 0.6441,  0.1323],
        [ 0.5382,  0.2227],
        [ 0.5970,  0.2252],
        [ 0.6995,  0.1221],
        [ 0.6866,  0.1426],
        [ 0.9552, -0.1100],
        [ 0.6112,  0.0974],
        [ 0.7678,  0.4062],
        [ 0.2360,  0.1582],
        [ 0.7410,  0.2054],
        [ 0.7617, -0.0916],
        [ 0.8213,  0.1213],
        [ 1.0966,  0.1316],
        [ 0.1082,  0.2849],
        [ 0.7495, -0.1232],
        [ 0.5722,  0.1603],
        [ 0.7571, -0.0441],
        [ 0.6981, -0.0725],
        [ 0.4587, -0.0687],
        [ 0.4576, -0.0822],
        [ 0.4906,  0.1752],
        [ 1.1325,  0.2204],
        [ 0.7802, -0.0505],
        [ 0.5479,  0.0682],
        [ 0.5840,  0.1117],
        [ 0.8587, -0.0063],
        [ 0.6105, -0.0304],
        [ 0.7348,  0.0928],
        [ 0.8305,  0.3008],
        [ 0.5793,  0.1637],
        [ 1.0230,  0.0964]], devic



SequenceClassifierOutput(loss=tensor(0.9099, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6074,  0.1241],
        [ 0.7113,  0.1452],
        [ 0.3622,  0.1334],
        [ 0.4096,  0.3398],
        [ 0.8274, -0.0616],
        [ 0.5075,  0.1339],
        [ 0.8782,  0.1534],
        [ 0.5803, -0.0800],
        [ 0.7305,  0.0641],
        [ 0.7992,  0.0902],
        [ 0.5799,  0.1263],
        [ 0.4500,  0.4402],
        [ 0.5796,  0.0267],
        [ 0.8461,  0.1993],
        [ 0.6926, -0.0846],
        [ 0.5877,  0.2095],
        [ 0.8724,  0.2426],
        [ 0.8135, -0.0322],
        [ 0.2648,  0.2165],
        [ 0.6750,  0.2563],
        [ 0.9215,  0.2031],
        [ 0.5686,  0.1719],
        [ 0.7159,  0.0221],
        [ 0.9071,  0.1114],
        [ 0.6251,  0.1764],
        [ 0.7763,  0.2526],
        [ 0.8906,  0.2163],
        [ 0.5163, -0.0176],
        [ 0.6630,  0.0536],
        [ 0.5916,  0.0829],
        [ 0.5763,  0.1814],
        [ 0.8754,  0.0993]], devic



SequenceClassifierOutput(loss=tensor(0.9560, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4369,  0.1151],
        [ 0.6591, -0.0231],
        [ 0.4936, -0.0555],
        [ 0.8585, -0.1955],
        [ 0.6960,  0.2000],
        [ 0.7402,  0.0167],
        [ 0.3702, -0.3514],
        [ 0.6854,  0.1387],
        [ 0.8792,  0.1333],
        [ 0.5952, -0.1746],
        [ 0.4953,  0.0961],
        [ 1.2374,  0.0602],
        [ 0.4632, -0.0441],
        [ 0.6931,  0.0895],
        [ 0.6281, -0.0018],
        [ 0.4461,  0.1298],
        [ 0.5768,  0.1452],
        [ 0.6523,  0.1162],
        [ 0.7827, -0.1571],
        [ 0.9590, -0.2485],
        [ 0.7191, -0.0137],
        [ 0.8257,  0.2056],
        [ 0.9226, -0.0834],
        [ 0.7039,  0.1998],
        [ 0.8465,  0.3023],
        [ 0.5923, -0.0795],
        [ 0.7434,  0.0068],
        [ 0.9323, -0.1969],
        [ 0.4844,  0.0678],
        [ 0.8600,  0.0937],
        [ 0.6294,  0.2630],
        [ 0.9536,  0.0619]], devic



SequenceClassifierOutput(loss=tensor(0.9496, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3608,  0.0129],
        [ 0.6937,  0.0701],
        [ 0.7520,  0.1981],
        [ 0.3987,  0.1300],
        [ 0.3444,  0.1677],
        [ 0.8752,  0.1993],
        [ 0.7192, -0.5560],
        [ 0.7340,  0.3411],
        [ 0.6779,  0.2379],
        [ 1.0999,  0.0061],
        [ 0.8595,  0.3814],
        [ 0.5421,  0.0899],
        [ 0.5097, -0.1236],
        [ 0.8405,  0.0773],
        [ 0.4800, -0.0578],
        [ 0.8213,  0.0266],
        [ 0.3522, -0.0341],
        [ 0.0465, -0.0748],
        [ 0.3437, -0.0124],
        [ 0.9037, -0.1093],
        [ 0.4607, -0.0186],
        [ 0.5001,  0.1758],
        [ 0.6009, -0.0806],
        [ 1.0558,  0.0084],
        [ 0.7687, -0.2785],
        [ 0.7748,  0.1457],
        [ 0.9937,  0.2033],
        [ 0.5135,  0.0814],
        [ 0.5626,  0.0334],
        [-0.2489,  0.0732],
        [ 0.6664, -0.0209],
        [ 1.0041, -0.0059]], devic



SequenceClassifierOutput(loss=tensor(0.8109, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 5.0479e-01,  1.5521e-02],
        [ 8.8947e-01, -3.2422e-02],
        [ 5.8626e-01, -7.6267e-02],
        [ 7.7864e-01,  1.2062e-01],
        [ 6.9787e-01,  5.7763e-02],
        [ 5.8390e-01,  1.2058e-01],
        [ 5.0750e-01,  1.2849e-01],
        [ 5.9040e-01,  2.6257e-02],
        [ 9.7084e-01,  3.4711e-02],
        [ 5.7289e-01,  1.9223e-01],
        [ 6.4002e-01,  3.2677e-01],
        [ 9.7977e-01, -1.0686e-01],
        [ 7.3613e-01, -3.9263e-02],
        [ 8.2983e-01, -8.4656e-04],
        [ 6.5438e-01,  1.8400e-01],
        [ 8.3243e-01, -2.4774e-02],
        [ 5.7499e-01,  2.6476e-01],
        [ 6.7166e-01,  3.0294e-02],
        [ 7.3358e-01,  1.6906e-01],
        [ 8.4095e-01,  2.5172e-01],
        [ 6.8120e-01,  9.3918e-02],
        [ 6.6408e-01,  1.3609e-01],
        [ 7.8565e-01, -2.7183e-02],
        [ 6.7563e-01,  2.5419e-01],
        [ 3.7400e-01,  1.7818e-01],
  



SequenceClassifierOutput(loss=tensor(0.9202, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6071, -0.2246],
        [ 0.4554,  0.0588],
        [ 0.6871,  0.0497],
        [ 0.5794, -0.2755],
        [ 0.9332, -0.0641],
        [ 1.1378,  0.0557],
        [ 0.6918,  0.0592],
        [ 1.0826,  0.1337],
        [ 0.8292, -0.0057],
        [ 0.6812,  0.2155],
        [ 0.4515,  0.0452],
        [ 0.5141,  0.1322],
        [ 0.5683, -0.1017],
        [ 0.5783, -0.1445],
        [ 0.7518,  0.1303],
        [ 0.5547,  0.2305],
        [ 0.7014,  0.1598],
        [ 0.6108, -0.0103],
        [ 0.7278,  0.0344],
        [ 0.8883,  0.0926],
        [ 0.6164,  0.0265],
        [ 0.8420,  0.0825],
        [ 0.6604,  0.0038],
        [ 0.7616,  0.1585],
        [ 0.5801,  0.0198],
        [ 0.8235,  0.1315],
        [ 0.7118,  0.2497],
        [ 0.7348,  0.0398],
        [ 0.6430,  0.0425],
        [ 0.7981,  0.0702],
        [ 0.8408,  0.1045],
        [ 0.7105,  0.0351]], devic



SequenceClassifierOutput(loss=tensor(0.7439, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 9.7247e-01, -2.7992e-02],
        [ 6.8343e-01,  1.4520e-01],
        [ 4.8800e-01,  2.4791e-01],
        [ 9.2142e-01,  3.6532e-02],
        [ 7.4188e-01,  2.8301e-02],
        [ 9.4048e-01,  1.8298e-01],
        [ 3.4853e-01,  2.3636e-01],
        [ 8.1687e-01,  6.9084e-02],
        [ 8.8453e-01, -4.6557e-02],
        [ 8.2548e-01, -1.8013e-01],
        [ 6.1245e-01, -4.4421e-02],
        [ 9.4811e-01, -5.1683e-04],
        [ 5.9158e-01,  1.2501e-01],
        [ 7.7510e-01,  3.1443e-01],
        [ 4.0183e-01,  1.9423e-01],
        [ 9.0979e-01,  2.4973e-01],
        [ 7.6953e-01,  2.6811e-03],
        [ 4.3062e-01,  2.5756e-03],
        [ 1.0927e+00,  2.8873e-01],
        [ 7.5524e-01, -6.9609e-02],
        [ 6.6579e-01,  3.1233e-01],
        [ 1.1717e+00,  1.4322e-01],
        [ 5.1316e-01,  3.4667e-02],
        [ 9.8884e-01,  3.7953e-01],
        [ 9.0732e-01, -3.4857e-02],
  



SequenceClassifierOutput(loss=tensor(0.9564, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6914,  0.1474],
        [ 0.7095, -0.1869],
        [ 0.6035, -0.1810],
        [ 0.6907,  0.1991],
        [ 0.6467, -0.1541],
        [ 0.8460,  0.1217],
        [ 0.6728,  0.0063],
        [ 0.9631, -0.0283],
        [ 0.7549,  0.0184],
        [ 0.5634, -0.1090],
        [ 0.7370, -0.0512],
        [ 0.7521,  0.0502],
        [ 0.5977, -0.0571],
        [ 0.6186,  0.2161],
        [ 0.6130,  0.1653],
        [ 0.6427,  0.1156],
        [ 1.0198, -0.0265],
        [ 1.0592,  0.2526],
        [ 0.5155,  0.0822],
        [ 0.7754,  0.0732],
        [ 0.9023,  0.0405],
        [ 0.5806, -0.0724],
        [ 0.7194,  0.0203],
        [ 0.9355,  0.4364],
        [ 0.7760,  0.0521],
        [ 0.8527, -0.1575],
        [ 0.6747,  0.1726],
        [ 0.6919,  0.0985],
        [ 0.9148,  0.1317],
        [ 0.8539, -0.0581],
        [ 0.9266,  0.1444],
        [ 0.7448,  0.1746]], devic



SequenceClassifierOutput(loss=tensor(0.8983, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 7.7473e-01,  1.6250e-01],
        [ 5.9785e-01,  3.4578e-02],
        [ 7.7245e-01,  6.3097e-02],
        [ 9.4134e-01,  1.0594e-01],
        [ 9.1840e-01,  1.3960e-01],
        [ 6.3979e-01, -1.7935e-01],
        [ 6.7922e-01,  6.4047e-02],
        [ 8.1841e-01,  1.0338e-01],
        [ 8.9003e-01,  4.1057e-01],
        [ 9.8216e-01,  3.2142e-01],
        [ 1.0084e+00,  1.9910e-02],
        [ 7.3874e-01, -4.8803e-02],
        [ 4.7799e-01, -9.0591e-02],
        [ 7.4542e-01,  3.0621e-01],
        [ 5.6576e-01,  3.9168e-02],
        [ 7.9072e-01, -4.7861e-03],
        [ 5.1774e-01,  5.1303e-02],
        [ 7.4237e-01,  1.2244e-01],
        [ 4.8870e-01,  1.4777e-01],
        [ 5.4173e-01,  5.7241e-02],
        [ 7.5711e-01,  2.6616e-01],
        [ 4.2568e-01, -8.5048e-02],
        [ 7.5651e-01, -4.4292e-04],
        [ 8.8488e-01,  3.8255e-01],
        [ 6.4109e-01, -2.0033e-01],
  



SequenceClassifierOutput(loss=tensor(0.9826, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4913, -0.0788],
        [ 0.7763,  0.1229],
        [ 0.6966,  0.2805],
        [ 0.7774,  0.2935],
        [ 0.4788, -0.0610],
        [ 0.6606,  0.1626],
        [ 0.5947, -0.0406],
        [ 0.9269,  0.3196],
        [ 0.4412,  0.0218],
        [ 1.3086,  0.1915],
        [ 0.7406, -0.0669],
        [ 0.9172, -0.0215],
        [ 0.7437, -0.0097],
        [ 0.9213,  0.1246],
        [ 0.5704,  0.0819],
        [ 0.4528,  0.0741],
        [ 1.0683,  0.2501],
        [ 0.8693, -0.0078],
        [ 0.6269,  0.0219],
        [ 0.5642, -0.0956],
        [ 0.9746, -0.0343],
        [ 0.8257,  0.2313],
        [ 1.1246,  0.1321],
        [ 0.8463,  0.2522],
        [ 0.6652,  0.0340],
        [ 0.9545, -0.0985],
        [ 0.6585,  0.1966],
        [ 0.5545,  0.2550],
        [ 0.7135,  0.1475],
        [ 0.6191,  0.0547],
        [ 1.0719,  0.2068],
        [ 0.3619, -0.0663]], devic



SequenceClassifierOutput(loss=tensor(0.8381, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6859,  0.1267],
        [ 0.5162, -0.1520],
        [ 0.8282,  0.0452],
        [ 1.1902,  0.2601],
        [ 0.7432,  0.2144],
        [ 0.9481, -0.0762],
        [ 0.9167,  0.0726],
        [ 0.7135, -0.1072],
        [ 0.0491,  0.2124],
        [ 0.6312,  0.0214],
        [ 0.6854,  0.1167],
        [ 0.7303, -0.0646],
        [ 0.8946,  0.0933],
        [ 0.8788, -0.0605],
        [ 0.4057, -0.1383],
        [ 0.5954, -0.2236],
        [ 0.8224, -0.1224],
        [ 0.6109, -0.0385],
        [ 0.7364,  0.3232],
        [ 0.3032, -0.1649],
        [ 1.0576,  0.1255],
        [ 0.6446,  0.3132],
        [ 0.6516,  0.1127],
        [ 0.7250,  0.2758],
        [ 0.5856,  0.0418],
        [ 0.6101, -0.0294],
        [ 0.7755,  0.1558],
        [ 0.7224,  0.0740],
        [ 0.6634,  0.2164],
        [ 0.8939, -0.0724],
        [ 0.8864,  0.1870],
        [ 0.6173,  0.0695]], devic



SequenceClassifierOutput(loss=tensor(0.9194, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6053,  0.0115],
        [ 0.6395,  0.1914],
        [ 0.7046, -0.0449],
        [ 0.7587,  0.1354],
        [ 1.0913,  0.0935],
        [ 0.8097, -0.0048],
        [ 0.6407, -0.0519],
        [ 0.6120, -0.1288],
        [ 0.6322, -0.1145],
        [ 0.6218,  0.2000],
        [ 0.5104,  0.2419],
        [ 0.5551, -0.0191],
        [ 0.7004,  0.0715],
        [ 0.3879, -0.1879],
        [ 0.4395,  0.1924],
        [ 0.7727, -0.0431],
        [ 0.9479, -0.0964],
        [ 0.6883,  0.0149],
        [ 0.7542, -0.1653],
        [ 1.0737,  0.0109],
        [ 0.7488, -0.0355],
        [ 0.7661,  0.0094],
        [ 0.6234,  0.2360],
        [ 0.7013, -0.1374],
        [ 0.5984,  0.2944],
        [ 0.6624,  0.2358],
        [ 0.6344, -0.0551],
        [ 0.6896,  0.0339],
        [ 0.5407,  0.0339],
        [ 0.7214,  0.2341],
        [ 0.6485,  0.3192],
        [ 0.7527, -0.2350]], devic



SequenceClassifierOutput(loss=tensor(0.8725, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8411,  0.2398],
        [ 0.3338, -0.2260],
        [ 0.6696, -0.1278],
        [ 0.6975,  0.1977],
        [ 0.3630,  0.1611],
        [ 0.8471,  0.2897],
        [ 0.7931,  0.1909],
        [ 1.0068, -0.0494],
        [ 0.8754,  0.2296],
        [ 0.3777,  0.0863],
        [ 0.7856,  0.1121],
        [ 0.8662,  0.1105],
        [ 0.3941,  0.1016],
        [ 0.5630,  0.0250],
        [ 0.7418,  0.2932],
        [ 0.8779,  0.2242],
        [ 0.5228,  0.0498],
        [ 0.6028,  0.1816],
        [ 0.4021,  0.0471],
        [ 0.9716,  0.2895],
        [ 0.8218,  0.0073],
        [ 0.9529,  0.2939],
        [ 0.8093,  0.0929],
        [ 0.5400,  0.1552],
        [ 0.8981,  0.0098],
        [ 0.8082,  0.4665],
        [ 0.5623,  0.2475],
        [ 0.7708,  0.2782],
        [ 0.9609,  0.2829],
        [ 0.5955,  0.0980],
        [ 0.5867,  0.0764],
        [ 0.6743,  0.0144]], devic



SequenceClassifierOutput(loss=tensor(0.9771, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4614, -0.1359],
        [ 0.7462,  0.0670],
        [ 0.6825,  0.1275],
        [ 0.7645, -0.1514],
        [ 0.6589,  0.0672],
        [ 0.6231,  0.2183],
        [ 0.4791,  0.0514],
        [ 0.6783,  0.0060],
        [ 0.4550, -0.0059],
        [ 0.6232,  0.1285],
        [ 0.6183, -0.0870],
        [ 0.6835,  0.1391],
        [ 0.9102,  0.2168],
        [ 0.3228,  0.1361],
        [ 0.4519, -0.1388],
        [ 0.6655, -0.1170],
        [ 0.4462,  0.0902],
        [ 0.5859,  0.0612],
        [ 0.8951,  0.1896],
        [ 0.3376, -0.0433],
        [ 0.9635, -0.0146],
        [ 0.7995,  0.1641],
        [ 0.6052,  0.1003],
        [ 0.6643, -0.1316],
        [ 0.9983,  0.1829],
        [ 0.5553,  0.0861],
        [ 0.7559,  0.2063],
        [ 1.0408,  0.1681],
        [ 0.4228, -0.0585],
        [ 0.5994,  0.1295],
        [ 0.7695, -0.1874],
        [ 0.6437,  0.2705]], devic



SequenceClassifierOutput(loss=tensor(0.8358, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9276,  0.0734],
        [ 0.5930,  0.0112],
        [ 0.8717,  0.0618],
        [ 0.8569, -0.0543],
        [ 0.5587,  0.3053],
        [ 0.8049,  0.3285],
        [ 0.9603,  0.3212],
        [ 0.9691,  0.0253],
        [ 0.5331,  0.0313],
        [ 0.9935,  0.1834],
        [ 0.9753, -0.0830],
        [ 0.6675,  0.0619],
        [ 0.6103, -0.1114],
        [ 0.6707,  0.1809],
        [ 0.8016, -0.0166],
        [ 0.4831,  0.0096],
        [ 0.7816,  0.0319],
        [ 0.8633,  0.1876],
        [ 0.6764,  0.0852],
        [ 0.6696,  0.1779],
        [ 0.8321,  0.2108],
        [ 0.6607,  0.0766],
        [ 0.9449,  0.0932],
        [ 0.6124,  0.2738],
        [ 0.7740,  0.0810],
        [ 0.4998,  0.0403],
        [ 0.7087,  0.1611],
        [ 0.8104,  0.2028],
        [ 0.9601,  0.1398],
        [ 0.7397, -0.0702],
        [ 0.8033,  0.1238],
        [ 0.9125, -0.0703]], devic



SequenceClassifierOutput(loss=tensor(0.8285, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 5.8714e-01,  1.3672e-01],
        [ 7.9732e-01,  2.8807e-01],
        [ 6.3204e-01, -7.8381e-03],
        [ 7.5085e-01,  1.1217e-01],
        [ 5.7771e-01,  9.2088e-02],
        [ 8.9547e-01, -1.2815e-04],
        [ 5.7997e-01, -3.8771e-02],
        [ 9.7563e-01,  1.8967e-01],
        [ 4.6666e-01,  2.3669e-01],
        [ 5.7485e-01,  1.6829e-01],
        [ 7.0280e-01,  1.2552e-01],
        [ 1.1467e+00,  1.2929e-01],
        [ 7.4366e-01,  1.3165e-01],
        [ 7.1739e-01, -5.4560e-02],
        [ 3.6204e-01,  1.3326e-01],
        [ 5.0153e-01, -2.0621e-02],
        [ 6.0233e-01,  2.5776e-01],
        [ 7.7371e-01,  2.9435e-02],
        [ 7.8658e-01, -4.2307e-02],
        [ 7.2221e-01,  3.8515e-01],
        [ 4.2836e-01,  9.7076e-02],
        [ 4.9139e-01, -4.0236e-02],
        [ 8.5518e-01, -2.8403e-02],
        [ 9.7087e-01,  2.8129e-01],
        [ 9.4491e-01,  8.5203e-02],
  



SequenceClassifierOutput(loss=tensor(0.9782, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4493,  0.0452],
        [ 0.8260,  0.0238],
        [ 0.6693,  0.1374],
        [ 0.8815, -0.1291],
        [ 0.7081, -0.0178],
        [ 0.6364,  0.0626],
        [ 0.6603,  0.0585],
        [ 0.8071,  0.0770],
        [ 0.6758, -0.2535],
        [ 0.7328,  0.0121],
        [ 0.3819, -0.4155],
        [ 1.0310,  0.3217],
        [ 0.6921,  0.1872],
        [ 0.7284,  0.2329],
        [ 0.7511,  0.1905],
        [ 0.9340,  0.1078],
        [ 0.9235, -0.0429],
        [ 0.4589,  0.4254],
        [ 0.2088, -0.0480],
        [ 0.6849, -0.0828],
        [ 0.3793,  0.0307],
        [ 0.5922,  0.0546],
        [ 0.7187,  0.0354],
        [ 0.5683,  0.0789],
        [ 0.6814,  0.0332],
        [ 0.9459,  0.0783],
        [ 0.3897,  0.1492],
        [ 0.8007,  0.0423],
        [ 0.6977,  0.1190],
        [ 0.6286,  0.2897],
        [ 0.6920,  0.1393],
        [ 0.8020,  0.1530]], devic



SequenceClassifierOutput(loss=tensor(0.7610, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.8714,  0.2707],
        [ 0.7637,  0.1567],
        [ 0.3728,  0.2676],
        [ 0.4561, -0.0440],
        [ 0.8525,  0.2863],
        [ 0.4709,  0.1581],
        [ 0.6846,  0.3699],
        [ 0.4172, -0.0147],
        [ 0.7555, -0.1362],
        [ 0.7110,  0.1257],
        [ 0.5132, -0.1027],
        [ 0.5562, -0.1103],
        [ 0.6498,  0.1172],
        [ 1.0371,  0.2188],
        [ 0.6305,  0.0230],
        [ 0.7387,  0.2313],
        [ 0.5933,  0.1398],
        [ 0.5154,  0.0052],
        [ 1.0236,  0.1743],
        [ 0.5009,  0.1924],
        [ 0.6047,  0.0372],
        [ 0.4682,  0.0582],
        [ 0.7666,  0.0667],
        [ 0.9684, -0.0952],
        [ 1.0265,  0.1609],
        [ 1.0605,  0.1941],
        [ 0.5952,  0.1011],
        [ 0.8453,  0.3963],
        [ 1.0640,  0.1638],
        [ 0.8405,  0.0889],
        [ 1.0257, -0.1011],
        [ 0.8482,  0.1230]], devic



SequenceClassifierOutput(loss=tensor(0.8818, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7469,  0.0612],
        [ 0.7604,  0.2498],
        [ 0.7003,  0.0413],
        [ 0.8678, -0.1376],
        [ 0.8444,  0.1399],
        [ 0.5493,  0.1400],
        [ 0.8159,  0.0950],
        [ 0.9270,  0.0521],
        [ 1.0157,  0.2474],
        [ 0.7967,  0.0570],
        [ 0.7520,  0.0931],
        [ 0.6334,  0.2906],
        [ 0.8777, -0.0664],
        [ 0.4995,  0.2157],
        [ 0.6800,  0.0857],
        [ 1.0914,  0.0325],
        [ 0.5170,  0.0686],
        [ 0.8396,  0.2542],
        [ 0.6920,  0.2367],
        [ 0.6358,  0.1209],
        [ 0.6044,  0.1148],
        [ 0.5727,  0.1883],
        [ 0.9428,  0.0948],
        [ 0.6886,  0.2998],
        [ 0.6932,  0.0492],
        [ 0.6676, -0.0486],
        [ 0.6385,  0.2320],
        [ 0.8999,  0.1951],
        [ 0.9401,  0.0863],
        [ 1.0669,  0.1690],
        [ 0.8923, -0.0728],
        [ 0.2579, -0.1166]], devic



SequenceClassifierOutput(loss=tensor(0.8266, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6462,  0.1514],
        [ 0.7269,  0.0731],
        [ 0.8368,  0.1534],
        [ 0.4711, -0.0602],
        [ 0.6010,  0.0434],
        [ 0.6023,  0.1061],
        [ 0.6772,  0.1113],
        [ 0.9054,  0.0978],
        [ 0.8418,  0.0200],
        [ 0.6782, -0.0078],
        [ 0.3983,  0.2208],
        [ 0.8635,  0.0813],
        [ 0.5781,  0.2091],
        [ 0.6332,  0.1677],
        [ 1.2822,  0.2968],
        [ 0.8330, -0.1373],
        [ 1.0456,  0.0908],
        [ 0.9373,  0.0130],
        [ 1.0765,  0.2114],
        [ 0.9060,  0.0662],
        [ 0.3719,  0.0316],
        [ 0.7888, -0.0756],
        [ 0.7000,  0.0553],
        [ 0.9064,  0.1956],
        [ 0.9625,  0.4064],
        [ 0.7839,  0.1461],
        [ 0.7917,  0.1933],
        [ 0.3898,  0.2746],
        [ 0.7713,  0.1874],
        [ 0.6025, -0.0648],
        [ 0.5764,  0.1186],
        [ 0.8561,  0.2167]], devic



SequenceClassifierOutput(loss=tensor(0.8533, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 8.6878e-01,  1.8284e-01],
        [ 8.7948e-01, -5.4660e-04],
        [ 6.2813e-01,  7.4799e-02],
        [ 8.1248e-01,  4.8459e-02],
        [ 6.1475e-01,  3.2974e-01],
        [ 8.2470e-01,  2.4418e-01],
        [ 7.9102e-01,  3.0857e-01],
        [ 4.9065e-01,  1.0327e-02],
        [ 5.4914e-01,  7.9578e-02],
        [ 5.8098e-01,  1.4102e-01],
        [ 6.8324e-01,  2.1191e-02],
        [ 7.2714e-01, -8.9353e-02],
        [ 6.6713e-01, -1.1375e-01],
        [ 4.7392e-01,  2.8409e-01],
        [ 7.2435e-01,  1.7602e-02],
        [ 5.7597e-01,  9.1109e-02],
        [ 8.5809e-01, -3.1504e-02],
        [ 7.4281e-01,  1.9758e-01],
        [ 6.4917e-01, -6.9639e-02],
        [ 3.9751e-01,  1.0574e-01],
        [ 9.2647e-01,  1.3234e-01],
        [ 5.4507e-01,  1.0048e-01],
        [ 4.1788e-01, -1.1529e-01],
        [ 8.2681e-01, -1.4495e-01],
        [ 6.2975e-01,  6.9409e-02],
  



SequenceClassifierOutput(loss=tensor(0.8731, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7978, -0.0012],
        [ 0.5947,  0.0984],
        [ 0.2086,  0.0857],
        [ 0.8287,  0.0438],
        [ 0.4216,  0.0740],
        [ 0.5359,  0.1255],
        [ 0.6479, -0.0066],
        [ 1.0823,  0.2824],
        [ 0.7090,  0.2230],
        [ 0.6530,  0.2493],
        [ 0.6953,  0.0257],
        [ 0.6354,  0.0494],
        [ 0.4718,  0.0807],
        [ 0.6924,  0.1836],
        [ 0.9652, -0.0989],
        [ 0.7664,  0.1918],
        [ 0.2298,  0.0609],
        [ 0.7243, -0.0141],
        [ 0.8164,  0.1752],
        [ 0.7796,  0.1027],
        [ 0.7639,  0.1246],
        [ 0.4446, -0.2065],
        [ 0.8586,  0.0579],
        [ 0.7297,  0.1381],
        [ 0.7561,  0.0210],
        [ 0.8398,  0.2459],
        [ 0.7968,  0.1436],
        [ 0.3776,  0.0720],
        [ 0.6035, -0.1577],
        [ 0.8835, -0.0306],
        [ 0.4682, -0.0150],
        [ 0.4215, -0.0020]], devic



SequenceClassifierOutput(loss=tensor(0.8579, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4558,  0.0502],
        [ 0.8184,  0.1159],
        [ 0.6485,  0.1553],
        [ 0.6512,  0.0549],
        [ 0.2913,  0.0157],
        [ 0.7461, -0.2071],
        [ 0.9383,  0.1147],
        [ 0.6888,  0.0870],
        [ 0.6326, -0.0326],
        [ 0.7781, -0.0712],
        [ 0.5838,  0.1022],
        [ 0.9876,  0.2664],
        [ 0.6917,  0.0082],
        [ 0.7978,  0.2349],
        [ 1.1667,  0.1694],
        [ 0.7995, -0.0168],
        [ 0.7458, -0.0961],
        [ 0.6681,  0.0691],
        [ 0.9334,  0.2713],
        [ 0.5873, -0.1187],
        [ 0.4332,  0.1461],
        [ 0.7878,  0.1507],
        [ 0.9474,  0.1841],
        [ 0.5043,  0.0127],
        [ 0.5837,  0.0604],
        [ 0.9820,  0.0943],
        [ 0.9139,  0.0092],
        [ 0.7275, -0.1303],
        [ 0.5012,  0.0488],
        [-0.5179,  0.1847],
        [ 0.7767, -0.0419],
        [ 0.1781, -0.0239]], devic



SequenceClassifierOutput(loss=tensor(0.9167, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7176,  0.2092],
        [ 0.9914,  0.2417],
        [ 1.0611,  0.1065],
        [ 0.5132, -0.0386],
        [ 0.7876, -0.1662],
        [ 0.9363, -0.0591],
        [ 1.0133, -0.0445],
        [ 0.9083,  0.1443],
        [ 0.7845, -0.0399],
        [ 0.7995,  0.1858],
        [ 0.6444, -0.0831],
        [ 0.5307,  0.3089],
        [ 0.7819,  0.2542],
        [ 0.7127,  0.0480],
        [ 0.7784,  0.1312],
        [ 0.9024,  0.0125],
        [ 1.1016,  0.1437],
        [ 0.5952,  0.0934],
        [ 0.9124,  0.1869],
        [ 0.5252,  0.1569],
        [ 1.0963,  0.1531],
        [ 0.8542,  0.1373],
        [ 0.6803,  0.3172],
        [ 0.8924,  0.0814],
        [ 0.9623,  0.0033],
        [ 0.8451,  0.0758],
        [ 0.8422,  0.1889],
        [ 1.0054, -0.0381],
        [ 0.5868,  0.1914],
        [ 0.4275, -0.0329],
        [ 0.7498, -0.1479],
        [ 0.5659, -0.0142]], devic



SequenceClassifierOutput(loss=tensor(0.9548, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7040,  0.0336],
        [ 0.8670, -0.1706],
        [ 0.4601,  0.1923],
        [ 0.9517,  0.2973],
        [ 0.8586, -0.1884],
        [ 0.9116,  0.0444],
        [ 0.9421,  0.2547],
        [ 0.9954,  0.0152],
        [ 0.7622,  0.0678],
        [ 0.8304,  0.1897],
        [ 0.9711, -0.1294],
        [ 0.6684, -0.0610],
        [ 0.5358, -0.0295],
        [ 0.6551,  0.1442],
        [ 1.0086,  0.3309],
        [ 0.7942,  0.0231],
        [ 0.7933, -0.1053],
        [ 0.5217,  0.3000],
        [ 0.7398, -0.0528],
        [ 0.8569,  0.2555],
        [ 0.8200,  0.1331],
        [ 1.0187,  0.0526],
        [ 0.6797, -0.0539],
        [ 0.7739,  0.1955],
        [ 0.8915,  0.2490],
        [ 0.7119,  0.1741],
        [ 0.5347,  0.0226],
        [ 1.1341,  0.1156],
        [ 0.8361,  0.0691],
        [ 0.5460,  0.0083],
        [ 0.7144,  0.1462],
        [ 0.8354, -0.0941]], devic



SequenceClassifierOutput(loss=tensor(0.8466, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.5895,  0.0900],
        [ 0.9764,  0.1605],
        [ 0.6198,  0.0611],
        [ 1.0655,  0.3266],
        [ 0.9051,  0.2431],
        [ 0.9756, -0.2429],
        [ 0.9812,  0.0681],
        [ 0.3210,  0.1312],
        [ 0.8697,  0.0420],
        [ 0.6170,  0.0629],
        [ 0.8585, -0.2279],
        [ 0.7082,  0.0401],
        [ 0.8797,  0.0546],
        [ 0.1768,  0.1208],
        [ 1.1794,  0.3713],
        [ 0.9682,  0.3321],
        [ 0.2758,  0.0089],
        [ 0.7756, -0.0040],
        [ 0.9194,  0.0074],
        [ 0.4579, -0.0700],
        [ 0.9735,  0.4202],
        [ 0.5973,  0.3554],
        [ 0.8897,  0.1110],
        [ 0.7978,  0.0073],
        [ 0.8311,  0.0445],
        [ 0.9321,  0.2389],
        [ 0.7766,  0.1285],
        [ 0.5401,  0.2085],
        [ 0.7489, -0.0706],
        [ 0.9260,  0.2697],
        [ 0.7043,  0.2206],
        [ 0.8423, -0.0040]], devic



SequenceClassifierOutput(loss=tensor(0.8942, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6430,  0.2637],
        [ 0.9523,  0.0452],
        [ 0.5560,  0.2626],
        [ 0.4881,  0.0455],
        [ 0.4973, -0.0051],
        [ 0.6854,  0.0158],
        [ 0.5596, -0.0574],
        [ 0.5205,  0.2639],
        [ 0.9921, -0.1117],
        [ 0.7015,  0.0555],
        [ 0.9230,  0.1531],
        [ 0.9189,  0.0960],
        [ 0.8933, -0.0733],
        [ 0.7016, -0.1699],
        [ 0.6890,  0.0636],
        [ 1.1727, -0.0660],
        [ 0.5646, -0.0628],
        [ 0.9719,  0.1991],
        [ 0.7890,  0.2961],
        [ 0.6531, -0.0234],
        [ 0.5522,  0.3504],
        [ 0.7418, -0.2650],
        [ 0.6460,  0.2640],
        [ 0.5989,  0.0470],
        [ 0.7311, -0.1680],
        [ 0.0897, -0.0195],
        [ 0.8414,  0.1787],
        [ 0.8599,  0.0387],
        [ 0.7658, -0.1587],
        [ 0.5305,  0.3270],
        [ 0.8118,  0.1652],
        [ 0.6828, -0.0142]], devic



SequenceClassifierOutput(loss=tensor(0.9133, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.9367,  0.0968],
        [ 1.0969,  0.2840],
        [ 0.7759, -0.0279],
        [ 0.6926,  0.1652],
        [ 0.8799,  0.1868],
        [ 0.9198,  0.2344],
        [ 0.4248,  0.2247],
        [ 0.7822,  0.1992],
        [ 0.6137,  0.0609],
        [ 0.6372, -0.0592],
        [ 0.7962, -0.0074],
        [ 0.8560, -0.0174],
        [ 0.3714,  0.0476],
        [ 0.3727,  0.1440],
        [ 0.5567,  0.2315],
        [ 0.7532,  0.2074],
        [ 0.8844, -0.0648],
        [ 1.0157,  0.3021],
        [ 1.0665,  0.1513],
        [ 0.7946,  0.0021],
        [ 1.1729,  0.2602],
        [ 0.6457,  0.2065],
        [ 0.6014,  0.1396],
        [ 0.7187,  0.1189],
        [ 0.8501,  0.0091],
        [ 0.6332,  0.1199],
        [ 0.5243, -0.0077],
        [ 0.7249,  0.1697],
        [ 0.6885,  0.0745],
        [ 1.1519,  0.0691],
        [ 0.4650, -0.0716],
        [ 0.6884,  0.1071]], devic



SequenceClassifierOutput(loss=tensor(0.7877, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4632, -0.0033],
        [ 0.6045,  0.0804],
        [ 0.6477,  0.1731],
        [ 0.9166,  0.1767],
        [ 0.9454,  0.2087],
        [ 0.5677, -0.0127],
        [ 0.7512, -0.0140],
        [ 0.9155,  0.1893],
        [ 0.7691, -0.2163],
        [ 0.8533, -0.0255],
        [ 1.0010,  0.3049],
        [ 0.8302,  0.1177],
        [ 0.7351, -0.0243],
        [ 0.4403,  0.0529],
        [ 0.6577,  0.0972],
        [ 1.1216,  0.0492],
        [ 0.8687,  0.0030],
        [ 0.9356,  0.1439],
        [ 0.5579, -0.1050],
        [ 0.8072, -0.0245],
        [ 0.8881, -0.1439],
        [ 0.7972,  0.0303],
        [ 0.8009, -0.1050],
        [ 0.3870,  0.0952],
        [ 0.8508,  0.1812],
        [ 0.8519, -0.0397],
        [ 0.4057,  0.2143],
        [ 1.1530, -0.0144],
        [ 0.8464,  0.3563],
        [ 0.7441,  0.0047],
        [ 0.5268, -0.0985],
        [ 0.3959,  0.2937]], devic



SequenceClassifierOutput(loss=tensor(0.8644, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.7321,  0.1110],
        [ 0.3449, -0.2533],
        [ 0.6733,  0.2346],
        [ 0.8205,  0.1627],
        [ 0.5382,  0.2527],
        [ 0.7462, -0.2218],
        [ 0.3237,  0.1147],
        [ 0.8820,  0.1094],
        [ 0.8743,  0.1434],
        [ 0.1228,  0.2041],
        [ 0.8742,  0.0013],
        [ 0.5255,  0.0444],
        [ 0.4294,  0.0520],
        [ 0.6645, -0.0922],
        [ 0.4689,  0.2126],
        [ 1.0452,  0.2805],
        [ 1.1184, -0.0411],
        [ 0.5989, -0.1034],
        [ 0.6647, -0.1488],
        [ 0.8060,  0.2471],
        [ 0.9626,  0.3399],
        [ 0.5143,  0.0645],
        [ 0.5213, -0.1276],
        [ 0.6938,  0.1689],
        [ 0.4961,  0.0583],
        [ 0.7400,  0.0421],
        [ 0.5498, -0.1300],
        [ 0.6627,  0.1586],
        [ 0.9279,  0.0371],
        [ 0.6143,  0.0023],
        [ 0.8853,  0.0061],
        [ 0.6448,  0.0817]], devic



SequenceClassifierOutput(loss=tensor(1.0055, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.6703, -0.1125],
        [ 0.6031,  0.2513],
        [ 0.6551, -0.0130],
        [ 0.9297,  0.2770],
        [ 0.9000,  0.1555],
        [ 0.6589,  0.1622],
        [ 0.6545, -0.1054],
        [ 0.5069,  0.2491],
        [ 1.1330, -0.0496],
        [ 0.4114,  0.1564],
        [ 0.6501, -0.0743],
        [ 0.9854,  0.2197],
        [ 1.0233, -0.0719],
        [ 0.6115,  0.1869],
        [ 0.5444, -0.0238],
        [ 0.9812,  0.0471],
        [ 0.7711,  0.0613],
        [ 0.7830,  0.2704],
        [ 0.8942,  0.1800],
        [ 0.7889, -0.2090],
        [ 0.7862,  0.3013],
        [ 1.0688,  0.0319],
        [ 0.6202,  0.1651],
        [ 0.8969,  0.0722],
        [ 0.9103,  0.0903],
        [ 0.3374,  0.1550],
        [ 0.5771,  0.0327],
        [ 1.0407,  0.2399],
        [ 0.6117,  0.0467],
        [ 0.6275,  0.0802],
        [ 0.5964,  0.0961],
        [ 0.4625,  0.2812]], devic



SequenceClassifierOutput(loss=tensor(0.8113, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.2742, -0.0861],
        [ 0.7952,  0.1754],
        [ 0.5790,  0.0347],
        [ 0.7233,  0.1138],
        [ 0.5932, -0.0757],
        [ 0.9297,  0.4483],
        [ 0.6412, -0.0017],
        [ 0.7459,  0.1648],
        [ 1.0211,  0.1403],
        [ 0.4018,  0.0673],
        [ 0.8412,  0.3081],
        [ 0.1640, -0.1971],
        [ 0.7211,  0.1126],
        [ 0.5360,  0.1680],
        [ 0.6924,  0.1794],
        [ 0.5352, -0.1557],
        [ 0.7021,  0.1991],
        [ 0.9737,  0.3409],
        [ 0.6079,  0.0364],
        [ 0.4994,  0.2987],
        [ 0.4055, -0.0366],
        [ 0.3359,  0.0957],
        [ 0.4537, -0.0596],
        [ 0.5312,  0.1395],
        [ 0.9242,  0.1025],
        [ 0.2828, -0.1101],
        [ 0.8126,  0.1567],
        [ 0.5846, -0.0074],
        [ 0.7382,  0.4529],
        [ 0.4919, -0.1134],
        [ 0.5502, -0.0040],
        [ 0.9292,  0.0609]], devic

step: 210it [00:12, 16.88it/s]
epoch:   0%|                                                                                      | 0/4 [00:12<?, ?it/s]


SequenceClassifierOutput(loss=tensor(0.9246, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4728,  0.1264],
        [ 0.8969,  0.1241],
        [ 0.7220,  0.2474],
        [ 0.7090,  0.1989],
        [ 0.4975, -0.1529],
        [ 0.6892,  0.1214],
        [ 0.5973, -0.1205],
        [ 0.8064, -0.0767],
        [ 0.9888, -0.1634],
        [ 0.9957,  0.4381],
        [ 0.6598, -0.0743],
        [ 0.7961,  0.4322],
        [ 0.6839,  0.0813],
        [ 0.5933,  0.1730],
        [ 0.5757, -0.2757],
        [ 0.7534,  0.0568],
        [ 0.5466, -0.0593],
        [ 0.2677,  0.1505],
        [ 0.8103, -0.0018],
        [ 0.7269,  0.0943],
        [ 0.8505, -0.1025],
        [ 0.9394, -0.0565],
        [ 1.1651,  0.2563],
        [ 0.7906,  0.0263],
        [ 0.6901, -0.1205],
        [ 0.7536,  0.1464],
        [ 0.6808,  0.0594],
        [ 0.9388,  0.2185],
        [ 0.2205,  0.2450],
        [ 0.7671,  0.0276],
        [ 0.6813, -0.0056],
        [ 1.1233,  0.0213]], devic

KeyboardInterrupt: 