# LSTM-arithmetic

## Dataset
- [Arithmetic dataset](https://drive.google.com/file/d/1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-/view?usp=sharing)

In [116]:
# ! pip install seaborn
# ! pip install opencc
# ! pip install -U scikit-learn

import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
import matplotlib.pyplot as plt
import seaborn as sns
import opencc
import os
from sklearn.model_selection import train_test_split

data_path = './data'

In [117]:
df_train = pd.read_csv(os.path.join(data_path, 'arithmetic_train.csv'))
df_eval = pd.read_csv(os.path.join(data_path, 'arithmetic_eval.csv'))
df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [118]:
# transform the input data to string
df_train['tgt'] = df_train['tgt'].apply(lambda x: str(x))
df_train['src'] = df_train['src'].add(df_train['tgt'])
df_train['len'] = df_train['src'].apply(lambda x: len(x))

df_eval['tgt'] = df_eval['tgt'].apply(lambda x: str(x))
df_eval['src'] = df_eval['src'].add(df_eval['tgt'])
df_eval['len'] = df_eval['src'].apply(lambda x: len(x))

# Build Dictionary
 - The model cannot perform calculations directly with plain text.
 - Convert all text (numbers/symbols) into numerical representations.
 - Special tokens
    - '&lt;pad&gt;'
        - Each sentence within a batch may have different lengths.
        - The length is padded with '&lt;pad&gt;' to match the longest sentence in the batch.
    - '&lt;eos&gt;'
        - Specifies the end of the generated sequence.
        - Without '&lt;eos&gt;', the model will not know when to stop generating.

In [119]:
char_to_id = {}
id_to_char = {}

# write your code here
# Build a dictionary and give every token in the train dataset an id
# The dictionary should contain <eos> and <pad>
# char_to_id is to conver charactors to ids, while id_to_char is the opposite
pairs = [(0, '<pad>'), (1, '<eos>'), (2, '0'), (3, '1'), (4, '2'), (5, '3'), (6, '4'), (7, '5'), \
         (8, '6'), (9, '7'), (10, '8'), (11, '9'), (12, '+'), (13, '-'), (14, '*'), (15, '/'), \
         (16, '('), (17, ')'), (18, '=')]

for p in pairs:
    char_to_id[p[1]] = p[0]
    id_to_char[p[0]] = p[1]

vocab_size = len(char_to_id)
print('Vocab size = {}'.format(vocab_size))

Vocab size = 19


# Data Preprocessing
 - The data is processed into the format required for the model's input and output.
 - Example: 1+2-3=0
     - Model input: 1 + 2 - 3 = 0
     - Model output: / / / / / 0 &lt;eos&gt;  (the '/' can be replaced with &lt;pad&gt;)
     - The key for the model's output is that the model does not need to predict the next character of the previous part. What matters is that once the model sees '=', it should start generating the answer, which is '0'. After generating the answer, it should also generate&lt;eos&gt;


In [120]:
# Write your code here
def data_preprocessing(data):
    df_train.head()
    char_id_list = []
    label_id_list = []

    for i in range(len(data)):
        temp1 = []
        temp2 = []
        ans = False
        for c in data.iloc[i, 1]:
            temp1.append(char_to_id[c])
            
            if not ans:
                temp2.append(char_to_id['<pad>'])
            else:
                temp2.append(char_to_id[c])
            
            if c == '=':
                ans = True
        temp1.append(char_to_id['<eos>'])
        temp2.append(char_to_id['<eos>'])
        char_id_list.append(temp1)
        label_id_list.append(temp2)
        
    return (char_id_list, label_id_list)

L = data_preprocessing(df_train)
df_train['char_id_list'] = L[0]
df_train['label_id_list'] = L[1]

M = data_preprocessing(df_eval)
df_eval['char_id_list'] = M[0]
df_eval['label_id_list'] = M[1]

df_train.head()


Unnamed: 0.1,Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,2285313,14*(43+20)=882,882,14,"[3, 6, 14, 16, 6, 5, 12, 4, 2, 17, 18, 10, 10,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 4, 1]"
1,317061,(6+1)*5=35,35,10,"[16, 8, 12, 3, 17, 14, 7, 18, 5, 7, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 1]"
2,718770,13+32+29=74,74,11,"[3, 5, 12, 5, 4, 12, 4, 11, 18, 9, 6, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 6, 1]"
3,170195,31*(3-11)=-248,-248,14,"[5, 3, 14, 16, 5, 13, 3, 3, 17, 18, 13, 4, 6, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 4, 6, 10, 1]"
4,2581417,24*49+1=1177,1177,12,"[4, 6, 14, 6, 11, 12, 3, 18, 3, 3, 9, 9, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 9, 9, 1]"


# Hyper Parameters

|Hyperparameter|Meaning|Value|
|-|-|-|
|`batch_size`|Number of data samples in a single batch|64|
|`epochs`|Total number of epochs to train|10|
|`embed_dim`|Dimension of the word embeddings|256|
|`hidden_dim`|Dimension of the hidden state in each timestep of the LSTM|256|
|`lr`|Learning Rate|0.001|
|`grad_clip`|To prevent gradient explosion in RNNs, restrict the gradient range|1|

In [121]:
batch_size = 64
epochs = 1
embed_dim = 16
hidden_dim = 256
lr = 0.001
grad_clip = 1

# Data Batching
- Use `torch.utils.data.Dataset` to create a data generation tool called  `dataset`.
- The, use `torch.utils.data.DataLoader` to randomly sample from the `dataset` and group the samples into batches.

In [122]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, sequences):
        self.sequences = sequences
    
    def __len__(self):
        # return the amount of data
        return len(self.sequences)
    
    def __getitem__(self, index):
        # Extract the input data x and the ground truth y from the data
        x = self.sequences['char_id_list'][index] # Write your code here
        y = self.sequences['label_id_list'][index] # Write your code here
        return x, y

# collate function, used to build dataloader
def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch]
    batch_y = [torch.tensor(data[1]) for data in batch]
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])
    
    # Pad the input sequence
    pad_batch_x = torch.nn.utils.rnn.pad_sequence(batch_x,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])
    
    pad_batch_y = torch.nn.utils.rnn.pad_sequence(batch_y,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])
    
    return pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens

In [123]:
ds_train = Dataset(df_train[['char_id_list', 'label_id_list']])
ds_eval = Dataset(df_eval[['char_id_list', 'label_id_list']])
ds_train[0]

([3, 6, 14, 16, 6, 5, 12, 4, 2, 17, 18, 10, 10, 4, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 4, 1])

In [124]:
# Build dataloader of train set and eval set, collate_fn is the collate function
dl_train = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dl_eval = torch.utils.data.DataLoader(ds_eval, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Model Design

## Execution Flow
1. Convert all characters in the sentence into embeddings.
2. Pass the embeddings through an LSTM sequentially.
3. The output of the LSTM is passed into another LSTM, and additional layers can be added.
4. The output from all time steps of the final LSTM is passed through a Fully Connected layer.
5. The character corresponding to the maximum value across all output dimensions is selected as the next character.

## Loss Function
Since this is a classification task, Cross Entropy is used as the loss function.

## Gradient Update
Adam algorithm is used for gradient updates.

In [125]:
class CharRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()
        
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])
        
        self.rnn_layer1 = torch.nn.LSTM(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        
        self.rnn_layer2 = torch.nn.LSTM(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        
        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))
        
    def forward(self, batch_x, batch_x_lens):
        return self.encoder(batch_x, batch_x_lens)
    
    # The forward pass of the model
    def encoder(self, batch_x, batch_x_lens):
        batch_x = self.embedding(batch_x)
        
        batch_x = torch.nn.utils.rnn.pack_padded_sequence(batch_x,
                                                          batch_x_lens,
                                                          batch_first=True,
                                                          enforce_sorted=False)
        
        batch_x, _ = self.rnn_layer1(batch_x)
        batch_x, _ = self.rnn_layer2(batch_x)
        
        batch_x, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_x,
                                                            batch_first=True)
        
        batch_x = self.linear(batch_x)
        
        return batch_x
    
    def generator(self, start_char, max_len=200):
        
        char_list = [char_to_id[c] for c in start_char]
        
        next_char = None
        
        while len(char_list) < max_len: 
            # Write your code here 
            # Pack the char_list to tensor
            x = torch.LongTensor(char_list).unsqueeze(0)
            # Input the tensor to the embedding layer, LSTM layers, linear respectively
            x = self.embedding(x)
            # print(f'x = {x}')
            _, (ht, _) = self.rnn_layer1(x)
            _, (ht, _) = self.rnn_layer2(ht)
            y = self.linear(ht) # Obtain the next token prediction y
            # print(f'y = {y}')
            
            next_char = torch.argmax(y, num=-1) # Use argmax function to get the next token prediction
            
            if next_char == char_to_id['<eos>']:
                break
            
            char_list.append(next_char)
            
        return [id_to_char[ch_id] for ch_id in char_list]

In [126]:
torch.manual_seed(2)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")


model = CharRNN(vocab_size,
                embed_dim,
                hidden_dim)

In [127]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_id['<pad>'], reduction='mean') # Write your code here. Cross-entropy loss function. The loss function should ignore <pad>
optimizer = optimizer = torch.optim.AdamW(model.parameters(), lr=lr) # Write your code here. Use Adam or AdamW for Optimizer

# Training
1. The outer `for` loop controls the `epoch`
    1. The inner `for` loop uses `data_loader` to retrieve batches.
        1. Pass the batch to the `model` for training.
        2. Compare the predicted results `batch_pred_y` with the true labels `batch_y` using Cross Entropy to calculate the loss `loss`
        3. Use `loss.backward` to automatically compute the gradients.
        4. Use `torch.nn.utils.clip_grad_value_` to limit the gradient values between `-grad_clip` &lt; and &lt; `grad_clip`.
        5. Use `optimizer.step()` to update the model (backpropagation).
2.  After every `1000` batches, output the current loss to monitor whether it is converging.

In [128]:
from tqdm import tqdm
from copy import deepcopy
model = model.to(device)
model.train()
i = 0
flag = False
for epoch in range(1, epochs+1):
    # The process bar
    bar = tqdm(dl_train, desc=f"Train epoch {epoch}")
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        if not flag:
            flag = True
            print(batch_x)
            print(batch_y)
            print(batch_x_lens)
            print(batch_y_lens)
        # Write your code here
        # Clear the gradient
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        
        # Write your code here
        # Input the prediction and ground truths to loss function
        # print(f'batch_pred_y = {batch_pred_y}')
        # print(f'batch_y = {batch_y}')
        loss = criterion(batch_pred_y.view(-1, vocab_size), batch_y.view(-1).to(device))
        # Back propagatio.
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip) # gradient clipping

        # Write your code here
        # Optimize parameters in the model
        optimizer.step()
        i+=1
        if i%50==0:
            bar.set_postfix(loss = loss.item())
    
    # Evaluate your model
    bar = tqdm(dl_eval, desc=f"Validation epoch {epoch}")
    matched = 0
    total = 0
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        
        predictions = model(batch_x.to(device), batch_x_lens) # Write your code here. Input the batch_x to the model and generate the predictions
        
        predicted_tokens = torch.argmax(predictions, dim=-1) # (batch_size, seq_len)
        # Write your code here.
        # Check whether the prediction match the ground truths
        print(f'prdicted_tokens = {predicted_tokens}')
        print(f'batch_y = {batch_y}')
        print(f'batch_y_lens = {batch_y_lens}')
        # for pred, truth, length in zip(predicted_tokens, batch_y, batch_y_lens):
        #     pred, truth = pred[:length].cpu(), truth[:length].cpu()  # Trim to sequence length
        #     if torch.equal(pred, truth):  # Exact match comparison
        #         matched += 1
        #     total += 1
        # Compute exact match (EM) on the eval dataset
        # EM = correct/total

        
    # print(matched/total)

Train epoch 1:   0%|          | 24/37020 [00:00<04:59, 123.39it/s]

tensor([[ 4, 10, 14,  ...,  0,  0,  0],
        [ 3, 10, 14,  ...,  0,  0,  0],
        [ 8, 13,  4,  ...,  0,  0,  0],
        ...,
        [ 3, 11, 12,  ...,  0,  0,  0],
        [ 6,  8, 13,  ...,  0,  0,  0],
        [ 6,  7, 13,  ...,  2,  1,  0]])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 2, 1, 0]])
tensor([13, 13, 13, 14, 10, 14, 13, 17, 14, 12, 13, 13, 13, 13, 13, 16, 11, 16,
        13, 12, 15, 11, 13, 12, 15, 13, 12, 14, 15, 13, 12, 14, 16, 13, 10, 12,
        16, 12, 15, 14, 15, 14, 14, 12, 13, 14, 13, 12, 13, 13, 14, 14, 14, 15,
        12, 12, 13, 12, 14, 12, 12, 12, 12, 16])
tensor([13, 13, 13, 14, 10, 14, 13, 17, 14, 12, 13, 13, 13, 13, 13, 16, 11, 16,
        13, 12, 15, 11, 13, 12, 15, 13, 12, 14, 15, 13, 12, 14, 16, 13, 10, 12,
        16, 12, 15, 14, 15, 14, 14, 12, 13, 14, 13, 12, 13, 13, 14, 14, 14, 15,

Train epoch 1: 100%|██████████| 37020/37020 [03:41<00:00, 166.79it/s, loss=6.9e-9]  
Validation epoch 1:   1%|          | 51/4114 [00:00<00:16, 251.20it/s]

prdicted_tokens = tensor([[ 8,  2, 10,  ..., 11, 11, 11],
        [ 9,  8,  9,  ...,  8,  1, 11],
        [ 4,  2,  5,  ..., 11, 11, 11],
        ...,
        [ 8,  4,  7,  ...,  6,  1, 11],
        [ 3, 11, 13,  ...,  1, 11, 11],
        [ 3,  2,  6,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 8, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 6, 1, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([14, 16, 12, 16, 15, 11, 12, 11, 12, 16, 11, 12, 15, 14, 14, 13, 15, 13,
        14, 17, 12, 13, 15, 12, 14, 16, 16, 16, 13, 13, 14, 15, 12, 13, 16, 14,
        16, 13, 15, 12, 16, 13, 14, 13, 15, 12, 13, 13, 13, 15, 15, 15, 12, 11,
        15, 11, 15, 14, 15, 15, 12, 16, 15, 13])
prdicted_tokens = tensor([[ 8,  3,  9,  ...,  2,  2,  1],
        [ 3,  9,  6,  ..., 11, 11, 11],
        [ 5,  2, 13,  ...,  6,  1, 11],
        ...,
        [ 8,  3,  5,  ..., 1

Validation epoch 1:   3%|▎         | 103/4114 [00:00<00:16, 250.51it/s]

prdicted_tokens = tensor([[ 4,  7,  5,  ..., 11, 11, 11],
        [ 9,  7,  9,  ..., 11, 11, 11],
        [ 3,  5, 11,  ..., 11, 11, 11],
        ...,
        [ 4, 11,  5,  ..., 11, 11, 11],
        [ 3,  6,  5,  ..., 11, 11, 11],
        [ 8,  4,  9,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 12, 14, 12, 13, 14, 15, 13, 11, 14, 13, 13, 14, 15, 15, 13, 15, 12,
        14, 14, 14, 14, 14, 14, 13, 11, 11, 15, 12, 14, 16, 17, 16, 16, 16, 13,
        14, 16, 13, 14, 12, 14, 14, 15, 12, 15, 11, 12, 12, 14, 14, 12, 13, 14,
        13, 12, 15, 12, 11, 10, 16, 12, 11, 14])
prdicted_tokens = tensor([[ 3,  6, 13,  ..., 11, 11, 11],
        [ 9,  3,  5,  ..., 11, 11, 11],
        [ 8,  4,  2,  ...,  1, 11, 11],
        ...,
        [ 8,  4,  3,  ...,  

Validation epoch 1:   3%|▎         | 129/4114 [00:00<00:15, 250.28it/s]

prdicted_tokens = tensor([[ 5,  7,  5,  ..., 11, 11, 11],
        [ 4,  5, 13,  ..., 11, 11, 11],
        [ 3,  5,  5,  ..., 11, 11, 11],
        ...,
        [ 7,  9,  8,  ...,  1, 11, 11],
        [ 4,  3,  6,  ..., 11, 11, 11],
        [ 9,  2, 10,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 12, 12, 14, 15, 13, 13, 12, 14, 15, 16, 15, 13, 14, 15, 14, 13, 10,
        12, 13, 12, 12, 14, 14, 11, 14, 13, 15, 12, 16, 11, 16, 14, 15, 13, 13,
        12, 13, 14, 14, 15, 14, 12, 15, 13, 13, 13, 16, 13, 16, 13, 14, 16, 15,
        12, 15,  9, 13, 14, 15, 15, 14, 13, 12])
prdicted_tokens = tensor([[ 8,  5,  5,  ...,  1, 11, 11],
        [ 3,  7,  6,  ...,  1, 11, 11],
        [ 5, 11,  9,  ..., 11, 11, 11],
        ...,
        [ 3,  7, 13,  ..., 1

Validation epoch 1:   4%|▍         | 181/4114 [00:00<00:15, 250.10it/s]

prdicted_tokens = tensor([[ 9,  9,  9,  ..., 11, 11, 11],
        [ 5,  8,  9,  ...,  7,  8,  1],
        [ 5,  5,  6,  ...,  8,  1, 11],
        ...,
        [ 8,  7,  9,  ..., 11, 11, 11],
        [11, 13,  7,  ...,  6,  6,  1],
        [ 8,  3,  4,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 7, 8, 1],
        [0, 0, 0,  ..., 8, 1, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 6, 6, 1],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 16, 15, 10, 16, 15, 12, 12, 15, 10, 13, 10, 15, 12, 13, 11, 15, 11,
        13, 13, 13, 14, 13, 16, 16, 13, 14, 14, 13, 13, 11, 14, 13, 14, 13, 16,
        13, 14, 13, 16, 13, 14, 12, 12, 13, 12, 13, 14, 14, 11, 16, 11, 14, 14,
        13, 12, 11, 14, 12, 13, 15, 13, 16, 13])
prdicted_tokens = tensor([[ 9,  9,  5,  ..., 11, 11, 11],
        [ 4,  2, 13,  ..., 11, 11, 11],
        [ 8,  9,  8,  ..., 11, 11, 11],
        ...,
        [ 9, 11,  5,  ...,  

Validation epoch 1:   6%|▌         | 233/4114 [00:00<00:15, 250.69it/s]

prdicted_tokens = tensor([[ 8,  2,  5,  ..., 11, 11, 11],
        [ 8,  3,  5,  ...,  1, 11, 11],
        [ 9,  7,  5,  ...,  1, 11, 11],
        ...,
        [ 3,  6, 13,  ..., 11, 11, 11],
        [ 3,  8,  5,  ...,  1, 11, 11],
        [ 8, 11, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 14, 14, 14, 14, 16, 13, 12, 13, 12, 16, 13, 10, 13, 15, 11, 15, 14,
        16, 13, 15, 16, 14, 12, 13, 12, 16, 14, 15,  9, 13, 14, 10, 12, 13, 12,
        12, 11, 15, 15, 12, 14, 13, 14, 12, 13, 13, 13, 16, 12, 16, 13, 13, 14,
        13, 11, 14, 13, 11, 13, 15, 12, 14, 13])
prdicted_tokens = tensor([[ 8,  9,  7,  ..., 11, 11, 11],
        [ 5,  5, 11,  ..., 11, 11, 11],
        [ 8,  2, 10,  ..., 11, 11, 11],
        ...,
        [ 9,  2, 13,  ..., 1

Validation epoch 1:   7%|▋         | 285/4114 [00:01<00:15, 252.43it/s]

batch_y_lens = tensor([13, 13, 13, 13, 14, 13, 14, 15, 14, 13, 16, 11, 11, 16, 16, 11, 16, 10,
        16, 15, 13, 13, 12, 13, 12, 10, 15, 13, 13, 15, 13, 15, 13, 13, 13, 15,
        11, 13, 13, 15, 14, 16, 15,  9, 14, 15, 14, 15, 15, 14, 14,  9, 10, 13,
        15, 14, 15, 16, 15, 13, 17, 14, 15, 11])
prdicted_tokens = tensor([[ 9,  9,  9,  ..., 11, 11, 11],
        [ 3,  2,  5,  ..., 11, 11, 11],
        [ 9,  9, 13,  ...,  2,  3,  1],
        ...,
        [ 8,  9,  8,  ...,  1, 11, 11],
        [ 7,  9,  3,  ..., 11, 11, 11],
        [11,  5, 11,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 2, 3, 1],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 13, 16, 13, 13, 14, 13, 15, 12, 11, 14, 14, 13, 13, 14, 15, 13, 12,
        14, 10, 13, 16, 14, 15, 14, 16, 11, 13, 13, 12, 14, 14, 11, 14, 11, 14,
    

Validation epoch 1:   9%|▉         | 364/4114 [00:01<00:14, 254.22it/s]

prdicted_tokens = tensor([[ 8,  5, 10,  ...,  2,  1, 11],
        [ 9,  2,  5,  ..., 11, 11, 11],
        [ 8,  3,  3,  ...,  1, 11, 11],
        ...,
        [ 8, 10,  9,  ...,  1, 11, 11],
        [ 8,  9,  7,  ...,  1, 11, 11],
        [ 4,  2, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 2, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([15, 13, 14, 11, 14, 11, 13, 11, 13, 16, 15, 13, 15, 14, 16, 14, 15, 14,
        10, 12, 11, 14, 12, 13, 14, 14, 15, 14, 13, 12, 12, 12, 13, 15, 12, 13,
        16, 12,  9, 12, 13, 15, 13, 16, 15, 13, 11, 13, 14, 16, 14, 15, 15, 10,
        16, 14, 14, 14, 15, 14, 12, 14, 14, 12])
prdicted_tokens = tensor([[ 3,  7,  5,  ..., 11, 11, 11],
        [ 5,  4, 13,  ...,  1, 11, 11],
        [ 4,  6,  9,  ..., 11, 11, 11],
        ...,
        [ 9, 13,  5,  ..., 1

Validation epoch 1:  10%|█         | 416/4114 [00:01<00:14, 253.24it/s]

prdicted_tokens = tensor([[ 9,  9,  5,  ..., 11, 11, 11],
        [ 8,  9,  7,  ..., 11, 11, 11],
        [ 8,  9,  2,  ...,  3,  1, 11],
        ...,
        [ 3,  8,  6,  ..., 11, 11, 11],
        [ 5,  4, 13,  ..., 11, 11, 11],
        [ 4, 10,  6,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 3, 1, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 14, 16, 15, 14, 17, 15, 12, 13, 11, 12, 15, 11, 13, 16, 12, 14, 14,
        16, 12, 14, 15, 13, 13, 13, 14, 16, 16, 13, 13, 12, 13, 13, 16, 12, 12,
        16, 12, 13, 16, 14, 14, 16, 14, 13, 13, 16, 13, 15, 13, 15, 13, 15, 12,
        15, 11, 14, 11, 14, 15, 12, 11, 12, 12])
prdicted_tokens = tensor([[ 4,  4, 13,  ...,  1, 11, 11],
        [ 4,  5, 13,  ..., 11, 11, 11],
        [ 9,  9,  9,  ..., 11, 11, 11],
        ...,
        [ 8,  3,  5,  ..., 1

Validation epoch 1:  11%|█         | 442/4114 [00:01<00:14, 248.95it/s]

prdicted_tokens = tensor([[ 8,  4,  5,  ...,  1, 11, 11],
        [ 5,  6,  5,  ..., 11, 11, 11],
        [ 5,  6,  9,  ..., 11, 11, 11],
        ...,
        [ 5,  4, 13,  ..., 11, 11, 11],
        [ 8,  5, 11,  ...,  1, 11, 11],
        [ 8,  4, 10,  ...,  2,  1, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 2, 1, 0]])
batch_y_lens = tensor([15, 13, 11, 14, 16, 14, 13, 15, 10, 12, 12, 14, 15, 16, 17, 13, 15, 14,
        11, 13, 15, 16, 13, 13, 13, 16, 14, 13, 14, 14, 15, 12, 15, 10, 15, 15,
        16, 15, 15, 11, 15, 11, 13, 14, 14, 11, 16, 11, 13, 14, 14, 15, 12, 12,
        15, 15, 14, 15, 14, 15, 15, 14, 15, 16])
prdicted_tokens = tensor([[ 4,  5, 13,  ..., 11, 11, 11],
        [ 5,  5, 13,  ..., 11, 11, 11],
        [ 8,  9, 11,  ..., 11, 11, 11],
        ...,
        [ 3,  4,  6,  ..., 1

Validation epoch 1:  12%|█▏        | 494/4114 [00:01<00:14, 247.41it/s]

prdicted_tokens = tensor([[ 8,  3,  8,  ...,  1, 11, 11],
        [ 8,  3, 10,  ..., 11, 11, 11],
        [ 5,  4, 13,  ..., 11, 11, 11],
        ...,
        [ 2, 13,  4,  ..., 11, 11, 11],
        [ 8,  3,  7,  ...,  1, 11, 11],
        [ 9,  3,  6,  ...,  5,  2,  1]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 5, 2, 1]])
batch_y_lens = tensor([15, 14, 12, 12, 16, 12, 13, 13, 14, 15, 15, 10, 12, 15, 12, 15, 16, 15,
        13, 11, 12, 15, 11, 14, 14, 12, 15, 12, 12, 14, 14, 15, 12, 15, 11, 14,
        15, 12, 16, 14, 12, 13, 15, 16, 15, 13, 14, 11, 15, 13, 11, 12, 15, 12,
        16, 14, 16, 16, 16, 15, 15, 12, 15, 17])
prdicted_tokens = tensor([[ 4,  3,  6,  ..., 11, 11, 11],
        [ 8,  4,  6,  ...,  1, 11, 11],
        [ 5,  5, 13,  ...,  9,  1, 11],
        ...,
        [10,  9,  9,  ..., 1

Validation epoch 1:  14%|█▍        | 573/4114 [00:02<00:13, 254.14it/s]

prdicted_tokens = tensor([[ 4,  2,  5,  ..., 11, 11, 11],
        [10,  5,  6,  ..., 11, 11, 11],
        [ 3,  6, 11,  ..., 11, 11, 11],
        ...,
        [ 3,  7,  5,  ..., 11, 11, 11],
        [ 9,  9,  5,  ..., 11, 11, 11],
        [ 9,  5,  8,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([14, 11, 12, 11, 14, 14, 11, 13, 13, 12, 12, 17, 15, 16, 15, 12, 14, 15,
        14, 14, 14, 15, 13, 15, 10, 12, 13, 14, 13, 10, 15, 13, 13, 13, 15, 13,
        15, 12, 10, 15, 13, 14, 12, 13, 15, 13, 12, 15, 12, 17, 15, 15, 12, 13,
        11, 12, 16, 14, 13, 14, 13, 12, 12, 13])
prdicted_tokens = tensor([[ 8,  5,  7,  ..., 11, 11, 11],
        [ 8,  4,  4,  ..., 11, 11, 11],
        [ 2, 10,  8,  ..., 11, 11, 11],
        ...,
        [ 9, 11, 13,  ..., 1

Validation epoch 1:  15%|█▍        | 599/4114 [00:02<00:13, 253.95it/s]

prdicted_tokens = tensor([[ 8,  4,  6,  ...,  1, 11, 11],
        [ 3,  3,  5,  ..., 11, 11, 11],
        [ 4, 10,  5,  ..., 11, 11, 11],
        ...,
        [ 3,  5,  5,  ..., 11, 11, 11],
        [ 4,  6,  5,  ...,  9,  1, 11],
        [ 8,  3,  7,  ...,  5,  1, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 9, 1, 0],
        [0, 0, 0,  ..., 5, 1, 0]])
batch_y_lens = tensor([14, 10, 12, 12, 12, 13, 13, 13, 12, 15, 16, 14, 16, 16, 12, 15, 14, 14,
        14, 13, 12, 14, 14, 11, 15, 14, 16, 13, 12, 15, 12, 13, 14, 16, 11, 15,
        13, 15, 15, 11, 13, 15, 13, 12, 15, 14, 11, 16, 15, 14, 16, 14, 15, 10,
        12, 14, 13, 16, 16, 12, 13, 12, 15, 15])
prdicted_tokens = tensor([[ 8,  4,  5,  ..., 11, 11, 11],
        [ 8,  9, 13,  ..., 11, 11, 11],
        [ 8,  9,  3,  ..., 11, 11, 11],
        ...,
        [ 9,  9,  9,  ...,  

Validation epoch 1:  16%|█▌        | 651/4114 [00:02<00:13, 249.97it/s]

prdicted_tokens = tensor([[ 5,  8,  5,  ..., 11, 11, 11],
        [ 5,  6,  5,  ..., 11, 11, 11],
        [ 4,  6,  9,  ...,  1, 11, 11],
        ...,
        [ 5, 13,  3,  ..., 11, 11, 11],
        [ 4,  6,  5,  ..., 11, 11, 11],
        [ 4, 10, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 12, 15, 14, 16, 14, 14, 13, 12, 10, 13, 12, 16, 17, 14, 12, 13, 11,
        11, 10, 14, 15, 15, 12, 14, 14, 12, 14, 11, 13, 12, 10, 12, 14, 14, 15,
        15, 15, 15, 15, 13, 14, 12, 12, 13, 12, 13, 13, 15, 14, 16, 12, 12, 12,
        14, 12, 11, 11, 12, 12, 16, 11, 13, 12])
prdicted_tokens = tensor([[ 4,  3,  7,  ...,  1, 11, 11],
        [ 8,  5,  9,  ...,  7,  1, 11],
        [ 9,  9,  5,  ..., 11, 11, 11],
        ...,
        [ 8,  3,  5,  ..., 1

Validation epoch 1:  17%|█▋        | 702/4114 [00:02<00:13, 246.37it/s]

prdicted_tokens = tensor([[ 3,  6, 13,  ..., 11, 11, 11],
        [10, 13,  8,  ...,  6,  1, 11],
        [ 3, 11, 13,  ..., 11, 11, 11],
        ...,
        [ 5, 11,  5,  ..., 11, 11, 11],
        [ 5,  9, 13,  ..., 10,  5,  1],
        [ 9,  4,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  6,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ..., 10,  5,  1],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([12, 16, 14, 12, 14, 14, 12, 14, 15, 11, 14, 16, 14, 15, 13, 13, 13, 14,
        13, 14, 11,  9, 14, 12, 12, 16, 13, 14, 12, 16, 12, 16, 14, 15, 14, 12,
        14, 15, 12, 16, 10, 10, 11, 13, 14, 12, 13, 13, 12, 12, 14, 13, 14, 16,
        15, 14, 16, 14, 13, 12, 15, 13, 17, 10])
prdicted_tokens = tensor([[ 8,  4,  8,  ...,  1, 11, 11],
        [ 9, 10,  9,  ..., 11, 11, 11],
        [ 9,  7,  9,  ...,  1, 11, 11],
     

Validation epoch 1:  18%|█▊        | 754/4114 [00:03<00:13, 246.59it/s]

prdicted_tokens = tensor([[ 9, 11,  5,  ..., 11, 11, 11],
        [ 9,  5,  5,  ..., 11, 11, 11],
        [ 9,  5,  5,  ...,  1, 11, 11],
        ...,
        [ 7,  5, 11,  ..., 11, 11, 11],
        [ 4,  5,  5,  ..., 11, 11, 11],
        [ 3,  6,  6,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([13, 12, 15, 14, 14, 12, 16, 16, 15, 13, 14, 14, 15, 13, 16, 15, 14, 14,
        12, 11, 14, 11, 12, 13, 14, 17,  9, 13, 14, 12, 13, 15, 12, 12, 14, 12,
        17, 11, 13, 15, 14, 16, 13, 10, 12, 10, 14,  9, 16, 11, 14,  9, 15, 13,
        13, 12, 14, 12, 17, 12, 12, 12, 12, 15])
prdicted_tokens = tensor([[ 9,  9, 13,  ..., 11, 11, 11],
        [ 8,  4,  7,  ...,  1, 11, 11],
        [ 4,  9,  9,  ..., 11, 11, 11],
        ...,
        [ 4,  6,  9,  ...,  

Validation epoch 1:  20%|██        | 830/4114 [00:03<00:13, 250.06it/s]

prdicted_tokens = tensor([[ 4,  3, 13,  ...,  1, 11, 11],
        [ 5,  8,  5,  ..., 11, 11, 11],
        [ 8,  9,  9,  ..., 11, 11, 11],
        ...,
        [ 5,  3,  5,  ..., 11, 11, 11],
        [ 8,  4,  2,  ..., 11, 11, 11],
        [ 8,  9,  2,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([15, 11, 14, 12, 13, 11, 13, 13, 10, 16, 12, 11, 14, 13, 13, 12, 17, 15,
        14, 13, 15, 11, 15, 12, 14, 12, 16, 13, 14, 13, 13, 14, 12, 14, 13, 13,
        11, 13, 14, 16, 12, 13,  9, 12, 14, 11, 12, 13, 12, 14, 16, 14, 14, 10,
        11, 13, 11, 12, 12, 15, 16, 11, 13, 15])
prdicted_tokens = tensor([[ 8,  9, 13,  ...,  8,  1, 11],
        [ 8,  4,  8,  ..., 11, 11, 11],
        [ 4,  8,  6,  ...,  2,  2,  1],
        ...,
        [ 5,  6,  5,  ..., 1

Validation epoch 1:  21%|██▏       | 882/4114 [00:03<00:13, 247.74it/s]

prdicted_tokens = tensor([[ 4,  6,  5,  ..., 11, 11, 11],
        [ 5,  5, 13,  ..., 11, 11, 11],
        [ 8,  9,  9,  ..., 11, 11, 11],
        ...,
        [ 3,  6, 13,  ...,  1, 11, 11],
        [ 4,  5,  5,  ..., 11, 11, 11],
        [ 8,  5, 10,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 14, 13, 13, 11, 11, 11, 12, 12, 12, 12, 16, 15, 15, 15, 13, 11, 12,
        13, 13, 11, 13, 12, 13, 15, 14, 12, 14, 12, 11, 15, 11, 10, 12,  8, 13,
        14, 15, 16, 12, 12, 13, 16, 12, 15, 17, 12, 16, 16, 15, 11, 11, 13,  8,
        14, 12, 15, 12, 16, 13, 13, 15, 13, 13])
prdicted_tokens = tensor([[ 7,  9,  9,  ..., 11, 11, 11],
        [ 9,  4,  3,  ..., 11, 11, 11],
        [ 9, 10,  9,  ..., 11, 11, 11],
        ...,
        [ 3,  2,  6,  ...,  

Validation epoch 1:  22%|██▏       | 907/4114 [00:03<00:12, 247.64it/s]

prdicted_tokens = tensor([[ 8,  8,  9,  ..., 11, 11, 11],
        [ 8,  7, 13,  ..., 11, 11, 11],
        [ 4,  8,  6,  ...,  1, 11, 11],
        ...,
        [ 8,  9,  5,  ..., 11, 11, 11],
        [ 4,  9, 13,  ..., 11, 11, 11],
        [ 4,  1,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 12, 14, 14, 15, 14, 13, 14, 16, 13, 14, 14, 11,  9, 13, 14, 15, 12,
        11, 12, 16, 12, 16, 11, 14, 13, 13, 13, 15, 13, 13, 10, 11, 12, 13, 13,
        10, 15, 14, 16, 13, 13, 11, 15, 12, 14, 13, 13, 13, 13, 14, 14, 13, 14,
        10, 12, 16, 13, 14, 16,  9, 13, 13, 10])
prdicted_tokens = tensor([[ 8,  4,  5,  ...,  6,  1, 11],
        [ 8, 13,  4,  ..., 11, 11, 11],
        [ 9,  5,  5,  ...,  1, 11, 11],
        ...,
        [ 5, 10,  9,  ...,  

Validation epoch 1:  23%|██▎       | 958/4114 [00:03<00:12, 248.35it/s]

prdicted_tokens = tensor([[ 3,  9,  6,  ..., 11, 11, 11],
        [ 8,  5,  8,  ...,  1, 11, 11],
        [ 8,  4,  9,  ..., 11, 11, 11],
        ...,
        [ 9,  5, 13,  ...,  1, 11, 11],
        [ 8,  5,  6,  ..., 11,  1, 11],
        [ 4,  6,  9,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ..., 11,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([10, 15, 14, 15, 13, 14, 14, 17, 16, 12, 16, 12, 13, 14, 16, 14, 14, 14,
        13, 10, 14, 13, 16, 12, 15, 14, 12, 13, 16, 15, 14, 16, 14, 13, 13, 13,
         8, 13, 12, 14, 11, 14, 11, 15, 12, 13, 13, 14, 16, 14, 12, 16, 14, 15,
        13, 14, 15, 15, 11, 14, 12, 15, 16, 14])
prdicted_tokens = tensor([[ 9,  4,  5,  ..., 11, 11, 11],
        [ 9,  9,  5,  ..., 11, 11, 11],
        [ 4,  5,  6,  ..., 11, 11, 11],
     

Validation epoch 1:  25%|██▍       | 1010/4114 [00:04<00:12, 249.90it/s]

prdicted_tokens = tensor([[ 4, 10,  6,  ..., 11, 11, 11],
        [ 5,  5,  7,  ..., 11, 11, 11],
        [ 4,  8,  5,  ..., 11, 11, 11],
        ...,
        [ 5,  2, 13,  ..., 11,  3,  1],
        [ 4,  3,  6,  ..., 11, 11, 11],
        [ 4,  6,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ..., 11,  3,  1],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([13, 11, 12, 16, 15, 12, 14, 14, 12, 11, 15, 13, 14, 13, 16, 13, 15, 11,
        10, 13, 13, 14, 16, 14, 13, 12, 13, 16, 11, 14, 12, 11, 14, 13, 13, 17,
        17, 12, 12, 15, 15, 14, 12, 15, 15, 16, 12, 12, 12, 14, 14, 16, 13, 12,
        14, 16, 11, 15, 11, 12, 14, 17, 13, 12])
prdicted_tokens = tensor([[ 5,  7,  5,  ...,  3, 11,  1],
        [ 8,  4,  4,  ...,  1, 11, 11],
        [ 3,  6, 13,  ..., 11, 11, 11],
     

Validation epoch 1:  26%|██▌       | 1063/4114 [00:04<00:12, 253.13it/s]

prdicted_tokens = tensor([[ 8,  5,  4,  ..., 11, 11, 11],
        [ 8,  5,  9,  ...,  7,  1, 11],
        [ 3,  9,  5,  ..., 11, 11, 11],
        ...,
        [ 8,  5,  9,  ..., 11, 10,  1],
        [ 8,  7,  9,  ...,  1, 11, 11],
        [ 4,  2,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  7,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ..., 11, 10,  1],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([ 8, 15, 10, 15, 14, 10, 14, 12, 11, 14, 12, 12, 13, 13, 14, 14, 13, 14,
        11, 15, 14, 13, 15, 12, 11, 13, 16, 15, 14, 14, 11, 11, 12, 14, 15, 13,
        13, 14, 13, 14, 16, 12, 13, 16, 15, 15, 14, 13, 16, 16, 15, 11, 13, 14,
        16, 13, 11, 14, 12, 15, 15, 16, 14, 13])
prdicted_tokens = tensor([[ 9,  5, 13,  ...,  1, 11, 11],
        [ 5,  3,  6,  ...,  1, 11, 11],
        [ 4,  8, 13,  ...,  1, 11, 11],
     

Validation epoch 1:  27%|██▋       | 1115/4114 [00:04<00:11, 251.94it/s]

prdicted_tokens = tensor([[ 2, 10,  8,  ..., 11, 11, 11],
        [ 9,  3,  6,  ...,  6,  1, 11],
        [ 5,  5,  6,  ...,  1, 11, 11],
        ...,
        [ 8,  9, 11,  ...,  4,  1, 11],
        [ 2,  5, 11,  ...,  1, 11, 11],
        [ 8,  8,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 6, 1, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 4, 1, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 15, 14, 14, 13, 13, 12, 11, 13, 12, 13, 14, 13, 15, 12, 12, 14, 12,
        15, 12, 12, 15, 15, 12, 12, 12, 15, 14, 13, 12, 11, 13, 15, 12, 11, 14,
        12, 12, 15, 15, 13, 16, 15, 13, 13, 16, 13, 14, 12, 11, 15, 14, 14, 14,
        16, 13, 16, 11, 10, 13, 15, 15, 14, 13])
prdicted_tokens = tensor([[ 9, 13,  3,  ..., 11, 11, 11],
        [ 8,  5,  4,  ..., 11, 11, 11],
        [ 8,  5,  8,  ..., 11, 11, 11],
        ...,
        [ 5,  3,  5,  ..., 1

Validation epoch 1:  28%|██▊       | 1167/4114 [00:04<00:11, 252.57it/s]

prdicted_tokens = tensor([[ 5,  8,  9,  ...,  1, 11, 11],
        [ 9, 10, 13,  ...,  9,  4,  1],
        [ 9,  9, 13,  ..., 11, 11, 11],
        ...,
        [ 8,  7, 13,  ..., 11, 11, 11],
        [ 8,  3,  6,  ..., 11, 11, 11],
        [ 8,  9, 11,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 9, 4, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([15, 17, 13, 13, 13, 13, 11, 13, 11, 13, 12, 14, 15, 12, 14, 15, 15, 11,
        13, 16, 11, 15, 14, 14, 15, 13, 12, 10, 13, 15, 16, 10, 14, 10, 14, 15,
        12, 14, 15, 13, 14, 14, 11, 15, 12, 15, 13, 14, 14, 14, 11, 14, 13, 15,
        11, 14, 15, 16, 10, 16, 15, 13, 13, 15])
prdicted_tokens = tensor([[ 9,  8,  5,  ...,  1, 11, 11],
        [ 3,  2,  5,  ..., 11, 11, 11],
        [ 8,  3, 10,  ..., 11, 11, 11],
        ...,
        [ 5,  5, 13,  ..., 1

Validation epoch 1:  30%|██▉       | 1219/4114 [00:04<00:11, 242.66it/s]

prdicted_tokens = tensor([[ 8,  5, 10,  ...,  1, 11, 11],
        [11,  5,  3,  ..., 11, 11, 11],
        [ 8,  4,  6,  ..., 11, 11, 11],
        ...,
        [ 3,  6, 13,  ..., 11, 11, 11],
        [ 9,  2, 13,  ..., 11, 11, 11],
        [ 4, 11, 10,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([15, 12, 14, 14, 12, 15, 17, 12, 15, 16, 13, 15, 11, 16, 11, 13, 13, 13,
        12, 15, 16, 15, 13, 11, 12, 12, 13, 12, 13, 15, 12, 12, 12, 14, 13, 13,
        10, 11, 16, 14, 13, 11, 13, 17, 15, 13, 15, 16, 14, 16, 13, 16, 16, 14,
        14, 13, 15, 12, 15, 11, 15, 11, 13, 13])
prdicted_tokens = tensor([[ 8,  9,  5,  ..., 11, 11, 11],
        [ 8,  3, 11,  ..., 11, 11, 11],
        [ 8,  3,  7,  ...,  4,  1, 11],
        ...,
        [ 8,  4,  4,  ..., 1

Validation epoch 1:  31%|███       | 1270/4114 [00:05<00:11, 246.20it/s]

batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([15, 14, 15, 14, 14, 15, 15, 14, 16, 13, 16, 11, 14, 14, 11, 12, 13, 13,
        11, 13, 15, 12, 15, 12, 12, 14, 14, 17, 15, 11, 11, 15, 13, 13, 14, 12,
        15, 13, 11, 12, 12, 16, 15, 15, 14, 12, 15, 13, 12, 14, 16, 13, 13, 16,
        12, 12, 15, 16, 14, 14, 15, 13, 12, 13])
prdicted_tokens = tensor([[ 5,  9,  9,  ..., 11, 11, 11],
        [ 4,  9,  5,  ..., 11, 11, 11],
        [ 8,  4,  7,  ..., 11,  1, 11],
        ...,
        [ 3, 10,  6,  ..., 11, 11, 11],
        [ 8,  9,  8,  ..., 11, 11, 11],
        [ 8,  3,  4,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ..., 11,  1,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  

Validation epoch 1:  32%|███▏      | 1321/4114 [00:05<00:11, 241.67it/s]

prdicted_tokens = tensor([[ 5,  8,  9,  ..., 11, 11, 11],
        [ 8,  5,  4,  ..., 11, 11, 11],
        [ 8,  9,  5,  ..., 11, 11, 11],
        ...,
        [ 2, 10,  6,  ..., 11, 11, 11],
        [ 5,  6,  9,  ..., 11, 11, 11],
        [ 8,  3,  7,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 14, 14, 12, 14, 16, 15, 14, 16, 16, 16, 12, 14, 14, 13, 15,  9, 14,
        12, 14, 16, 14, 11, 16, 11, 14, 13, 15, 15, 15, 14, 15, 13, 11, 11, 14,
        15, 15, 14, 16, 12, 13, 13, 11, 10, 12, 13, 15, 13, 14, 10, 16, 17, 14,
        12, 15, 14, 15, 16, 14, 13,  9, 13, 14])
prdicted_tokens = tensor([[ 8,  5,  9,  ...,  5,  4,  1],
        [ 8,  9,  7,  ...,  1, 11, 11],
        [ 3,  5,  6,  ..., 11, 11, 11],
        ...,
        [ 8,  9,  8,  ...,  

Validation epoch 1:  33%|███▎      | 1373/4114 [00:05<00:10, 250.32it/s]

prdicted_tokens = tensor([[ 8,  9,  8,  ...,  1, 11, 11],
        [ 8,  9,  4,  ...,  2,  1, 11],
        [ 8,  3,  9,  ..., 11, 11, 11],
        ...,
        [ 4,  7,  5,  ..., 11, 11, 11],
        [ 5,  4,  6,  ...,  1, 11, 11],
        [ 5, 11,  9,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 2, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([15, 16, 14, 12, 14, 13, 13, 13, 16, 16, 12, 12, 15, 12, 11, 13, 14, 12,
        12, 15, 12, 17, 12, 14, 11, 15, 15, 14, 13, 14, 14, 15, 16, 11, 16, 11,
        15, 17, 12, 12, 16, 14, 12, 14, 15, 14, 16, 13, 15, 14, 15, 11, 13, 16,
        12, 11, 16, 14, 14, 15, 12, 11, 15, 15])
prdicted_tokens = tensor([[ 3, 10,  5,  ..., 11, 11, 11],
        [ 9,  4,  6,  ...,  6,  1, 11],
        [ 9,  7,  5,  ..., 11, 11, 11],
        ...,
        [ 4,  5, 11,  ..., 1

Validation epoch 1:  35%|███▍      | 1425/4114 [00:05<00:10, 248.82it/s]

batch_y_lens = tensor([15, 13, 13, 13, 12, 13, 12, 14, 10, 13, 15, 13, 14, 14, 13, 15, 14, 13,
        15, 16, 14, 16, 12, 15, 14, 11, 10, 15, 14, 14, 11, 13, 16, 11, 13, 14,
        13, 12, 15, 12, 13, 11, 11, 12, 14, 10, 16, 10, 16, 12, 14, 13, 14, 13,
        12, 16, 12, 14, 14, 11, 13, 12, 13, 16])
prdicted_tokens = tensor([[ 4,  6,  5,  ..., 11, 11, 11],
        [ 5, 11,  9,  ..., 11, 11, 11],
        [ 8,  9,  4,  ...,  9,  1, 11],
        ...,
        [ 5,  3,  5,  ..., 11, 11, 11],
        [ 9, 10, 13,  ..., 11, 11, 11],
        [ 5,  8, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 9, 1, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 14, 16, 14, 14, 11, 14, 15, 15, 14, 12, 13, 16, 15, 14, 15, 15, 15,
        15, 14, 11, 14, 15, 14, 11, 15, 13, 14, 13, 14, 15, 16, 15, 14, 16, 11,
    

Validation epoch 1:  36%|███▌      | 1477/4114 [00:05<00:10, 252.05it/s]

prdicted_tokens = tensor([[ 8,  4,  3,  ..., 11, 11, 11],
        [ 5, 11,  5,  ...,  1, 11, 11],
        [ 9, 13,  4,  ..., 11, 11, 11],
        ...,
        [ 8,  4,  5,  ..., 11, 11, 11],
        [ 5,  5,  5,  ..., 11, 11, 11],
        [ 8,  4,  3,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([14, 15, 12, 13, 12, 13, 10, 12, 14, 11, 14, 15, 16, 16, 14, 16, 12, 15,
        15, 15, 14, 15, 12, 15, 14, 13, 12, 11, 15, 13, 16, 14, 14, 16, 13, 16,
        11, 12, 13, 14, 14, 12, 13, 15, 12, 17, 12, 14, 15, 14, 13, 16, 11, 16,
        15, 12, 14, 14, 16, 15, 15, 14, 13, 15])
prdicted_tokens = tensor([[ 8,  9,  9,  ..., 11, 11, 11],
        [ 9,  2, 13,  ..., 11, 11, 11],
        [ 7,  5,  5,  ..., 11, 11, 11],
        ...,
        [ 8,  4,  8,  ..., 1

Validation epoch 1:  37%|███▋      | 1529/4114 [00:06<00:10, 251.98it/s]

batch_y_lens = tensor([13, 13, 15, 11, 13, 14, 13, 12, 14, 11, 16, 14, 11, 13, 11, 12, 15, 14,
        14, 12, 11, 12, 12, 15, 11, 13, 11, 15, 14, 14, 16, 12, 15, 13, 15, 13,
        15, 14, 12, 16, 15, 13, 12, 15, 14, 14, 15, 11, 15, 13, 12, 14, 11, 15,
        15, 14, 12, 14, 12, 15, 15, 15, 12, 15])
prdicted_tokens = tensor([[ 3,  7,  6,  ..., 11, 11, 11],
        [ 9,  4,  5,  ..., 11, 11, 11],
        [11, 13,  7,  ..., 11, 11, 11],
        ...,
        [ 5,  5,  5,  ...,  1, 11, 11],
        [ 8,  3,  8,  ..., 11, 11, 11],
        [ 8,  3, 11,  ...,  2,  1, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 2, 1, 0]])
batch_y_lens = tensor([13, 12, 13, 16, 14, 12, 13, 15, 13, 12, 16, 13, 11, 16, 14, 13, 13, 15,
        11, 13, 11, 11, 15, 14, 11, 14, 14, 13, 15, 16, 12, 15, 11, 12, 13, 13,
    

Validation epoch 1:  38%|███▊      | 1581/4114 [00:06<00:09, 254.05it/s]

prdicted_tokens = tensor([[ 8,  7, 13,  ..., 11, 11, 11],
        [ 4,  6, 13,  ...,  1, 11, 11],
        [ 8,  5,  3,  ..., 11, 11, 11],
        ...,
        [ 5,  9, 13,  ..., 11, 11, 11],
        [ 4,  6,  5,  ..., 11, 11, 11],
        [ 8,  3,  9,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 15, 14, 15, 12, 12, 13, 14, 12, 13, 14, 13, 13, 13, 14, 14, 13, 13,
        17, 10, 13, 15, 10, 14, 15, 13, 13, 11, 16, 16, 13, 14, 13, 14, 12, 16,
        16, 15, 15, 14, 17, 14, 11, 10, 14, 14, 12, 14, 12, 15, 14, 13, 15, 15,
        15, 11, 15, 12, 12, 15, 14, 10, 12, 12])
prdicted_tokens = tensor([[ 9, 10,  5,  ..., 11, 11, 11],
        [ 4,  3,  3,  ..., 11, 11, 11],
        [ 4,  9,  9,  ..., 11, 11, 11],
        ...,
        [ 8,  3,  7,  ..., 1

Validation epoch 1:  40%|███▉      | 1633/4114 [00:06<00:09, 254.27it/s]

prdicted_tokens = tensor([[ 4,  9, 13,  ..., 11, 11, 11],
        [ 3,  9,  5,  ..., 11, 11, 11],
        [ 8,  5,  3,  ...,  1, 11, 11],
        ...,
        [ 3,  7,  6,  ..., 11, 11, 11],
        [ 9,  9,  5,  ..., 11, 11, 11],
        [ 9, 11, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 11, 14, 13, 11, 13, 13, 11, 16, 12, 13, 13, 16, 12, 13, 16, 11, 15,
        11, 13, 11, 13, 14, 14, 11, 13, 14, 16, 12, 12, 12, 13, 13, 15, 14, 13,
        13, 12, 13, 13, 11, 15, 12, 14, 14, 14, 10, 12, 14, 13, 10, 12, 14, 13,
        14, 15, 12, 15, 11, 15, 13, 12, 12, 12])
prdicted_tokens = tensor([[ 8,  5,  8,  ...,  7,  1, 11],
        [ 8,  3, 10,  ..., 10,  1, 11],
        [ 5,  5,  5,  ...,  1, 11, 11],
        ...,
        [ 4,  2,  5,  ..., 1

Validation epoch 1:  41%|████      | 1685/4114 [00:06<00:09, 253.29it/s]

prdicted_tokens = tensor([[ 9,  4,  6,  ...,  8,  1, 11],
        [ 3,  5, 13,  ..., 11, 11, 11],
        [ 5,  5, 13,  ..., 11, 11, 11],
        ...,
        [ 5, 11,  5,  ..., 11, 11, 11],
        [ 9,  9,  5,  ..., 11, 11, 11],
        [ 8,  9,  3,  ...,  7, 10,  1]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  8,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  7, 10,  1]])
batch_y_lens = tensor([15, 13, 10, 14, 11, 12, 14, 15, 14, 15, 14, 15, 12, 13, 12, 15, 14, 16,
        14, 15, 16, 12, 16, 16, 13, 11, 14, 12, 14, 11, 14, 14, 12, 11, 11, 14,
        12, 11, 15, 13, 16, 15, 13, 13, 13, 13, 13, 13, 13, 15, 13, 15, 14, 15,
        14, 15, 14, 15, 13, 13, 11, 11, 13, 16])
prdicted_tokens = tensor([[ 5,  6,  9,  ...,  7, 10,  1],
        [ 8,  9,  9,  ..., 11, 11, 11],
        [ 8,  8, 13,  ...,  8,  1, 11],
     

Validation epoch 1:  42%|████▏     | 1738/4114 [00:06<00:09, 254.91it/s]

prdicted_tokens = tensor([[ 8,  9,  4,  ..., 11, 11, 11],
        [ 5,  9, 13,  ..., 11, 11, 11],
        [ 9,  8,  5,  ..., 11, 11, 11],
        ...,
        [ 8,  3,  5,  ..., 11, 11, 11],
        [ 4, 11, 10,  ..., 11, 11, 11],
        [ 9,  5,  5,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([13, 12, 13, 11, 12, 14, 12, 13, 10, 16, 13, 13, 14, 16, 11, 11, 12, 15,
        13, 13, 12, 12, 12, 17, 14, 14, 16,  9, 15, 14, 16, 16, 10, 13, 15, 13,
        12, 14, 12, 15, 13, 15, 14, 11, 13, 12, 12, 12, 16, 13, 14, 15, 13, 15,
        15, 15, 14, 15, 14, 12, 13, 13, 12, 15])
prdicted_tokens = tensor([[ 8,  4,  8,  ...,  9,  1, 11],
        [ 2, 10,  4,  ..., 11, 11, 11],
        [ 3, 13,  7,  ...,  1, 11, 11],
        ...,
        [ 8,  4,  3,  ...,  

Validation epoch 1:  44%|████▎     | 1790/4114 [00:07<00:09, 247.04it/s]

prdicted_tokens = tensor([[ 3,  3, 13,  ...,  1, 11, 11],
        [ 4,  2,  5,  ..., 11, 11, 11],
        [ 9,  9,  5,  ..., 11, 11, 11],
        ...,
        [ 9,  9, 10,  ..., 11, 11, 11],
        [ 3,  7,  5,  ..., 11, 11, 11],
        [ 3,  6,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([15, 14, 14, 15, 12, 11, 14, 11, 14, 13, 15, 13, 15, 12, 15, 12, 14, 14,
        12, 10, 10,  8, 14, 15, 14, 14, 12, 13, 12, 13, 14, 13, 13, 12, 16, 12,
        14, 11, 13, 14, 13, 13, 17, 13, 12, 15, 16, 14, 14, 13, 16, 14, 13, 13,
        13, 13, 15, 13, 15, 14, 12, 10, 13, 13])
prdicted_tokens = tensor([[ 5,  4,  6,  ..., 11, 11, 11],
        [ 5,  6,  9,  ..., 11, 11, 11],
        [ 5,  5,  2,  ..., 11, 11, 11],
        ...,
        [ 3,  7,  5,  ...,  

Validation epoch 1:  45%|████▌     | 1869/4114 [00:07<00:08, 254.26it/s]

prdicted_tokens = tensor([[ 9,  4,  3,  ..., 11, 11, 11],
        [ 3,  9, 13,  ...,  2,  9,  1],
        [10,  5,  8,  ..., 11, 11, 11],
        ...,
        [ 8,  9,  2,  ..., 11, 11, 11],
        [ 3,  7, 13,  ..., 11, 11, 11],
        [ 9,  4,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 2, 9, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 16, 13, 15, 13, 15, 12, 16, 13, 14, 11, 14, 15, 10, 14,  9, 15, 13,
        12, 16, 11, 13, 13, 14, 14, 15, 13, 13, 13, 14, 14, 13, 14, 11, 15, 14,
        15, 15, 11, 13, 14, 14, 15, 13, 12, 11, 14, 12, 14, 12, 15, 15, 16, 14,
        12, 12, 15, 16, 16, 15, 14, 13, 13, 13])
prdicted_tokens = tensor([[ 9,  9,  3,  ..., 11, 11, 11],
        [ 9,  9,  9,  ...,  9,  4,  1],
        [ 9,  9,  5,  ..., 11, 11, 11],
        ...,
        [ 4,  9, 13,  ...,  

Validation epoch 1:  47%|████▋     | 1921/4114 [00:07<00:08, 253.73it/s]

prdicted_tokens = tensor([[ 9,  5, 13,  ...,  3,  1, 11],
        [ 3,  6,  5,  ...,  1, 11, 11],
        [ 5, 10,  5,  ...,  1, 11, 11],
        ...,
        [ 3,  5,  6,  ...,  8,  1, 11],
        [ 3,  4,  5,  ..., 11,  1, 11],
        [ 9, 10, 13,  ...,  7,  1, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  3,  1,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  8,  1,  0],
        [ 0,  0,  0,  ..., 11,  1,  0],
        [ 0,  0,  0,  ...,  7,  1,  0]])
batch_y_lens = tensor([15, 14, 14, 16, 12, 16, 12, 14, 14, 12, 13, 14, 12, 16, 14, 14, 14, 14,
        16, 14, 13, 12, 10, 14, 12, 12, 12, 13, 15, 14, 11, 15, 14, 13, 15, 15,
        12, 12, 15, 15, 12, 16, 11, 16, 12, 15, 13, 12, 12, 16, 13, 14, 15, 13,
        16, 10, 14, 16, 15, 10, 14, 15, 15, 15])
prdicted_tokens = tensor([[ 3, 11, 10,  ..., 11, 11, 11],
        [ 8,  9,  4,  ...,  2,  1, 11],
        [ 9,  2,  5,  ..., 11, 11, 11],
     

Validation epoch 1:  47%|████▋     | 1947/4114 [00:07<00:08, 252.20it/s]

prdicted_tokens = tensor([[ 9,  5,  5,  ..., 11, 11, 11],
        [ 3, 11,  5,  ..., 11, 11, 11],
        [ 4,  4,  5,  ..., 11, 11, 11],
        ...,
        [ 3, 10,  5,  ..., 11, 11, 11],
        [ 4,  7, 13,  ..., 11, 11, 11],
        [ 5,  5,  6,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([10, 11, 12, 14, 14, 16, 15, 13, 16, 13, 13, 11, 14, 16, 17, 12, 15, 15,
        12, 13, 11, 15, 14, 14, 13, 13,  9, 13, 13, 12, 17, 15, 15, 11, 13, 15,
        13, 14, 14, 13, 11, 16, 16, 12, 10, 11,  9, 15, 15, 13, 12, 12, 14, 15,
        13, 12, 14, 15, 11,  9, 14, 14, 11, 14])
prdicted_tokens = tensor([[10, 13, 11,  ..., 11, 11, 11],
        [ 9,  5, 10,  ..., 11, 11, 11],
        [ 4,  4,  5,  ..., 11, 11, 11],
        ...,
        [ 5,  3,  5,  ..., 1

Validation epoch 1:  49%|████▊     | 1999/4114 [00:07<00:08, 237.92it/s]

prdicted_tokens = tensor([[ 5,  4,  6,  ...,  4, 10,  1],
        [ 4,  5,  5,  ...,  5,  1, 11],
        [ 5, 11,  5,  ..., 11, 11, 11],
        ...,
        [ 3,  3, 13,  ..., 11,  1, 11],
        [ 3, 13,  4,  ..., 11, 11, 11],
        [ 8,  7, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  4, 10,  1],
        [ 0,  0,  0,  ...,  5,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ..., 11,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([16, 15, 11, 11, 14, 12, 15, 14, 14, 13, 14, 13, 13, 13, 11, 16, 16, 12,
        13, 14, 12, 14, 13, 15, 12, 12, 16, 14, 13, 12, 13, 15, 14, 15, 16, 14,
        15, 14, 14, 13, 13, 14, 10, 15, 12, 12, 12, 13, 14, 13, 13, 14, 12, 13,
        12, 14, 14, 13, 14, 13, 12, 15, 12, 12])
prdicted_tokens = tensor([[ 5,  2,  5,  ...,  1, 11, 11],
        [ 8,  5,  8,  ..., 11, 11, 11],
        [ 3,  3,  5,  ..., 11, 11, 11],
     

Validation epoch 1:  50%|████▉     | 2050/4114 [00:08<00:08, 236.87it/s]

prdicted_tokens = tensor([[ 8,  9,  4,  ...,  1, 11, 11],
        [ 9, 11, 13,  ...,  9,  3,  1],
        [ 3,  2,  6,  ...,  1, 11, 11],
        ...,
        [ 9,  3, 13,  ..., 11, 11, 11],
        [ 4,  8,  5,  ..., 11, 11, 11],
        [ 8,  4,  3,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 9, 3, 1],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([15, 17, 15, 13, 12, 15, 12, 14, 14, 13, 13, 12, 16, 11, 15, 11, 13, 13,
        14, 11, 17, 15, 14, 17, 13, 15, 14, 15, 15, 15, 15, 16, 14, 12, 15, 13,
        11, 15, 14, 14, 14, 12, 12, 12, 13, 12, 15, 14, 12, 16, 14, 16, 14, 14,
        14, 15, 15, 16, 13, 16, 11, 10, 11, 15])
prdicted_tokens = tensor([[ 5,  2, 13,  ..., 11, 11, 11],
        [ 8,  5,  6,  ...,  8,  1, 11],
        [ 8,  5,  6,  ...,  1, 11, 11],
        ...,
        [ 4,  7, 13,  ..., 1

Validation epoch 1:  51%|█████     | 2100/4114 [00:08<00:08, 242.79it/s]

prdicted_tokens = tensor([[ 9,  7,  5,  ..., 11, 11, 11],
        [ 3,  9,  5,  ..., 11, 11, 11],
        [ 3,  5,  3,  ..., 11, 11, 11],
        ...,
        [10,  9,  8,  ..., 11, 11, 11],
        [10,  9,  5,  ..., 11, 11, 11],
        [ 5,  9,  9,  ..., 10,  1, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ..., 10,  1,  0]])
batch_y_lens = tensor([14, 12, 10, 14, 11, 16, 12, 13, 15, 13, 12, 14, 14, 14, 14, 14, 11, 14,
        12, 12, 11, 16, 13, 13, 13, 15,  9, 12, 15, 16, 12, 13, 13, 17, 15, 14,
        16, 14, 13, 13, 14, 12, 12, 11, 10, 13, 13, 13, 16, 15, 12, 13, 11, 13,
        14, 11, 14, 12, 14, 14, 15, 13,  9, 16])
prdicted_tokens = tensor([[ 5,  4,  5,  ..., 11, 11, 11],
        [ 9,  9, 13,  ..., 11, 11, 11],
        [ 3,  9,  5,  ..., 11, 11, 11],
     

Validation epoch 1:  52%|█████▏    | 2151/4114 [00:08<00:08, 243.46it/s]

prdicted_tokens = tensor([[ 9,  9,  9,  ..., 11, 11, 11],
        [ 5,  8,  5,  ...,  1, 11, 11],
        [ 9,  2, 10,  ..., 10,  2,  1],
        ...,
        [ 9,  9, 13,  ...,  9,  8,  1],
        [ 4,  4,  5,  ..., 11, 11, 11],
        [ 8,  3,  2,  ..., 10,  1, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ..., 10,  2,  1],
        ...,
        [ 0,  0,  0,  ...,  9,  8,  1],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ..., 10,  1,  0]])
batch_y_lens = tensor([13, 14, 16, 12, 13, 13, 15, 13, 15, 16, 15, 11, 14, 12, 12, 10, 14, 16,
        15, 16, 12, 12, 11, 13, 13, 15, 14, 10, 14, 15, 12, 16, 12, 14, 12, 15,
        14, 12, 13, 15, 13, 13,  9, 16, 14, 14, 11, 11, 14, 13, 16, 10, 16, 11,
        15, 14,  9, 10, 12, 13, 15, 16, 12, 15])
prdicted_tokens = tensor([[ 8,  3,  6,  ...,  1, 11, 11],
        [ 3, 13,  4,  ..., 11, 11, 11],
        [ 3,  2,  6,  ..., 11, 11, 11],
     

Validation epoch 1:  54%|█████▎    | 2202/4114 [00:08<00:07, 245.61it/s]

prdicted_tokens = tensor([[ 5, 13,  4,  ..., 11, 11, 11],
        [ 4,  9,  5,  ...,  1, 11, 11],
        [ 8,  9,  9,  ...,  1, 11, 11],
        ...,
        [ 8,  9, 13,  ..., 11,  2,  1],
        [ 9, 11,  5,  ...,  1, 11, 11],
        [ 3,  7, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        ...,
        [ 0,  0,  0,  ..., 11,  2,  1],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([11, 14, 14, 11, 14, 13, 11, 14, 10, 12, 13, 14, 13, 15, 14, 14, 13, 13,
        12, 16, 10, 16, 10, 15, 11, 16, 16, 15, 15, 16, 15, 14, 14, 12, 13, 13,
        13, 13, 13, 14, 12, 13, 15, 11, 15, 16, 13, 15, 14, 12, 12, 13, 11, 12,
        11, 12, 14, 12, 13, 16, 12, 16, 14, 13])
prdicted_tokens = tensor([[ 8,  5,  2,  ..., 11, 11, 11],
        [ 8,  7,  5,  ..., 11, 11, 11],
        [ 8,  3,  6,  ..., 11, 11, 11],
     

Validation epoch 1:  55%|█████▍    | 2252/4114 [00:09<00:08, 231.85it/s]

prdicted_tokens = tensor([[10, 13,  5,  ..., 11, 11, 11],
        [ 7,  9,  3,  ..., 11, 11, 11],
        [ 8,  5,  4,  ...,  7,  1, 11],
        ...,
        [ 3,  5,  5,  ..., 11, 11, 11],
        [ 8,  5,  5,  ..., 11, 11, 11],
        [ 9,  7,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 7, 1, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 11, 16, 11, 14, 14, 12, 13, 15, 11, 12, 14, 15, 14, 13, 10, 13, 14,
        13, 13, 14, 16, 15, 12, 11, 11, 13, 15, 16, 14, 13, 17, 13, 10, 12, 13,
        13, 11, 14, 13, 13, 12, 11, 10, 15, 15, 13, 13, 15, 15, 13, 16, 13, 13,
        16, 16, 16, 14, 16, 13, 12, 11, 11, 13])
prdicted_tokens = tensor([[ 5,  6, 13,  ..., 11, 11, 11],
        [ 9,  7, 13,  ..., 11, 11, 11],
        [ 3,  7, 13,  ..., 11, 11, 11],
        ...,
        [ 5, 10, 13,  ..., 1

Validation epoch 1:  56%|█████▌    | 2301/4114 [00:09<00:07, 237.07it/s]

prdicted_tokens = tensor([[ 8,  3, 13,  ..., 11, 11, 11],
        [ 3,  5, 11,  ..., 11, 11, 11],
        [ 4,  5,  6,  ..., 11, 11, 11],
        ...,
        [ 9,  8,  9,  ...,  1, 11, 11],
        [ 5,  5, 13,  ...,  3,  7,  1],
        [ 8,  4,  3,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 3, 7, 1],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([13, 14, 11, 11, 14, 12, 14, 11, 16, 16, 15, 12, 15, 12, 14, 11, 13, 14,
        14, 10, 14, 13, 13, 16, 14, 12, 15, 14, 15, 11, 14, 15, 14, 14, 11, 11,
        11, 16, 16, 15, 14, 14, 14, 15, 16, 15, 16, 14, 15, 13, 13, 14, 16, 13,
        11, 14, 13, 14, 13, 16, 10, 15, 17, 15])
prdicted_tokens = tensor([[ 5,  3,  5,  ..., 11, 11, 11],
        [ 8,  9,  9,  ...,  1, 11, 11],
        [ 9,  4,  5,  ..., 11, 11, 11],
        ...,
        [ 4,  2,  6,  ..., 1

Validation epoch 1:  57%|█████▋    | 2351/4114 [00:09<00:07, 241.39it/s]

prdicted_tokens = tensor([[ 9,  3,  6,  ...,  2, 10,  1],
        [ 4,  2,  6,  ..., 11, 11, 11],
        [ 8,  9,  9,  ...,  5,  6,  1],
        ...,
        [ 8,  9, 10,  ...,  4,  2,  1],
        [ 4,  6, 13,  ..., 11, 11, 11],
        [ 5,  2, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  2, 10,  1],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  5,  6,  1],
        ...,
        [ 0,  0,  0,  ...,  4,  2,  1],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([16, 13, 16, 11, 13, 15, 16, 11, 12, 12, 12, 14, 13, 12, 10, 16, 14, 15,
        13, 12, 14, 13, 15, 15, 12, 14, 15, 11, 16, 14, 13, 16, 12, 11, 11, 16,
        15, 13, 13, 12,  9, 11, 13, 13, 14, 10, 16, 15, 11, 14, 15, 10, 15, 16,
        15, 12, 12, 15, 13, 12, 14, 16, 12, 13])
prdicted_tokens = tensor([[ 9,  8,  5,  ..., 11, 11, 11],
        [ 8,  3,  8,  ...,  2,  1, 11],
        [ 9, 10,  9,  ..., 11, 11, 11],
     

Validation epoch 1:  58%|█████▊    | 2402/4114 [00:09<00:07, 237.80it/s]

prdicted_tokens = tensor([[ 9, 10, 13,  ..., 11, 11, 11],
        [ 3,  3, 13,  ...,  2,  3,  1],
        [ 8,  3,  2,  ...,  1, 11, 11],
        ...,
        [ 5,  5,  5,  ..., 11, 11, 11],
        [ 9,  4,  3,  ..., 11, 11, 11],
        [ 8,  5,  7,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 2, 3, 1],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 17, 15, 11, 15, 13, 13, 13, 12, 14, 16, 16, 14, 10, 10, 13, 16, 14,
        13, 14, 10, 15, 13, 14, 14, 10, 12, 13, 16, 15, 16, 14, 15, 16, 15, 12,
        13, 12, 12, 15, 13, 15, 14, 14, 15, 15, 17, 15, 14, 14, 16, 14, 13, 17,
        11, 12, 16, 12, 11, 15, 14, 10, 13, 14])
prdicted_tokens = tensor([[ 9,  2, 13,  ..., 11, 11, 11],
        [ 3,  9,  5,  ..., 11, 11, 11],
        [ 9,  4,  3,  ..., 11, 11, 11],
        ...,
        [11,  9,  8,  ..., 1

Validation epoch 1:  60%|█████▉    | 2452/4114 [00:09<00:06, 242.82it/s]

prdicted_tokens = tensor([[ 8,  7,  5,  ..., 11, 11, 11],
        [11,  5,  4,  ..., 11, 11, 11],
        [ 4,  4,  5,  ..., 11, 11, 11],
        ...,
        [ 4,  5,  5,  ..., 11, 11, 11],
        [ 4,  4, 13,  ...,  4,  2,  1],
        [ 8,  3,  3,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 4, 2, 1],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 11, 11, 14, 15, 12, 13, 12, 15, 13, 14, 12, 11, 13, 12, 14, 12, 13,
        13, 11, 12, 15, 16, 14, 12, 15, 11, 12, 13, 12, 13, 13, 15, 12, 14, 12,
        16, 12, 14, 14, 13, 14, 13, 14, 12, 15, 15, 15, 14, 13, 14, 15, 13, 12,
        16, 15, 12, 13, 10, 14, 11, 13, 16, 13])
prdicted_tokens = tensor([[ 5,  3, 13,  ...,  9,  1, 11],
        [ 9,  7,  5,  ...,  1, 11, 11],
        [ 5,  5,  6,  ..., 11, 11, 11],
        ...,
        [ 3,  2, 13,  ..., 1

Validation epoch 1:  61%|██████    | 2502/4114 [00:10<00:06, 244.15it/s]

batch_y_lens = tensor([11, 13, 13, 11, 13, 14, 12, 16, 14, 16, 12, 13, 13, 16, 12, 14, 13, 13,
        15, 14, 12, 14, 14, 13, 14, 13, 11, 14, 12, 15, 12, 12, 16, 12, 13, 13,
        12, 13, 13, 16, 14, 14, 13, 16, 13, 11, 15, 12, 15, 16, 15, 13, 13, 14,
        15, 15, 14, 14, 13, 15, 15, 13, 14, 15])
prdicted_tokens = tensor([[ 8,  8, 13,  ..., 11, 11, 11],
        [ 8,  5,  8,  ...,  1, 11, 11],
        [11,  9,  4,  ..., 11, 11, 11],
        ...,
        [ 3,  5, 13,  ..., 11, 11, 11],
        [ 8,  9,  9,  ..., 11, 11, 11],
        [ 3,  4, 13,  ...,  4,  2,  1]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 4, 2, 1]])
batch_y_lens = tensor([13, 14, 13, 15, 12, 11, 14, 14, 13, 14, 14, 12, 14, 13, 14, 16, 11, 16,
        14, 14, 12, 14, 11, 16, 14, 14, 12, 14, 13, 13, 13, 12, 15, 12, 12, 10,
    

Validation epoch 1:  62%|██████▏   | 2553/4114 [00:10<00:06, 247.02it/s]

prdicted_tokens = tensor([[ 4,  7, 13,  ..., 11, 11, 11],
        [ 8,  9,  4,  ..., 11, 11, 11],
        [ 4, 10, 13,  ...,  6,  4,  1],
        ...,
        [ 3,  9,  5,  ..., 11, 11, 11],
        [ 8,  3,  8,  ...,  2,  2,  1],
        [ 9, 13,  6,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 6, 4, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 2, 2, 1],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 13, 16, 13, 15,  9, 12, 13, 12, 14, 13, 13, 12, 14, 12, 11, 13, 15,
        14, 12, 13, 12, 10, 14, 11, 15, 13, 15, 12, 13, 14, 13, 11, 14, 11, 15,
        15, 14, 13, 14, 12, 13, 15, 14, 14, 13, 16, 15, 13, 13, 13, 12, 15, 13,
        15, 13, 15, 16, 11, 14, 13, 12, 16, 12])
prdicted_tokens = tensor([[ 8,  3,  9,  ...,  1, 11, 11],
        [ 8,  5, 11,  ..., 11, 11, 11],
        [ 3,  6,  6,  ..., 11, 11, 11],
        ...,
        [ 8,  4,  2,  ..., 1

Validation epoch 1:  63%|██████▎   | 2603/4114 [00:10<00:06, 246.08it/s]

prdicted_tokens = tensor([[ 5,  4,  6,  ...,  1, 11, 11],
        [ 8,  9,  2,  ...,  6,  1, 11],
        [11,  9,  8,  ..., 11, 11, 11],
        ...,
        [ 8,  3,  5,  ..., 11, 11, 11],
        [ 8,  2, 10,  ..., 11, 11, 11],
        [ 3,  7,  6,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 6, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([15, 16, 14, 15, 14, 14, 13, 12, 14, 16, 17, 16, 14, 16, 12, 12, 13, 10,
         9,  8, 13, 13, 14, 15, 14, 16, 11, 11, 12, 14, 13, 15, 16, 14, 14, 13,
        14, 11, 13, 14, 15, 13, 16, 16, 11, 16, 11, 16, 12, 14, 13, 15, 16, 12,
        12, 13, 15, 14, 13, 13, 14, 12, 14, 14])
prdicted_tokens = tensor([[ 3,  5,  6,  ...,  9,  1, 11],
        [ 8,  9, 10,  ...,  1, 11, 11],
        [ 8,  3,  3,  ...,  2,  1, 11],
        ...,
        [ 2, 10,  3,  ..., 1

Validation epoch 1:  65%|██████▍   | 2656/4114 [00:10<00:05, 251.16it/s]

prdicted_tokens = tensor([[ 9,  5,  3,  ..., 11, 11, 11],
        [ 3,  8,  6,  ..., 10,  1, 11],
        [ 8,  5,  7,  ...,  1, 11, 11],
        ...,
        [ 8,  4, 10,  ..., 11, 11, 11],
        [ 3, 10, 13,  ..., 11, 11, 11],
        [ 3,  2, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ..., 10,  1,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([12, 15, 14, 16, 15, 16, 16, 14, 13, 12,  9, 14, 11, 12, 16, 13, 11, 12,
        16, 14, 13, 15, 15, 14, 10, 13, 13, 12, 13, 15, 11, 13, 10, 14, 15, 15,
        15, 14, 12, 13, 12, 11, 14, 13, 14, 14, 14, 14, 12, 13, 14, 16, 13, 11,
        13, 14, 16, 13, 14, 13, 15, 12, 13, 13])
prdicted_tokens = tensor([[ 5,  3,  5,  ...,  1, 11, 11],
        [ 4,  8, 13,  ..., 10,  8,  1],
        [ 9,  9,  9,  ..., 11, 11, 11],
     

Validation epoch 1:  66%|██████▌   | 2708/4114 [00:10<00:05, 242.41it/s]

prdicted_tokens = tensor([[ 9, 10, 13,  ..., 11, 11, 11],
        [ 3,  4,  6,  ...,  1, 11, 11],
        [ 8,  9,  9,  ...,  1, 11, 11],
        ...,
        [ 9,  2, 13,  ..., 11, 11, 11],
        [ 8,  5,  8,  ...,  1, 11, 11],
        [ 4,  9,  9,  ..., 11,  7,  1]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ..., 11,  7,  1]])
batch_y_lens = tensor([10, 14, 14, 12, 10, 12, 15, 14, 12, 14, 15, 14, 14, 12, 14, 15, 15, 14,
        12, 14, 13, 11, 16, 12, 12, 13, 16, 11, 13, 13, 14, 13, 15, 16, 16, 14,
        11, 14, 14, 10, 14, 13, 13, 14, 15, 14, 14, 12, 13, 11, 14, 15, 13, 16,
        14, 13, 14, 12, 12, 14, 16, 11, 14, 16])
prdicted_tokens = tensor([[ 8,  4,  4,  ...,  1, 11, 11],
        [ 8,  3,  9,  ...,  6,  9,  1],
        [ 9, 10,  5,  ...,  1, 11, 11],
     

Validation epoch 1:  67%|██████▋   | 2760/4114 [00:11<00:05, 248.82it/s]

prdicted_tokens = tensor([[ 8,  4,  9,  ..., 10,  1, 11],
        [ 9, 11, 13,  ...,  1, 11, 11],
        [ 8,  9,  5,  ..., 11, 11, 11],
        ...,
        [ 5,  5, 11,  ..., 11, 11, 11],
        [ 5,  7,  5,  ..., 11, 11, 11],
        [ 9,  2, 10,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ..., 10,  1,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0]])
batch_y_lens = tensor([16, 15, 12, 13, 15, 15, 11, 14, 14, 14, 11, 16, 12, 15, 17, 11, 16, 14,
        15, 12, 10, 14, 12, 13, 11,  9, 15, 13, 12, 12, 16, 15, 15, 11, 12, 14,
        12, 12, 12, 12, 14, 12, 15, 11, 10, 15, 14, 12, 15, 11, 13, 16, 12, 14,
        17, 13, 16, 12, 10, 11, 13, 14, 13, 15])
prdicted_tokens = tensor([[ 8,  3, 11,  ..., 11, 11, 11],
        [ 7, 13,  5,  ..., 11, 11, 11],
        [ 9,  5,  5,  ..., 11, 11, 11],
     

Validation epoch 1:  68%|██████▊   | 2812/4114 [00:11<00:05, 248.27it/s]

prdicted_tokens = tensor([[ 9,  8,  9,  ...,  5, 10,  1],
        [ 5, 13,  7,  ...,  1, 11, 11],
        [ 4,  3,  7,  ...,  1, 11, 11],
        ...,
        [ 7, 13,  7,  ...,  7,  5,  1],
        [ 3, 10, 13,  ..., 11, 11, 11],
        [ 3,  5,  4,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  5, 10,  1],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  7,  5,  1],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([16, 14, 14, 11, 12, 10, 14, 11, 13, 12, 15, 15, 14,  9, 10, 16, 13, 14,
        14, 13, 15, 12, 15, 16, 11, 16, 16, 10, 12, 14, 11, 16, 13, 14, 14, 12,
        15, 16, 14, 10, 11, 15, 14, 15, 14, 16, 11, 13, 15, 13, 13, 12, 15, 12,
        14, 16, 16, 11, 13, 13, 15, 16, 11, 12])
prdicted_tokens = tensor([[ 9, 11, 13,  ...,  7,  1, 11],
        [ 3,  4,  5,  ...,  9,  8,  1],
        [ 5,  7,  5,  ..., 11, 11, 11],
     

Validation epoch 1:  70%|██████▉   | 2862/4114 [00:11<00:05, 246.50it/s]

prdicted_tokens = tensor([[ 2,  5, 11,  ...,  1, 11, 11],
        [ 8,  8, 13,  ..., 11, 11, 11],
        [ 4, 10, 13,  ..., 11, 11, 11],
        ...,
        [ 4,  5, 13,  ...,  1, 11, 11],
        [ 9,  3,  6,  ...,  1, 11, 11],
        [ 5, 10,  3,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([14, 12, 12, 11, 13, 13, 10, 13, 15, 15, 14, 10, 14, 11, 11, 15, 13, 13,
        14, 12, 14, 15, 13, 16, 12, 13, 13, 14, 11, 11, 13, 14, 12, 11, 12, 13,
        14, 13, 12, 11, 15, 11, 15, 13, 10, 12, 14, 14, 14, 13, 14, 13, 11, 16,
        15, 13, 16, 16, 15, 12, 13, 14, 14, 11])
prdicted_tokens = tensor([[ 8,  5, 10,  ..., 11, 11, 11],
        [ 9,  9,  9,  ...,  1, 11, 11],
        [ 8,  5, 11,  ...,  2,  1, 11],
        ...,
        [ 9,  2, 13,  ..., 1

Validation epoch 1:  71%|███████   | 2915/4114 [00:11<00:04, 251.58it/s]

prdicted_tokens = tensor([[ 8,  3, 11,  ...,  1, 11, 11],
        [ 9,  9,  9,  ..., 11, 11, 11],
        [ 5,  7,  9,  ..., 11, 11, 11],
        ...,
        [ 9, 13,  6,  ..., 11, 11, 11],
        [ 8,  4,  7,  ..., 11, 11, 11],
        [ 8,  5, 11,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([15, 14, 14, 15, 14, 13, 16, 13, 12, 14, 11, 12, 11, 12, 16, 14, 16, 12,
        13, 13, 14, 16, 10, 11, 15, 15, 14, 16, 15, 10, 16, 10, 13, 14, 13, 13,
        12, 11, 13, 14, 14, 13, 14, 10, 14, 15, 13, 11, 15, 14, 14, 10, 12, 16,
        12, 15, 12, 17, 11, 16, 13, 14, 13, 14])
prdicted_tokens = tensor([[ 8,  4,  3,  ..., 11, 10,  1],
        [ 5,  7, 13,  ..., 11, 11, 11],
        [ 4,  7, 10,  ..., 11, 11, 11],
        ...,
        [ 8,  4,  8,  ...,  

Validation epoch 1:  72%|███████▏  | 2967/4114 [00:11<00:04, 251.71it/s]

prdicted_tokens = tensor([[ 9,  5,  5,  ..., 11, 11, 11],
        [ 8,  5,  2,  ..., 11, 11, 11],
        [ 4,  4,  5,  ..., 11, 11, 11],
        ...,
        [ 4, 10, 13,  ..., 11, 11, 11],
        [ 8,  3,  8,  ..., 11, 11, 11],
        [ 5,  4,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 14, 12, 14, 16, 11, 15, 15, 13, 14, 15, 16, 16, 13, 13, 16, 16,  8,
        12, 15, 12, 15, 16, 14, 12, 17, 13, 16, 14, 14, 15, 13, 13, 15, 15, 11,
        13, 13, 15, 14, 13, 17, 13, 12, 13, 15, 13, 13, 16, 16, 14, 14, 12, 14,
        15, 13, 11, 12, 15, 14, 14, 13, 13, 12])
prdicted_tokens = tensor([[ 9,  3,  5,  ..., 11, 11, 11],
        [ 5,  7, 13,  ..., 11, 11, 11],
        [ 4,  6,  5,  ..., 11, 11, 11],
        ...,
        [ 8,  5, 10,  ..., 1

Validation epoch 1:  74%|███████▍  | 3046/4114 [00:12<00:04, 255.51it/s]

prdicted_tokens = tensor([[ 8,  9,  4,  ..., 11, 11, 11],
        [ 3,  4,  6,  ..., 11, 11, 11],
        [ 4,  2,  5,  ..., 11, 11, 11],
        ...,
        [ 8,  4,  5,  ..., 11, 11, 11],
        [ 8,  5,  3,  ...,  6,  1, 11],
        [ 9,  5,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 6, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([14, 14, 12, 16, 12, 13, 15, 14, 12, 14, 13, 13, 15, 12, 13, 14, 15, 12,
        15, 12, 16, 15, 14, 12, 10, 14, 15, 12, 10, 10, 15, 17, 14, 13, 13, 15,
        11, 14, 14, 16, 14, 13, 14, 14, 12, 12, 14, 12, 10, 14, 14, 15, 14, 16,
        12, 14, 11, 14, 12, 15, 13, 12, 16, 14])
prdicted_tokens = tensor([[ 9,  7,  5,  ...,  1, 11, 11],
        [ 9,  5, 10,  ..., 11, 11, 11],
        [ 3,  3,  6,  ..., 11, 11, 11],
        ...,
        [ 3,  2,  5,  ..., 1

Validation epoch 1:  75%|███████▍  | 3072/4114 [00:12<00:04, 254.54it/s]

prdicted_tokens = tensor([[ 9,  9,  5,  ...,  1, 11, 11],
        [ 8,  4,  8,  ...,  1, 11, 11],
        [ 9,  7,  5,  ...,  4, 11,  1],
        ...,
        [ 5,  8,  9,  ...,  2,  1, 11],
        [ 9,  5,  3,  ..., 11, 11, 11],
        [ 9,  9, 13,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  4, 11,  1],
        ...,
        [ 0,  0,  0,  ...,  2,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  1,  0,  0]])
batch_y_lens = tensor([14, 14, 16, 12, 12, 13, 13, 13, 16, 11, 13, 14, 11, 13, 11, 11, 13, 10,
        13, 15, 13, 10, 12, 16, 11, 14, 15, 14, 14, 12, 15, 16, 16, 15, 14, 16,
        12, 15, 13, 16, 15, 12, 14, 14, 14, 14, 14, 16, 12, 12, 15, 14, 13, 16,
        15, 14, 14, 13, 15, 14, 16, 15,  8, 14])
prdicted_tokens = tensor([[ 8,  3,  6,  ..., 11, 11, 11],
        [ 8,  7,  5,  ..., 11, 11, 11],
        [ 3,  3,  5,  ..., 11, 11, 11],
     

Validation epoch 1:  76%|███████▌  | 3124/4114 [00:12<00:03, 252.09it/s]

batch_y_lens = tensor([15, 15, 10, 16, 14, 14, 13, 13, 13, 14, 12, 11, 11, 14, 14, 14, 13, 12,
        12, 14, 16, 15, 14, 13, 14, 14, 10, 12, 13, 11, 15, 13, 13, 17, 15, 14,
        13, 13, 11, 12, 16, 13, 12, 13, 15, 13, 16, 11, 13, 16, 14, 16, 12, 11,
        10, 13, 14, 14, 12, 15, 14, 11, 11, 13])
prdicted_tokens = tensor([[ 8, 11,  9,  ...,  1, 11, 11],
        [ 9, 13,  4,  ...,  1, 11, 11],
        [ 5,  6,  5,  ..., 11, 11, 11],
        ...,
        [ 9,  5,  9,  ..., 11, 11, 11],
        [ 3,  3,  6,  ...,  5,  1, 11],
        [ 5, 10,  9,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 5, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([14, 14, 13, 13, 14, 12, 16, 14, 12, 12, 13, 12, 13, 16, 14, 15, 10, 13,
        15, 12, 14, 14, 14, 11, 14, 15, 15, 14, 14, 14, 15, 13, 14, 14, 13, 13,
    

Validation epoch 1:  77%|███████▋  | 3176/4114 [00:12<00:03, 253.11it/s]

prdicted_tokens = tensor([[ 8,  9,  5,  ...,  1, 11, 11],
        [ 5,  6, 13,  ..., 11, 11, 11],
        [ 3,  6,  6,  ..., 11, 11, 11],
        ...,
        [ 8,  5,  4,  ..., 11, 11, 11],
        [ 3,  2,  5,  ...,  6,  6,  1],
        [ 4,  9, 13,  ...,  7,  1, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 6, 6, 1],
        [0, 0, 0,  ..., 7, 1, 0]])
batch_y_lens = tensor([14, 13, 12, 12, 12, 15, 16, 14, 14, 12, 11, 11, 13, 13, 10, 14, 13, 15,
        14, 16, 14, 13, 13, 14, 16, 10, 13, 14, 12, 15, 12, 13, 12, 16, 10, 14,
        14, 11, 14, 14, 15, 15, 11, 15, 13, 12, 11, 15, 14, 12, 16, 13, 12, 15,
        13, 13, 12, 15, 15, 14, 13, 12, 16, 15])
prdicted_tokens = tensor([[ 5,  5,  6,  ..., 11, 11, 11],
        [ 3, 10,  5,  ..., 11, 11, 11],
        [ 9,  5,  5,  ..., 11, 11, 11],
        ...,
        [ 9,  9,  4,  ..., 1

Validation epoch 1:  78%|███████▊  | 3229/4114 [00:12<00:03, 251.05it/s]

prdicted_tokens = tensor([[ 3,  7, 13,  ..., 11, 11, 11],
        [ 9, 11,  9,  ...,  1, 11, 11],
        [ 4,  6,  9,  ..., 11, 11, 11],
        ...,
        [ 8,  3,  2,  ...,  2,  1, 11],
        [ 8,  4,  2,  ...,  1, 11, 11],
        [ 5,  7,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 2, 1, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([14, 15, 14, 16, 14, 11, 13, 14, 14, 10, 11, 11, 11, 11, 13, 16, 15, 17,
        14, 15, 14, 15, 12, 14, 15, 14, 14, 11, 12, 16, 12, 13, 15, 16, 13, 14,
        12, 14, 15, 13, 12, 15, 14, 16, 13,  9, 13, 12, 13, 16, 11, 11, 14, 14,
        13, 13, 15, 13, 14, 15, 14, 16, 15, 12])
prdicted_tokens = tensor([[ 3, 10,  6,  ..., 11, 11, 11],
        [ 8,  4,  4,  ...,  1, 11, 11],
        [ 5,  3, 13,  ..., 11, 11, 11],
        ...,
        [ 9,  7,  9,  ...,  

Validation epoch 1:  80%|███████▉  | 3280/4114 [00:13<00:03, 238.40it/s]

prdicted_tokens = tensor([[ 8,  5,  4,  ..., 11, 11, 11],
        [ 4,  6,  5,  ..., 11, 11, 11],
        [ 8,  3,  6,  ...,  1, 11, 11],
        ...,
        [ 3,  8,  6,  ..., 11, 11, 11],
        [ 3, 10, 13,  ..., 11, 11, 11],
        [ 8,  3,  8,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([11, 11, 15, 14, 12, 15, 14, 15, 13, 11, 15, 11, 15, 13, 14, 13, 13, 11,
        13, 14, 13, 12, 16, 16, 14, 13, 12, 11, 17, 13, 16, 12, 13, 15, 15, 13,
        13, 16, 14, 14, 15, 10, 11, 15, 11, 12, 15, 15, 11, 14, 12, 14, 11, 10,
        15, 12, 10, 15, 12, 13, 15, 12, 12, 14])
prdicted_tokens = tensor([[ 3,  8,  6,  ...,  1, 11, 11],
        [ 8,  4,  3,  ..., 11, 11, 11],
        [ 4,  8,  6,  ..., 11, 11, 11],
        ...,
        [ 9,  9,  5,  ..., 1

Validation epoch 1:  81%|████████  | 3330/4114 [00:13<00:03, 235.24it/s]

prdicted_tokens = tensor([[ 3,  5, 13,  ..., 11, 11, 11],
        [ 5,  6, 13,  ...,  1, 11, 11],
        [ 8,  5,  5,  ..., 11, 11, 11],
        ...,
        [ 9,  9,  9,  ...,  5,  8,  1],
        [ 5,  3, 13,  ..., 11, 11, 11],
        [ 3,  3, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 5, 8, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 14, 12, 10, 14, 15, 15, 12, 16, 14, 15, 12, 13, 14, 11, 12, 15, 13,
         9, 13, 12, 15, 12, 14, 16, 11, 14, 14, 16, 14, 14, 12, 14, 13, 14, 13,
        14, 15, 14, 14, 15, 14, 11, 15, 14, 12, 13, 12, 12, 11, 12, 15, 11, 14,
        11, 14, 11, 11, 13, 15, 11, 16, 13, 12])
prdicted_tokens = tensor([[ 3, 11, 10,  ...,  7,  6,  1],
        [ 3,  2,  5,  ..., 11, 11, 11],
        [ 5,  8, 13,  ..., 11, 11, 11],
        ...,
        [ 3,  3,  6,  ...,  

Validation epoch 1:  82%|████████▏ | 3380/4114 [00:13<00:03, 239.71it/s]

prdicted_tokens = tensor([[ 3,  2,  6,  ..., 11, 11, 11],
        [ 9,  2, 10,  ...,  2,  1, 11],
        [ 9,  2, 10,  ..., 11, 11, 11],
        ...,
        [ 8,  3,  7,  ...,  5,  1, 11],
        [ 8,  4,  4,  ...,  3,  7,  1],
        [ 9,  4,  6,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 2, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 5, 1, 0],
        [0, 0, 0,  ..., 3, 7, 1],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([12, 15, 13, 12, 13, 13, 11, 13, 11, 14, 13, 13, 11, 13, 15, 15, 14, 12,
        13, 14, 13, 15, 14, 12, 15, 14, 15, 12, 11, 15, 15, 11, 15, 14, 16, 11,
        13, 14, 14, 13, 12, 11, 16, 14, 11, 14, 11, 14, 13, 15, 13, 15, 15, 14,
        12, 15, 15, 15, 16, 15, 14, 15, 16, 13])
prdicted_tokens = tensor([[ 5,  5,  5,  ..., 11, 11, 11],
        [ 5,  2,  6,  ..., 11, 11, 11],
        [ 8,  4,  6,  ...,  8,  1, 11],
        ...,
        [ 4,  6,  5,  ...,  

Validation epoch 1:  83%|████████▎ | 3432/4114 [00:13<00:02, 244.33it/s]

prdicted_tokens = tensor([[ 8,  9,  7,  ...,  4,  1, 11],
        [ 5,  7,  5,  ...,  9,  1, 11],
        [ 9,  5,  8,  ...,  7,  1, 11],
        ...,
        [ 8,  4, 10,  ...,  1, 11, 11],
        [ 9,  3, 13,  ..., 11, 11, 11],
        [ 5, 11,  9,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 4, 1, 0],
        [0, 0, 0,  ..., 9, 1, 0],
        [0, 0, 0,  ..., 7, 1, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([15, 15, 15, 16, 13, 16,  9, 15, 11, 13, 11, 11, 15, 14, 16, 13, 14, 12,
        16, 12, 14,  9, 14, 12, 14, 14, 15, 14, 11, 14, 16, 14, 12, 12, 13, 16,
        11, 13, 12, 14, 16, 16, 11, 14, 10, 12, 12, 15, 14, 16, 11, 11, 11, 12,
        12, 16, 16, 14, 15, 13, 15, 14, 11, 13])
prdicted_tokens = tensor([[ 9,  5,  5,  ...,  3,  1, 11],
        [ 5,  2,  5,  ..., 11, 11, 11],
        [ 8,  9,  5,  ...,  3,  1, 11],
        ...,
        [ 9,  9,  9,  ..., 1

Validation epoch 1:  85%|████████▍ | 3482/4114 [00:14<00:02, 238.66it/s]

prdicted_tokens = tensor([[ 9, 11, 13,  ..., 11, 11, 11],
        [ 8,  3,  2,  ...,  1, 11, 11],
        [ 3,  9,  6,  ...,  2,  6,  1],
        ...,
        [ 5,  8,  9,  ..., 11, 11, 11],
        [ 9,  9,  9,  ...,  1, 11, 11],
        [ 2, 13,  5,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 2, 6, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([13, 14, 16, 11, 15, 10, 11, 11, 16, 14, 13, 14, 13, 12, 11, 11, 13, 12,
        12, 13, 15, 11, 14, 11, 14, 11, 14, 15, 11, 14, 12, 12, 15, 16, 14, 14,
        15, 15, 12, 12, 14, 14, 14, 11, 14, 12, 14, 16, 16, 13, 14, 12, 13, 16,
        12, 11, 12, 13, 12, 13, 14, 11, 14, 14])
prdicted_tokens = tensor([[ 4,  9, 13,  ..., 11, 11, 11],
        [ 4, 10, 13,  ...,  1, 11, 11],
        [ 4,  7, 13,  ..., 11, 11, 11],
        ...,
        [ 8,  7, 13,  ..., 1

Validation epoch 1:  86%|████████▌ | 3533/4114 [00:14<00:02, 242.99it/s]

prdicted_tokens = tensor([[ 9,  4,  5,  ...,  4,  1, 11],
        [ 4,  9, 13,  ..., 11, 11, 11],
        [ 4,  5,  6,  ..., 11, 11, 11],
        ...,
        [11,  5,  3,  ..., 11, 11, 11],
        [ 4,  3,  3,  ..., 11, 11, 11],
        [ 4,  5,  6,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 4, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([16, 12, 13, 14, 14, 11, 15, 12, 14, 13, 14, 13, 14, 13, 13, 11, 13, 12,
        14, 13, 15, 14, 15, 14, 15, 13, 12, 16, 13, 15, 15, 17, 14, 15, 16, 15,
        13, 12, 13, 14, 12, 13, 12, 14, 16, 15, 13, 11, 12, 12, 11, 11, 13, 16,
        13,  9, 10, 12, 16, 12, 16, 10, 11, 15])
prdicted_tokens = tensor([[ 8,  4,  6,  ...,  7,  1, 11],
        [ 9, 11, 13,  ...,  5,  3,  1],
        [ 8,  5, 10,  ...,  1, 11, 11],
        ...,
        [ 8,  4,  5,  ..., 1

Validation epoch 1:  87%|████████▋ | 3585/4114 [00:14<00:02, 247.15it/s]

prdicted_tokens = tensor([[ 8,  9,  2,  ...,  1, 11, 11],
        [ 3,  2, 13,  ...,  2,  1, 11],
        [ 8,  4,  3,  ...,  7,  1, 11],
        ...,
        [ 8,  4, 11,  ...,  8,  1, 11],
        [ 9,  7,  9,  ..., 11, 11, 11],
        [ 5, 10,  5,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 2, 1, 0],
        [0, 0, 0,  ..., 7, 1, 0],
        ...,
        [0, 0, 0,  ..., 8, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([15, 16, 16, 13, 11, 10, 12, 10, 16, 15, 12, 14, 14, 13, 13, 13, 13, 13,
        13, 12, 12, 14, 15, 12, 13,  9, 12, 13, 16, 10, 13, 12, 12, 14, 11, 11,
        11, 17, 13, 14, 14, 15, 11, 11, 12, 12, 15, 10, 14, 14, 14, 13, 14, 14,
        13, 12, 10, 15, 13, 15, 12, 16, 11, 10])
prdicted_tokens = tensor([[ 8, 13,  4,  ..., 11, 11, 11],
        [ 9, 13,  3,  ..., 11, 11, 11],
        [ 4,  6,  5,  ..., 11, 11, 11],
        ...,
        [ 4,  5,  6,  ..., 1

Validation epoch 1:  88%|████████▊ | 3635/4114 [00:14<00:01, 244.70it/s]

prdicted_tokens = tensor([[ 8,  5, 11,  ...,  1, 11, 11],
        [ 8, 13,  8,  ..., 11, 11, 11],
        [ 3,  3,  5,  ..., 11, 11, 11],
        ...,
        [ 3,  4, 13,  ...,  1, 11, 11],
        [ 5,  9,  9,  ..., 11, 11, 11],
        [ 8,  5,  8,  ...,  5,  2,  1]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 5, 2, 1]])
batch_y_lens = tensor([14, 11, 12, 14, 13, 13, 16, 15, 14, 15, 11, 14, 14, 13, 12, 15, 14, 15,
        13, 11, 16, 11, 13, 11, 14, 12, 13, 10, 11, 11, 12, 14, 14, 10, 15, 12,
        13, 15, 16, 13, 12, 15, 13, 14, 14, 14, 15, 16, 16, 15, 10, 15, 16, 14,
        13, 12, 10, 12, 15, 13, 15, 14, 13, 16])
prdicted_tokens = tensor([[ 9,  9,  9,  ..., 11, 11, 11],
        [ 9, 13,  8,  ..., 11, 11, 11],
        [ 3, 10,  6,  ..., 11, 11, 11],
        ...,
        [ 8,  9,  9,  ...,  

Validation epoch 1:  90%|████████▉ | 3685/4114 [00:14<00:01, 247.02it/s]

prdicted_tokens = tensor([[ 9,  9,  9,  ...,  8,  1, 11],
        [ 4,  3, 11,  ..., 11, 11, 11],
        [ 3,  7, 13,  ..., 11, 11, 11],
        ...,
        [ 2, 13,  4,  ..., 11, 11, 11],
        [ 8,  5,  5,  ...,  7, 11,  1],
        [ 4,  9,  9,  ...,  5,  1, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  8,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  7, 11,  1],
        [ 0,  0,  0,  ...,  5,  1,  0]])
batch_y_lens = tensor([15, 12, 13, 16, 10, 13, 12, 15, 11, 10, 12, 11, 12, 12, 13, 15, 15, 14,
        14, 13, 11, 16, 14, 12, 14, 15, 15, 12, 14, 14, 12, 12, 14, 10, 12, 15,
        16, 12, 10, 14, 13, 12, 14, 13, 15, 12, 12, 12, 12, 11, 13, 12, 11, 12,
        15, 12, 14, 13, 13, 15, 10, 13, 16, 15])
prdicted_tokens = tensor([[ 8,  4,  3,  ..., 11, 11, 11],
        [ 5,  9, 13,  ..., 11, 11, 11],
        [ 4,  7, 13,  ...,  3,  1, 11],
     

Validation epoch 1:  91%|█████████ | 3736/4114 [00:15<00:01, 247.70it/s]

prdicted_tokens = tensor([[ 8,  9, 10,  ...,  7,  1, 11],
        [ 4,  3,  6,  ...,  1, 11, 11],
        [ 5,  5,  4,  ..., 11, 11, 11],
        ...,
        [ 4,  6,  5,  ...,  1, 11, 11],
        [ 5,  6,  9,  ..., 11, 11, 11],
        [11,  5, 11,  ..., 11,  1, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  7,  1,  0],
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ..., 11,  1,  0]])
batch_y_lens = tensor([15, 14, 11, 12, 15, 15, 11, 13, 12, 14, 11, 11, 13, 15, 13, 11, 16, 15,
        15, 11, 15, 14, 13, 12, 11, 16, 14, 11, 15, 14, 16, 10, 12, 16, 12, 15,
        15, 15, 13, 16, 15, 13, 14, 12, 13, 12, 14, 13, 15, 12, 13, 15, 15, 13,
        16, 13, 13, 14, 13, 13, 12, 14, 13, 15])
prdicted_tokens = tensor([[ 3,  5,  5,  ..., 11, 11, 11],
        [ 2, 10,  8,  ..., 11, 11, 11],
        [ 3,  3,  6,  ...,  1, 11, 11],
     

Validation epoch 1:  92%|█████████▏| 3789/4114 [00:15<00:01, 249.72it/s]

prdicted_tokens = tensor([[ 9,  2, 13,  ...,  1, 11, 11],
        [ 9,  9,  9,  ..., 11, 11, 11],
        [ 9,  9,  5,  ..., 11, 11, 11],
        ...,
        [ 9, 11,  9,  ...,  1, 11, 11],
        [ 3,  4,  5,  ..., 11, 11, 11],
        [ 8,  4,  7,  ...,  2,  2,  1]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 2, 2, 1]])
batch_y_lens = tensor([14, 11, 12, 13, 14, 14, 14, 13, 16, 16, 12, 13, 13, 15, 13, 10, 10, 15,
        13, 12, 16, 14, 13, 12, 15, 12, 12, 12, 16, 13, 14, 13, 16, 11, 13, 15,
        15, 14, 16, 13, 12, 15, 13, 10, 15, 13, 12, 13, 14, 12, 16, 14, 13,  9,
        15, 14, 14, 12, 13, 13, 13, 14, 12, 16])
prdicted_tokens = tensor([[ 8, 10, 13,  ...,  9,  1, 11],
        [ 4,  9, 13,  ..., 11, 11, 11],
        [ 4,  9, 13,  ..., 11, 11, 11],
        ...,
        [ 3,  3,  5,  ...,  

Validation epoch 1:  93%|█████████▎| 3839/4114 [00:15<00:01, 244.40it/s]

prdicted_tokens = tensor([[ 3,  5,  6,  ..., 11, 11, 11],
        [ 4,  8,  6,  ...,  4,  1, 11],
        [ 4, 11,  5,  ..., 11, 11, 11],
        ...,
        [ 9, 10, 13,  ..., 11, 11, 11],
        [ 9, 11,  5,  ..., 11, 11, 11],
        [ 9,  4,  3,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 4, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([13, 15, 12, 11, 15, 16, 16, 14, 13, 15, 14,  9, 14, 13, 14, 12, 13, 11,
        13, 16, 13, 11, 12, 12, 13, 15, 12, 14, 12, 13, 13, 12, 11, 15, 16, 15,
        13, 15, 16, 10, 13, 14, 14, 13, 15, 11, 16, 16, 12, 12, 12, 14, 14, 13,
        13, 14, 16, 12, 15, 16, 16, 12, 12, 14])
prdicted_tokens = tensor([[ 8,  9,  9,  ..., 11, 11, 11],
        [ 8,  5,  8,  ...,  1, 11, 11],
        [ 8, 13,  5,  ..., 11, 11, 11],
        ...,
        [ 8,  5,  9,  ...,  

Validation epoch 1:  95%|█████████▍| 3890/4114 [00:15<00:00, 245.04it/s]

prdicted_tokens = tensor([[ 3, 10,  5,  ..., 11, 11, 11],
        [ 5,  8, 13,  ..., 11, 11, 11],
        [ 9,  4,  5,  ..., 11, 11, 11],
        ...,
        [ 5, 10,  9,  ...,  4,  6,  1],
        [ 4,  3,  3,  ..., 11, 11, 11],
        [ 8,  9,  8,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 4, 6, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([13, 11, 12, 14, 14, 13, 15, 13, 13, 11, 11, 13, 11, 12, 16, 12, 15, 10,
        13, 14, 13, 11, 11, 13, 13, 13, 15, 13, 14, 13, 16, 13, 12, 15, 11, 11,
        13, 13, 15, 16, 12, 14, 12, 15, 15, 14, 14, 13, 14, 16, 10, 14, 13, 15,
        14, 11, 13, 15, 12, 14, 16, 16, 10, 13])
prdicted_tokens = tensor([[ 5,  6, 13,  ...,  5,  1, 11],
        [ 8,  3,  6,  ...,  1, 11, 11],
        [ 3,  8, 13,  ..., 11, 11, 11],
        ...,
        [ 9, 13,  4,  ..., 1

Validation epoch 1:  96%|█████████▌| 3942/4114 [00:15<00:00, 248.93it/s]

prdicted_tokens = tensor([[ 4,  4,  6,  ..., 11, 11, 11],
        [ 8,  4,  5,  ..., 11,  7,  1],
        [ 8,  9, 13,  ..., 11, 11, 11],
        ...,
        [ 3,  6,  6,  ...,  1, 11, 11],
        [ 5,  4,  6,  ...,  2,  1, 11],
        [ 8,  4,  6,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ..., 11,  7,  1],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ...,  2,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]])
batch_y_lens = tensor([12, 16, 13, 13, 15, 13, 16, 13, 16, 16, 16, 12, 14, 14, 13, 16, 13, 16,
        13, 13, 15, 14, 13, 13, 14, 16, 14, 13, 13, 12, 13, 15, 16, 13, 15, 11,
        11, 16, 11, 14, 16, 11, 13, 14, 16, 16, 16, 14, 16, 15, 14, 15, 12, 11,
        16, 14, 13, 14, 14, 12, 11, 14, 15, 12])
prdicted_tokens = tensor([[ 3,  9,  5,  ..., 11, 11, 11],
        [ 8,  4,  6,  ..., 11, 11, 11],
        [ 9,  9,  9,  ..., 11, 11, 11],
     

Validation epoch 1:  97%|█████████▋| 3993/4114 [00:16<00:00, 248.83it/s]

prdicted_tokens = tensor([[ 4,  4, 13,  ...,  1, 11, 11],
        [ 5, 11,  9,  ..., 10,  1, 11],
        [ 9,  5, 13,  ..., 11, 11, 11],
        ...,
        [ 5, 13,  3,  ..., 11, 11, 11],
        [ 9,  4,  6,  ..., 11, 11, 11],
        [ 8,  5,  9,  ...,  4,  6,  1]], device='cuda:0')
batch_y = tensor([[ 0,  0,  0,  ...,  1,  0,  0],
        [ 0,  0,  0,  ..., 10,  1,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  4,  6,  1]])
batch_y_lens = tensor([14, 15, 10, 14, 14, 13, 15, 14, 14, 16, 14, 14, 14, 10, 11, 13, 16, 16,
         9, 16, 14, 16, 14, 14, 12, 11, 14, 13, 12, 13, 13, 14, 13, 11, 12, 14,
        12, 12, 13, 12, 15, 14, 13, 12, 14, 11, 11, 12, 11, 11, 14, 14,  9, 15,
        12, 13, 11, 14, 15, 12, 14, 11, 12, 16])
prdicted_tokens = tensor([[ 4,  2,  5,  ..., 11, 11, 11],
        [ 5,  3,  5,  ..., 11, 11, 11],
        [ 4,  6,  5,  ..., 11, 11, 11],
     

Validation epoch 1:  99%|█████████▉| 4072/4114 [00:16<00:00, 250.70it/s]

prdicted_tokens = tensor([[ 5, 10, 13,  ...,  6,  1, 11],
        [ 4,  7, 10,  ...,  1, 11, 11],
        [ 4,  2, 13,  ...,  1, 11, 11],
        ...,
        [ 5, 11,  5,  ..., 11, 11, 11],
        [ 5, 10,  9,  ...,  9,  8,  1],
        [ 3,  3, 13,  ..., 11, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 6, 1, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 9, 8, 1],
        [0, 0, 0,  ..., 0, 0, 0]])
batch_y_lens = tensor([15, 14, 14, 13, 12, 15, 15, 15, 14, 13, 14, 11, 15, 11, 14, 15, 14, 14,
        15, 15, 13, 15, 12, 16, 16, 16, 12, 13, 11, 14, 14, 11, 16, 16, 15, 14,
        14, 11, 14, 14, 12, 13, 11, 13, 13, 15, 13, 16, 12, 11, 15, 12, 12, 13,
        13, 11, 11, 14, 13, 13, 13, 12, 16, 12])
prdicted_tokens = tensor([[ 8,  5,  4,  ...,  1, 11, 11],
        [ 3, 10, 13,  ...,  1, 11, 11],
        [ 3,  4, 13,  ..., 10,  1, 11],
        ...,
        [ 3, 11, 13,  ..., 1

Validation epoch 1: 100%|██████████| 4114/4114 [00:16<00:00, 247.85it/s]

prdicted_tokens = tensor([[ 8, 13,  3,  ..., 11, 11, 11],
        [ 9,  8,  5,  ..., 11, 11, 11],
        [ 3,  6, 13,  ...,  1, 11, 11],
        ...,
        [ 9,  4,  6,  ...,  8,  1, 11],
        [ 4, 11,  5,  ..., 11, 11, 11],
        [ 3,  5,  6,  ...,  1, 11, 11]], device='cuda:0')
batch_y = tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 8, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]])
batch_y_lens = tensor([11, 13, 14, 13, 16, 14, 14, 14, 13, 12, 14, 16, 15, 14,  9, 11, 11, 10,
        11, 14, 11, 15, 15, 12, 10, 10, 16, 10, 13, 12, 14, 15, 14, 14, 14, 13,
        15, 13, 13, 15, 13, 13, 14, 14, 12, 11, 11, 14, 14, 15, 16, 13, 15, 16,
        13, 14, 15, 12, 14, 15, 15, 15, 12, 14])
prdicted_tokens = tensor([[ 8,  3, 10,  ...,  1, 11, 11],
        [ 4, 10,  6,  ..., 11, 11, 11],
        [ 3,  5,  6,  ..., 11, 11, 11],
        ...,
        [ 8,  5,  8,  ...,  




# Generation
Use `model.generator` and provide an initial character to automatically generate a sequence.

In [129]:
model = model.to("cpu")
print("".join(model.generator('1+1=')))

TypeError: argmax() got an unexpected keyword argument 'num'