[![Kaggle](https://img.shields.io/badge/Kaggle-Notebook-blue?logo=kaggle)](https://www.kaggle.com/code/bnarayanareddy/iiith-neurallangmodel-3)


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import math
import time
import os
import random
import random
import re
import unicodedata
from tqdm import tqdm

2025-11-14 16:34:51.693942: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763138091.869645      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763138091.921157      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [2]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tokenizers.trainers import BpeTrainer

In [3]:
# Config
seed = 1234
DATA_DIR = 'data'
SEQ_LEN = 50
VOCAB_SIZE = 5000
EMB_DIM = 128
ENC_HIDDEN_DIM = 256
DEC_HIDDEN_DIM = 256
DROPOUT = 0.3
N_EPOCHS = 200
LEARNING_RATE = 1e-2
BATCH_SIZE = 64
# Output directories
CHECKPOINT_DIR = 'checkpoints'
LOG_DIR = 'runs'
tokenizer_path = "tknzer_dir"
special_tokens = ["<pad>", "<st>", "<end>", "<unk>"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
seed_everything(seed)

In [5]:
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

In [6]:
def load_txt(txt_file):
    with open(txt_file, "r", encoding="utf-8") as f:
        text = f.read()
    return text

In [7]:
dataset_path = "/kaggle/input/iiith-assignment2-dataset/dataset/Pride_and_Prejudice-Jane_Austen.txt"

In [8]:
def normalize_text(text, lowercase = True):
    if lowercase:
        text = text.lower()
    # Unicode Normalization (e.g., converting fancy quotes to standard ones)
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[\*_`\[\]\^{}]', '', text) # handle _word_ and *word*
    text = re.sub(r'\b(mr|mrs|ms|dr|st)\.', r'\1', text, flags=re.IGNORECASE) # handle mr. mrs. etc
    text = text.strip()
    return text

In [9]:
custom_dataset = normalize_text(load_txt(dataset_path))

In [10]:
len(set(custom_dataset)), len(list(custom_dataset))

(56, 702635)

In [11]:
def get_tokenizer(dataset, vocab_size, save_fldr, name = 'bpe'):
    if os.path.exists(save_fldr):
        save_path = os.path.join(save_fldr, 'tokenizer.json')
        return Tokenizer.from_file(save_path)
    if name == 'bpe':
        tokenizer = Tokenizer(models.BPE())
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
        tokenizer.train_from_iterator([dataset], trainer)
        if save_fldr:
            os.makedirs(save_fldr, exist_ok=True)
            tokenizer.save(os.path.join(save_fldr, 'tokenizer.json'))
            return tokenizer

In [12]:
bpe_tokenizer = get_tokenizer(custom_dataset, VOCAB_SIZE, tokenizer_path, 'bpe')






In [13]:
tokenized_data = bpe_tokenizer.encode(custom_dataset).ids

In [14]:
len(tokenized_data)

154018

In [15]:
def get_data(data, split, train_percent = 0.8):
    n = len(data)
    split_percent = train_percent
    if split == "train":
        return data[ : int((split_percent)*n)]
    elif split == "val":
        return data[int((split_percent)*n) : ]
    else:
        return None

In [16]:
train_data = get_data(tokenized_data,'train', train_percent = 0.8)
val_data = get_data(tokenized_data,'val', train_percent = 0.8)

In [17]:
len(train_data)

123214

In [18]:
# Special token ids
PAD_TOKEN = bpe_tokenizer.token_to_id('<pad>')
ST_TOKEN = bpe_tokenizer.token_to_id('<st>')
END_TOKEN = bpe_tokenizer.token_to_id('<end>')
FULL_STOP = bpe_tokenizer.token_to_id('.')
QUESTION_TOKEN = bpe_tokenizer.token_to_id('?')
EXCLAMATION = bpe_tokenizer.token_to_id('!')
special_ids = PAD_TOKEN, ST_TOKEN, END_TOKEN, FULL_STOP, QUESTION_TOKEN, EXCLAMATION
special_ids

(0, 1, 2, 15, 29, 4)

In [19]:
# Dataset
class SimpleDataset(Dataset):
    def __init__(self, data, special_ids, seq_len):
        super().__init__()
        self.token_ids = data
        self.seq_len = seq_len
        self.pad, self.st, self.end, self.full_stop, self.quest, self.excl= special_ids
        self.sentence_enders = {self.full_stop, self.quest, self.excl}
        
        self.inputs = []
        self.decoder_inputs = []
        self.decoder_targets = []
        
        
        self.create_sequences()
        
    def create_sequences(self):
        tokens = self.token_ids
        n = len(tokens)
        L = self.seq_len
        
        i = 0
        
        while i < n-1:
            enc_seq = []
            seq_end_idx = i 
            for j in range(L):
                current_idx = i + j
                if current_idx >= n:
                    break
                token = tokens[current_idx]
                
                enc_seq.append(token)
                seq_end_idx = current_idx
                
                if token in self.sentence_enders:
                    break
            
            
            if not enc_seq or len(enc_seq) < 2:
                i = seq_end_idx + 1 # Move to the next token
                continue

            dec_target_seq = enc_seq[1:] + [self.end]
            dec_input_seq = [self.st] + dec_target_seq[:-1]
                
            # Pad sequences if shorter
            enc_seq_len = len(enc_seq)
            enc_seq = enc_seq + [self.pad] * max(L - enc_seq_len, 0)
            dec_input_seq = dec_input_seq + [self.pad] * max(L - enc_seq_len, 0)
            dec_target_seq = dec_target_seq + [self.pad] * max(L - enc_seq_len, 0)

            self.inputs.append(enc_seq)
            self.decoder_inputs.append(dec_input_seq)
            self.decoder_targets.append(dec_target_seq)
            
            i = seq_end_idx + 1
    
    def __len__(self):
        return len(self.inputs)
    
    
    def __getitem__(self, idx):
        return (
            torch.tensor(self.inputs[idx], dtype=torch.long), # encoder input 50 timesteps
            torch.tensor(self.decoder_inputs[idx], dtype=torch.long), # decoder inputs shifter from target 50 steps
            torch.tensor(self.decoder_targets[idx], dtype=torch.long) # decoder targets 50 steps
        )

In [20]:
train_dataset = SimpleDataset(train_data, special_ids, SEQ_LEN)
val_dataset = SimpleDataset(val_data, special_ids, SEQ_LEN)

In [21]:
len(train_dataset), len(val_dataset)

(4445, 1350)

In [22]:
# Example
val_dataset[0]

(tensor([ 370, 2784,   13,  406,  876,   15,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0]),
 tensor([   1, 2784,   13,  406,  876,   15,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0]),
 tensor([2784,   13,  406,  876,   15,    2,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0, 

In [23]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [24]:
%%writefile model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

# ENCODER
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.enc_hidden_dim = hidden_dim
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=4,
            batch_first=True,
            bidirectional=True,
            dropout=dropout
        )
    
    def forward(self, x):
        # x: [batch, seq_len]
        emb = self.embedding(x)  # [batch, seq_len, emb_dim]
        outputs, (h, c) = self.lstm(emb)  # outputs: [batch, seq_len, 2*hidden]
        return outputs, (h, c)

# ATTENTION 
class Attention(nn.Module):
    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super().__init__()
        
        # We need to align the dimensions of encoder outputs and decoder hidden state
        # enc_hidden_dim is for one direction, but encoder is bidirectional (2*)
        self.attn = nn.Linear((enc_hidden_dim * 2) + dec_hidden_dim, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias = False)
        
    def forward(self, dec_hidden, enc_outputs):
        # dec_hidden: [batch, dec_hidden_dim] (from the *top layer* of decoder)
        # enc_outputs: [batch, src_len, enc_hidden_dim * 2]
        
        batch_size = enc_outputs.shape[0]
        src_len = enc_outputs.shape[1]
        
        # Repeat decoder hidden state src_len times to concatenate
        # dec_hidden: [batch, src_len, dec_hidden_dim]
        dec_hidden = dec_hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        # energy: [batch, src_len, (enc_hidden * 2) + dec_hidden]
        energy_input = torch.cat((dec_hidden, enc_outputs), dim = 2)
        
        # energy: [batch, src_len, dec_hidden_dim]
        energy = torch.tanh(self.attn(energy_input))
        
        # v(energy): [batch, src_len, 1] -> [batch, src_len]
        attention = self.v(energy).squeeze(2)
        
        # Return softmax'd weights
        return F.softmax(attention, dim=1)

# Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, enc_hidden_dim, dec_hidden_dim, dropout=0.1):
        super().__init__()
        self.dec_hidden_dim = dec_hidden_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        
        self.attention = Attention(enc_hidden_dim, dec_hidden_dim)
        
        self.lstm = nn.LSTM(
            input_size=emb_dim + (enc_hidden_dim * 2),
            hidden_size=dec_hidden_dim,
            num_layers=4,
            batch_first=True,
            dropout=dropout
        )
        
        self.fc = nn.Linear(dec_hidden_dim + (enc_hidden_dim * 2), vocab_size)
    
    def forward(self, dec_input, dec_hidden, enc_outputs):
        # dec_input: [batch] -> current token IDs
        # dec_hidden: (h, c) from previous step
        # enc_outputs: [batch, src_len, 2*enc_hidden_dim]
        
        # dec_input [batch] -> [batch, 1]
        dec_input = dec_input.unsqueeze(1)
        
        # emb: [batch, 1, emb_dim]
        emb = self.embedding(dec_input)
        
       
        a = self.attention(dec_hidden[0][-1], enc_outputs)
        
        a = a.unsqueeze(1)
        
        context = torch.bmm(a, enc_outputs)
        
        lstm_input = torch.cat([emb, context], dim=2)
        output, dec_hidden = self.lstm(lstm_input, dec_hidden)
        
        output = output.squeeze(1)
        context = context.squeeze(1)

        concat_output = torch.cat([output, context], dim=1)
        
        logits = self.fc(concat_output)
        
        return logits, dec_hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        self.enc_num_layers = self.encoder.lstm.num_layers
        self.enc_num_directions = 2 if self.encoder.lstm.bidirectional else 1
        self.enc_hidden_dim = self.encoder.enc_hidden_dim
        self.dec_hidden_dim = self.decoder.dec_hidden_dim
        
        self.fc_hidden = nn.Linear(self.enc_hidden_dim * self.enc_num_directions, self.dec_hidden_dim)
        self.fc_cell = nn.Linear(self.enc_hidden_dim * self.enc_num_directions, self.dec_hidden_dim)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.fc.out_features
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        enc_outputs, (h, c) = self.encoder(src)
        
        h = h.view(self.enc_num_layers, self.enc_num_directions, batch_size, self.enc_hidden_dim)
        c = c.view(self.enc_num_layers, self.enc_num_directions, batch_size, self.enc_hidden_dim)
        
        h_cat = torch.cat((h[:, 0, :, :], h[:, 1, :, :]), dim=2)
        c_cat = torch.cat((c[:, 0, :, :], c[:, 1, :, :]), dim=2)
        
        dec_hidden = (torch.tanh(self.fc_hidden(h_cat)), 
                      torch.tanh(self.fc_cell(c_cat)))
        
        dec_input = trg[:, 0]
        
        # Loop from 0, store at t
        for t in range(trg_len):
            
            # The decoder's forward pass now includes the attention mechanism
            logits, dec_hidden = self.decoder(dec_input, dec_hidden, enc_outputs)
            
            outputs[:, t] = logits
            
            use_teacher_force = True
            
            if use_teacher_force:
                if t < trg_len - 1:
                    dec_input = trg[:, t+1]
            
        return outputs

Writing model.py


In [25]:
from model import Encoder, Decoder, Seq2Seq

In [26]:
encoder = Encoder(VOCAB_SIZE, EMB_DIM, ENC_HIDDEN_DIM, dropout=DROPOUT)
decoder = Decoder(VOCAB_SIZE, EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, dropout=DROPOUT)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

In [27]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5000, 128)
    (lstm): LSTM(128, 256, num_layers=4, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(5000, 128)
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
      (v): Linear(in_features=256, out_features=1, bias=False)
    )
    (lstm): LSTM(640, 256, num_layers=4, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=768, out_features=5000, bias=True)
  )
  (fc_hidden): Linear(in_features=512, out_features=256, bias=True)
  (fc_cell): Linear(in_features=512, out_features=256, bias=True)
)

In [28]:
def load_ckpt(model):
    if os.path.exists(CHECKPOINT_DIR):
        models = [os.path.join(CHECKPOINT_DIR,ckpt) for ckpt in os.listdir(CHECKPOINT_DIR)]
        if len(models) > 0:
            sorted_models = sorted(models, key=lambda x: x.split("_val")[1].split('.pt')[0])
            model.load_state_dict(torch.load(sorted_models[0], map_location=DEVICE))
            print(f"Loaded checkpoint from {sorted_models[0]}")
            best_loss = float(os.path.basename(sorted_models[0]).split("_val")[1].split('.pt')[0])
            last_epoch = int(os.path.basename(sorted_models[0]).split("_val")[0].split('Epoch')[1])
            return best_loss, last_epoch
    

In [29]:
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)

In [30]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)

In [31]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for i, (src, dec_in, trg) in enumerate(iterator):
        
        src, dec_in, trg = src.to(DEVICE), dec_in.to(DEVICE), trg.to(DEVICE)
        
        optimizer.zero_grad()
        output = model(src, dec_in)
        
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        optimizer.step()
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [32]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, (src, dec_in, trg) in enumerate(iterator):
            src, dec_in, trg = src.to(DEVICE), dec_in.to(DEVICE), trg.to(DEVICE)
            output = model(src, dec_in)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [33]:
def generate_seq(model, texts, tokenizer, seq_len, special_ids, gen_len=100):
    model.eval()
    samples = []
    pad, st, end, full_stop, quest, excl= special_ids

    with torch.no_grad():
        for text in texts:
            # prepare encoder input
            text = normalize_text(text)
            ids = tokenizer.encode(text).ids
            if len(ids) < seq_len:
                ids = ids + [pad] * (seq_len - len(ids))
            else:
                print("Warning: More than seq length -- considering first 50 tokens:")
                ids = ids[:seq_len]

            src = torch.tensor([ids], dtype=torch.long, device=DEVICE)
            # print(src.shape)
            # print(src)

            # start decoder with only the start token (length = 1)
            dec_in = torch.tensor([[st]], dtype=torch.long, device=DEVICE)

            generated = []
            for step in range(gen_len):
                out = model(src, dec_in)          # [1, cur_dec_len, vocab_size]
                next_token = int(out[0, -1].argmax().cpu().item())  # last timestep prediction
                generated.append(next_token)
                
                # if next_token == end:
                #     break

                

                # append predicted token to decoder input for next step
                dec_in = torch.cat(
                    [dec_in, torch.tensor([[next_token]], dtype=torch.long, device=DEVICE)],
                    dim=1
                )
            # convert ids -> tokens
            # print("Out Tokens", generated)
            tokens = [tokenizer.id_to_token(tid) for tid in generated]
            samples.append((text, tokens))

    print("=== Sample generations ===")
    for idx, (inp, toks) in enumerate(samples, 1):
        print(f"[{idx}] INPUT : {inp}")
        print(f"OUTPUT: {' '.join(toks)}")
    print("======================================================")
    return samples

In [34]:
writer = SummaryWriter(LOG_DIR)

In [35]:
# text from internet
"""If I could have ceased what pendulums swung, or wheels turned, 
or water clocks emptied, then, in order to keep the Fates from marching in time,
I would have, for though it is what a boy naturally wishes when he fears change will 
come upon what he loves and take it away, a man remembers it, too, and in his heart 
wishes the same when all around him he feels only loss, loss that has been his 
companion for some time, and promises to remain at his side."""


"""
It is quite an interesting format, if you don't like the opening you are reading,
then you can skip and go to the next book opening. If the opening does capture your attention
you can click at the bottom to reveal the title and author. It reminds me of blind date books 
where bookstores will wrap a random book up in paper (to prevent the title and author from being seen)
and will write brief descriptions about it.


"""

"\nIt is quite an interesting format, if you don't like the opening you are reading,\nthen you can skip and go to the next book opening. If the opening does capture your attention\nyou can click at the bottom to reveal the title and author. It reminds me of blind date books \nwhere bookstores will wrap a random book up in paper (to prevent the title and author from being seen)\nand will write brief descriptions about it.\n\n\n"

In [36]:
CUSTOM_TEXTS = [
                "If I could have ceased",
                "It is quite an interesting format, if you"
        ]

In [37]:
def run_lm(predict_while_train = True, resume_ckpt = True):
    best_valid_loss = float('inf')
    last_epoch = 0
    if resume_ckpt:
        if os.listdir(CHECKPOINT_DIR):
            best_valid_loss, last_epoch = load_ckpt(model)
    
    for epoch in tqdm(range(N_EPOCHS), desc="Started Training"):
        epoch += last_epoch
        start_time = time.time()
        
        if predict_while_train:
            if epoch%5 == 0:
                generate_seq(model, CUSTOM_TEXTS, bpe_tokenizer, SEQ_LEN, special_ids, gen_len=50)

        train_loss = train(model, train_loader, optimizer, criterion)
        valid_loss = evaluate(model, val_loader, criterion)

        scheduler.step(valid_loss)
        
        end_time = time.time()
        
        epoch_mins = int((end_time - start_time) / 60)
        epoch_secs = int((end_time - start_time) - (epoch_mins * 60))

        # Save checkpoint if validation loss has improved
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, f'Epoch{epoch}_val{valid_loss}.pt'))
            print(f"Checkpoint saved: New best validation loss {best_valid_loss:.3f}")

        # TensorBoard logging
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Perplexity/train', math.exp(train_loss), epoch)
        writer.add_scalar('Loss/validation', valid_loss, epoch)
        writer.add_scalar('Perplexity/validation', math.exp(valid_loss), epoch)
        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [38]:
run_lm(predict_while_train = True, resume_ckpt = True)

Started Training:   0%|          | 0/200 [00:00<?, ?it/s]

=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser
[2] INPUT : it is quite an interesting format, if you
OUTPUT: asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser asser


Started Training:   0%|          | 1/200 [00:23<1:19:16, 23.90s/it]

Checkpoint saved: New best validation loss 6.409
Epoch: 01 | Time: 0m 23s
	Train Loss: 6.361 | Train PPL: 578.872
	 Val. Loss: 6.409 |  Val. PPL: 607.430


Started Training:   1%|          | 2/200 [00:45<1:13:36, 22.31s/it]

Checkpoint saved: New best validation loss 6.295
Epoch: 02 | Time: 0m 21s
	Train Loss: 6.088 | Train PPL: 440.755
	 Val. Loss: 6.295 |  Val. PPL: 541.855


Started Training:   2%|▏         | 3/200 [01:06<1:11:33, 21.79s/it]

Checkpoint saved: New best validation loss 6.183
Epoch: 03 | Time: 0m 21s
	Train Loss: 5.967 | Train PPL: 390.307
	 Val. Loss: 6.183 |  Val. PPL: 484.253


Started Training:   2%|▏         | 4/200 [01:27<1:10:23, 21.55s/it]

Checkpoint saved: New best validation loss 6.168
Epoch: 04 | Time: 0m 21s
	Train Loss: 5.878 | Train PPL: 357.052
	 Val. Loss: 6.168 |  Val. PPL: 477.406


Started Training:   2%|▎         | 5/200 [01:48<1:09:35, 21.41s/it]

Checkpoint saved: New best validation loss 6.131
Epoch: 05 | Time: 0m 21s
	Train Loss: 5.805 | Train PPL: 331.825
	 Val. Loss: 6.131 |  Val. PPL: 459.852
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> not not . . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: <end> not not , , , . . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:   3%|▎         | 6/200 [02:12<1:11:27, 22.10s/it]

Checkpoint saved: New best validation loss 6.079
Epoch: 06 | Time: 0m 23s
	Train Loss: 5.743 | Train PPL: 312.141
	 Val. Loss: 6.079 |  Val. PPL: 436.377


Started Training:   4%|▎         | 7/200 [02:33<1:10:06, 21.80s/it]

Checkpoint saved: New best validation loss 6.058
Epoch: 07 | Time: 0m 21s
	Train Loss: 5.660 | Train PPL: 287.047
	 Val. Loss: 6.058 |  Val. PPL: 427.535


Started Training:   4%|▍         | 8/200 [02:54<1:09:03, 21.58s/it]

Checkpoint saved: New best validation loss 5.990
Epoch: 08 | Time: 0m 21s
	Train Loss: 5.548 | Train PPL: 256.602
	 Val. Loss: 5.990 |  Val. PPL: 399.421


Started Training:   4%|▍         | 9/200 [03:15<1:08:14, 21.44s/it]

Checkpoint saved: New best validation loss 5.554
Epoch: 09 | Time: 0m 21s
	Train Loss: 5.316 | Train PPL: 203.632
	 Val. Loss: 5.554 |  Val. PPL: 258.367


Started Training:   5%|▌         | 10/200 [03:36<1:07:39, 21.36s/it]

Checkpoint saved: New best validation loss 5.147
Epoch: 10 | Time: 0m 21s
	Train Loss: 5.004 | Train PPL: 148.980
	 Val. Loss: 5.147 |  Val. PPL: 171.896
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could be . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: ! your every off this , it ? <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:   6%|▌         | 11/200 [04:00<1:09:16, 21.99s/it]

Checkpoint saved: New best validation loss 4.954
Epoch: 11 | Time: 0m 23s
	Train Loss: 4.717 | Train PPL: 111.842
	 Val. Loss: 4.954 |  Val. PPL: 141.745


Started Training:   6%|▌         | 12/200 [04:21<1:08:04, 21.73s/it]

Checkpoint saved: New best validation loss 4.627
Epoch: 12 | Time: 0m 21s
	Train Loss: 4.362 | Train PPL:  78.412
	 Val. Loss: 4.627 |  Val. PPL: 102.199


Started Training:   6%|▋         | 13/200 [04:42<1:07:05, 21.53s/it]

Epoch: 13 | Time: 0m 21s
	Train Loss: 4.112 | Train PPL:  61.064
	 Val. Loss: 4.680 |  Val. PPL: 107.753


Started Training:   7%|▋         | 14/200 [05:03<1:06:24, 21.42s/it]

Checkpoint saved: New best validation loss 4.246
Epoch: 14 | Time: 0m 21s
	Train Loss: 3.870 | Train PPL:  47.932
	 Val. Loss: 4.246 |  Val. PPL:  69.841


Started Training:   8%|▊         | 15/200 [05:24<1:05:48, 21.34s/it]

Checkpoint saved: New best validation loss 4.240
Epoch: 15 | Time: 0m 21s
	Train Loss: 3.649 | Train PPL:  38.423
	 Val. Loss: 4.240 |  Val. PPL:  69.412
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> would have way <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: <end> quite an way ting , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:   8%|▊         | 16/200 [05:47<1:07:18, 21.95s/it]

Epoch: 16 | Time: 0m 23s
	Train Loss: 3.359 | Train PPL:  28.773
	 Val. Loss: 4.714 |  Val. PPL: 111.540


Started Training:   8%|▊         | 17/200 [06:09<1:06:11, 21.70s/it]

Checkpoint saved: New best validation loss 4.008
Epoch: 17 | Time: 0m 21s
	Train Loss: 3.429 | Train PPL:  30.856
	 Val. Loss: 4.008 |  Val. PPL:  55.013


Started Training:   9%|▉         | 18/200 [06:30<1:05:20, 21.54s/it]

Checkpoint saved: New best validation loss 3.928
Epoch: 18 | Time: 0m 21s
	Train Loss: 3.126 | Train PPL:  22.782
	 Val. Loss: 3.928 |  Val. PPL:  50.782


Started Training:  10%|▉         | 19/200 [06:51<1:04:39, 21.43s/it]

Checkpoint saved: New best validation loss 3.677
Epoch: 19 | Time: 0m 21s
	Train Loss: 2.995 | Train PPL:  19.985
	 Val. Loss: 3.677 |  Val. PPL:  39.533


Started Training:  10%|█         | 20/200 [07:12<1:03:57, 21.32s/it]

Epoch: 20 | Time: 0m 21s
	Train Loss: 2.788 | Train PPL:  16.245
	 Val. Loss: 3.682 |  Val. PPL:  39.743
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have wound <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: <end> quite an four cks , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  10%|█         | 21/200 [07:35<1:05:26, 21.93s/it]

Checkpoint saved: New best validation loss 3.606
Epoch: 21 | Time: 0m 23s
	Train Loss: 2.811 | Train PPL:  16.632
	 Val. Loss: 3.606 |  Val. PPL:  36.811


Started Training:  11%|█         | 22/200 [07:56<1:04:20, 21.69s/it]

Checkpoint saved: New best validation loss 3.512
Epoch: 22 | Time: 0m 21s
	Train Loss: 2.636 | Train PPL:  13.958
	 Val. Loss: 3.512 |  Val. PPL:  33.510


Started Training:  12%|█▏        | 23/200 [08:18<1:03:23, 21.49s/it]

Epoch: 23 | Time: 0m 21s
	Train Loss: 2.601 | Train PPL:  13.477
	 Val. Loss: 4.107 |  Val. PPL:  60.766


Started Training:  12%|█▏        | 24/200 [08:39<1:02:38, 21.36s/it]

Epoch: 24 | Time: 0m 21s
	Train Loss: 2.426 | Train PPL:  11.319
	 Val. Loss: 3.540 |  Val. PPL:  34.478


Started Training:  12%|█▎        | 25/200 [09:00<1:02:03, 21.28s/it]

Checkpoint saved: New best validation loss 3.510
Epoch: 25 | Time: 0m 21s
	Train Loss: 2.319 | Train PPL:  10.169
	 Val. Loss: 3.510 |  Val. PPL:  33.450
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have thinking <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: <end> quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  13%|█▎        | 26/200 [09:23<1:03:26, 21.87s/it]

Epoch: 26 | Time: 0m 23s
	Train Loss: 2.260 | Train PPL:   9.582
	 Val. Loss: 3.702 |  Val. PPL:  40.521


Started Training:  14%|█▎        | 27/200 [09:44<1:02:22, 21.63s/it]

Epoch: 27 | Time: 0m 21s
	Train Loss: 2.088 | Train PPL:   8.065
	 Val. Loss: 3.816 |  Val. PPL:  45.428


Started Training:  14%|█▍        | 28/200 [10:05<1:01:30, 21.45s/it]

Epoch: 28 | Time: 0m 21s
	Train Loss: 1.909 | Train PPL:   6.747
	 Val. Loss: 3.579 |  Val. PPL:  35.825


Started Training:  14%|█▍        | 29/200 [10:26<1:00:49, 21.34s/it]

Checkpoint saved: New best validation loss 3.395
Epoch: 29 | Time: 0m 21s
	Train Loss: 2.079 | Train PPL:   7.996
	 Val. Loss: 3.395 |  Val. PPL:  29.811


Started Training:  15%|█▌        | 30/200 [10:47<1:00:14, 21.26s/it]

Epoch: 30 | Time: 0m 21s
	Train Loss: 1.798 | Train PPL:   6.040
	 Val. Loss: 3.627 |  Val. PPL:  37.609
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have consequence <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: <end> quite an four happiness , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  16%|█▌        | 31/200 [11:10<1:01:34, 21.86s/it]

Epoch: 31 | Time: 0m 23s
	Train Loss: 1.825 | Train PPL:   6.205
	 Val. Loss: 3.408 |  Val. PPL:  30.191


Started Training:  16%|█▌        | 32/200 [11:31<1:00:30, 21.61s/it]

Epoch: 32 | Time: 0m 21s
	Train Loss: 1.740 | Train PPL:   5.699
	 Val. Loss: 3.803 |  Val. PPL:  44.854


Started Training:  16%|█▋        | 33/200 [11:53<59:42, 21.45s/it]  

Epoch: 33 | Time: 0m 21s
	Train Loss: 1.732 | Train PPL:   5.655
	 Val. Loss: 3.451 |  Val. PPL:  31.525


Started Training:  17%|█▋        | 34/200 [12:14<59:07, 21.37s/it]

Checkpoint saved: New best validation loss 3.360
Epoch: 34 | Time: 0m 21s
	Train Loss: 1.683 | Train PPL:   5.379
	 Val. Loss: 3.360 |  Val. PPL:  28.776


Started Training:  18%|█▊        | 35/200 [12:35<58:28, 21.27s/it]

Epoch: 35 | Time: 0m 21s
	Train Loss: 1.537 | Train PPL:   4.651
	 Val. Loss: 4.239 |  Val. PPL:  69.365
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an re displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  18%|█▊        | 36/200 [12:58<59:45, 21.86s/it]

Epoch: 36 | Time: 0m 23s
	Train Loss: 1.588 | Train PPL:   4.896
	 Val. Loss: 3.421 |  Val. PPL:  30.586


Started Training:  18%|█▊        | 37/200 [13:19<58:41, 21.60s/it]

Epoch: 37 | Time: 0m 20s
	Train Loss: 1.388 | Train PPL:   4.006
	 Val. Loss: 3.399 |  Val. PPL:  29.948


Started Training:  19%|█▉        | 38/200 [13:40<57:52, 21.43s/it]

Epoch: 38 | Time: 0m 21s
	Train Loss: 1.514 | Train PPL:   4.545
	 Val. Loss: 3.723 |  Val. PPL:  41.406


Started Training:  20%|█▉        | 39/200 [14:01<57:12, 21.32s/it]

Epoch: 39 | Time: 0m 21s
	Train Loss: 1.472 | Train PPL:   4.357
	 Val. Loss: 3.475 |  Val. PPL:  32.288


Started Training:  20%|██        | 40/200 [14:22<56:37, 21.24s/it]

Epoch: 40 | Time: 0m 21s
	Train Loss: 1.700 | Train PPL:   5.476
	 Val. Loss: 4.112 |  Val. PPL:  61.050
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: you quite an four fault , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  20%|██        | 41/200 [14:45<57:55, 21.86s/it]

Checkpoint saved: New best validation loss 3.258
Epoch: 41 | Time: 0m 23s
	Train Loss: 1.336 | Train PPL:   3.804
	 Val. Loss: 3.258 |  Val. PPL:  26.009


Started Training:  21%|██        | 42/200 [15:07<57:00, 21.65s/it]

Checkpoint saved: New best validation loss 3.184
Epoch: 42 | Time: 0m 21s
	Train Loss: 1.032 | Train PPL:   2.808
	 Val. Loss: 3.184 |  Val. PPL:  24.144


Started Training:  22%|██▏       | 43/200 [15:28<56:12, 21.48s/it]

Checkpoint saved: New best validation loss 3.148
Epoch: 43 | Time: 0m 21s
	Train Loss: 0.938 | Train PPL:   2.554
	 Val. Loss: 3.148 |  Val. PPL:  23.287


Started Training:  22%|██▏       | 44/200 [15:49<55:27, 21.33s/it]

Epoch: 44 | Time: 0m 20s
	Train Loss: 0.874 | Train PPL:   2.398
	 Val. Loss: 3.292 |  Val. PPL:  26.908


Started Training:  22%|██▎       | 45/200 [16:10<54:54, 21.26s/it]

Checkpoint saved: New best validation loss 3.089
Epoch: 45 | Time: 0m 21s
	Train Loss: 0.820 | Train PPL:   2.271
	 Val. Loss: 3.089 |  Val. PPL:  21.949
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: have could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: an quite an four creature , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  23%|██▎       | 46/200 [16:33<56:05, 21.85s/it]

Epoch: 46 | Time: 0m 23s
	Train Loss: 0.774 | Train PPL:   2.169
	 Val. Loss: 3.126 |  Val. PPL:  22.775


Started Training:  24%|██▎       | 47/200 [16:54<55:10, 21.64s/it]

Checkpoint saved: New best validation loss 3.057
Epoch: 47 | Time: 0m 21s
	Train Loss: 0.766 | Train PPL:   2.151
	 Val. Loss: 3.057 |  Val. PPL:  21.265


Started Training:  24%|██▍       | 48/200 [17:15<54:21, 21.46s/it]

Epoch: 48 | Time: 0m 21s
	Train Loss: 0.719 | Train PPL:   2.053
	 Val. Loss: 3.059 |  Val. PPL:  21.315


Started Training:  24%|██▍       | 49/200 [17:36<53:41, 21.33s/it]

Checkpoint saved: New best validation loss 3.014
Epoch: 49 | Time: 0m 20s
	Train Loss: 0.686 | Train PPL:   1.985
	 Val. Loss: 3.014 |  Val. PPL:  20.360


Started Training:  25%|██▌       | 50/200 [17:57<53:08, 21.26s/it]

Checkpoint saved: New best validation loss 2.964
Epoch: 50 | Time: 0m 21s
	Train Loss: 0.673 | Train PPL:   1.959
	 Val. Loss: 2.964 |  Val. PPL:  19.374
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: have could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: an quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  26%|██▌       | 51/200 [18:21<54:17, 21.86s/it]

Checkpoint saved: New best validation loss 2.963
Epoch: 51 | Time: 0m 23s
	Train Loss: 0.678 | Train PPL:   1.970
	 Val. Loss: 2.963 |  Val. PPL:  19.365


Started Training:  26%|██▌       | 52/200 [18:42<53:16, 21.60s/it]

Epoch: 52 | Time: 0m 20s
	Train Loss: 0.650 | Train PPL:   1.916
	 Val. Loss: 3.064 |  Val. PPL:  21.405


Started Training:  26%|██▋       | 53/200 [19:03<52:31, 21.44s/it]

Checkpoint saved: New best validation loss 2.930
Epoch: 53 | Time: 0m 20s
	Train Loss: 0.620 | Train PPL:   1.859
	 Val. Loss: 2.930 |  Val. PPL:  18.728


Started Training:  27%|██▋       | 54/200 [19:24<51:55, 21.34s/it]

Checkpoint saved: New best validation loss 2.928
Epoch: 54 | Time: 0m 21s
	Train Loss: 0.611 | Train PPL:   1.842
	 Val. Loss: 2.928 |  Val. PPL:  18.684


Started Training:  28%|██▊       | 55/200 [19:45<51:21, 21.25s/it]

Epoch: 55 | Time: 0m 21s
	Train Loss: 0.572 | Train PPL:   1.772
	 Val. Loss: 2.965 |  Val. PPL:  19.395
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: have could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: four quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! ! ! ! ! ! ! ! ! !


Started Training:  28%|██▊       | 56/200 [20:08<52:29, 21.87s/it]

Checkpoint saved: New best validation loss 2.908
Epoch: 56 | Time: 0m 23s
	Train Loss: 0.542 | Train PPL:   1.720
	 Val. Loss: 2.908 |  Val. PPL:  18.319


Started Training:  28%|██▊       | 57/200 [20:29<51:32, 21.62s/it]

Checkpoint saved: New best validation loss 2.883
Epoch: 57 | Time: 0m 20s
	Train Loss: 0.568 | Train PPL:   1.765
	 Val. Loss: 2.883 |  Val. PPL:  17.863


Started Training:  29%|██▉       | 58/200 [20:50<50:45, 21.44s/it]

Epoch: 58 | Time: 0m 21s
	Train Loss: 0.529 | Train PPL:   1.697
	 Val. Loss: 3.047 |  Val. PPL:  21.057


Started Training:  30%|██▉       | 59/200 [21:11<50:06, 21.32s/it]

Epoch: 59 | Time: 0m 21s
	Train Loss: 0.486 | Train PPL:   1.625
	 Val. Loss: 2.997 |  Val. PPL:  20.025


Started Training:  30%|███       | 60/200 [21:32<49:32, 21.23s/it]

Epoch: 60 | Time: 0m 21s
	Train Loss: 0.476 | Train PPL:   1.610
	 Val. Loss: 2.885 |  Val. PPL:  17.909
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: have could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: it quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! ! ! ! ! ! ! ! ! ! !


Started Training:  30%|███       | 61/200 [21:55<50:36, 21.84s/it]

Checkpoint saved: New best validation loss 2.847
Epoch: 61 | Time: 0m 23s
	Train Loss: 0.466 | Train PPL:   1.594
	 Val. Loss: 2.847 |  Val. PPL:  17.234


Started Training:  31%|███       | 62/200 [22:17<49:41, 21.61s/it]

Epoch: 62 | Time: 0m 21s
	Train Loss: 0.509 | Train PPL:   1.663
	 Val. Loss: 2.944 |  Val. PPL:  18.996


Started Training:  32%|███▏      | 63/200 [22:38<48:57, 21.44s/it]

Epoch: 63 | Time: 0m 21s
	Train Loss: 0.443 | Train PPL:   1.558
	 Val. Loss: 2.866 |  Val. PPL:  17.572


Started Training:  32%|███▏      | 64/200 [22:59<48:17, 21.31s/it]

Epoch: 64 | Time: 0m 20s
	Train Loss: 0.472 | Train PPL:   1.603
	 Val. Loss: 3.322 |  Val. PPL:  27.707


Started Training:  32%|███▎      | 65/200 [23:20<47:46, 21.23s/it]

Checkpoint saved: New best validation loss 2.835
Epoch: 65 | Time: 0m 20s
	Train Loss: 0.459 | Train PPL:   1.583
	 Val. Loss: 2.835 |  Val. PPL:  17.035
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: have could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: it quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! ! ! ! ! ! ! ! !


Started Training:  33%|███▎      | 66/200 [23:43<48:45, 21.83s/it]

Epoch: 66 | Time: 0m 23s
	Train Loss: 0.402 | Train PPL:   1.495
	 Val. Loss: 2.898 |  Val. PPL:  18.135


Started Training:  34%|███▎      | 67/200 [24:04<47:53, 21.60s/it]

Epoch: 67 | Time: 0m 21s
	Train Loss: 0.634 | Train PPL:   1.886
	 Val. Loss: 2.858 |  Val. PPL:  17.426


Started Training:  34%|███▍      | 68/200 [24:25<47:08, 21.42s/it]

Epoch: 68 | Time: 0m 21s
	Train Loss: 0.421 | Train PPL:   1.523
	 Val. Loss: 2.860 |  Val. PPL:  17.460


Started Training:  34%|███▍      | 69/200 [24:46<46:28, 21.29s/it]

Epoch: 69 | Time: 0m 20s
	Train Loss: 0.383 | Train PPL:   1.467
	 Val. Loss: 2.884 |  Val. PPL:  17.892


Started Training:  35%|███▌      | 70/200 [25:07<46:01, 21.24s/it]

Checkpoint saved: New best validation loss 2.828
Epoch: 70 | Time: 0m 21s
	Train Loss: 0.356 | Train PPL:   1.428
	 Val. Loss: 2.828 |  Val. PPL:  16.913
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: have could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: an quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! ! ! ! ! ! ! !


Started Training:  36%|███▌      | 71/200 [25:30<47:00, 21.86s/it]

Epoch: 71 | Time: 0m 23s
	Train Loss: 0.356 | Train PPL:   1.428
	 Val. Loss: 2.950 |  Val. PPL:  19.100


Started Training:  36%|███▌      | 72/200 [25:51<46:09, 21.64s/it]

Checkpoint saved: New best validation loss 2.823
Epoch: 72 | Time: 0m 21s
	Train Loss: 0.387 | Train PPL:   1.472
	 Val. Loss: 2.823 |  Val. PPL:  16.830


Started Training:  36%|███▋      | 73/200 [26:13<45:26, 21.47s/it]

Epoch: 73 | Time: 0m 21s
	Train Loss: 0.387 | Train PPL:   1.472
	 Val. Loss: 2.910 |  Val. PPL:  18.362


Started Training:  37%|███▋      | 74/200 [26:34<44:57, 21.41s/it]

Checkpoint saved: New best validation loss 2.815
Epoch: 74 | Time: 0m 21s
	Train Loss: 0.328 | Train PPL:   1.388
	 Val. Loss: 2.815 |  Val. PPL:  16.692


Started Training:  38%|███▊      | 75/200 [26:55<44:27, 21.34s/it]

Epoch: 75 | Time: 0m 21s
	Train Loss: 0.320 | Train PPL:   1.377
	 Val. Loss: 2.894 |  Val. PPL:  18.060
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: have could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: if quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! ! ! ! ! ! ! ! ! ! !


Started Training:  38%|███▊      | 76/200 [27:18<45:19, 21.93s/it]

Epoch: 76 | Time: 0m 23s
	Train Loss: 0.372 | Train PPL:   1.451
	 Val. Loss: 2.859 |  Val. PPL:  17.440


Started Training:  38%|███▊      | 77/200 [27:39<44:25, 21.67s/it]

Epoch: 77 | Time: 0m 21s
	Train Loss: 0.285 | Train PPL:   1.330
	 Val. Loss: 2.989 |  Val. PPL:  19.874


Started Training:  39%|███▉      | 78/200 [28:00<43:42, 21.50s/it]

Epoch: 78 | Time: 0m 21s
	Train Loss: 0.429 | Train PPL:   1.536
	 Val. Loss: 2.860 |  Val. PPL:  17.455


Started Training:  40%|███▉      | 79/200 [28:22<43:07, 21.38s/it]

Epoch: 79 | Time: 0m 21s
	Train Loss: 0.291 | Train PPL:   1.337
	 Val. Loss: 3.062 |  Val. PPL:  21.366


Started Training:  40%|████      | 80/200 [28:43<42:38, 21.32s/it]

Checkpoint saved: New best validation loss 2.808
Epoch: 80 | Time: 0m 21s
	Train Loss: 0.303 | Train PPL:   1.354
	 Val. Loss: 2.808 |  Val. PPL:  16.574
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: have could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> is is is is is is is is is is is


Started Training:  40%|████      | 81/200 [29:06<43:27, 21.91s/it]

Epoch: 81 | Time: 0m 23s
	Train Loss: 0.273 | Train PPL:   1.314
	 Val. Loss: 2.823 |  Val. PPL:  16.823


Started Training:  41%|████      | 82/200 [29:27<42:37, 21.67s/it]

Epoch: 82 | Time: 0m 21s
	Train Loss: 0.248 | Train PPL:   1.281
	 Val. Loss: 2.879 |  Val. PPL:  17.790


Started Training:  42%|████▏     | 83/200 [29:48<41:57, 21.52s/it]

Checkpoint saved: New best validation loss 2.775
Epoch: 83 | Time: 0m 21s
	Train Loss: 0.251 | Train PPL:   1.285
	 Val. Loss: 2.775 |  Val. PPL:  16.041


Started Training:  42%|████▏     | 84/200 [30:09<41:19, 21.37s/it]

Epoch: 84 | Time: 0m 21s
	Train Loss: 0.262 | Train PPL:   1.299
	 Val. Loss: 2.813 |  Val. PPL:  16.666


Started Training:  42%|████▎     | 85/200 [30:30<40:47, 21.29s/it]

Epoch: 85 | Time: 0m 21s
	Train Loss: 0.266 | Train PPL:   1.305
	 Val. Loss: 2.913 |  Val. PPL:  18.413
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: have could have abilities <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! is is is is is is is is is is is is


Started Training:  43%|████▎     | 86/200 [30:54<41:35, 21.89s/it]

Epoch: 86 | Time: 0m 23s
	Train Loss: 0.687 | Train PPL:   1.988
	 Val. Loss: 3.143 |  Val. PPL:  23.166


Started Training:  44%|████▎     | 87/200 [31:15<40:47, 21.66s/it]

Epoch: 87 | Time: 0m 21s
	Train Loss: 0.369 | Train PPL:   1.447
	 Val. Loss: 2.857 |  Val. PPL:  17.401


Started Training:  44%|████▍     | 88/200 [31:36<40:06, 21.49s/it]

Epoch: 88 | Time: 0m 21s
	Train Loss: 0.235 | Train PPL:   1.265
	 Val. Loss: 2.801 |  Val. PPL:  16.469


Started Training:  44%|████▍     | 89/200 [31:57<39:31, 21.36s/it]

Epoch: 89 | Time: 0m 21s
	Train Loss: 0.231 | Train PPL:   1.260
	 Val. Loss: 2.887 |  Val. PPL:  17.935


Started Training:  45%|████▌     | 90/200 [32:18<39:01, 21.29s/it]

Epoch: 90 | Time: 0m 21s
	Train Loss: 0.186 | Train PPL:   1.205
	 Val. Loss: 2.876 |  Val. PPL:  17.746
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is


Started Training:  46%|████▌     | 91/200 [32:41<39:45, 21.88s/it]

Epoch: 91 | Time: 0m 23s
	Train Loss: 0.179 | Train PPL:   1.196
	 Val. Loss: 2.831 |  Val. PPL:  16.969


Started Training:  46%|████▌     | 92/200 [33:02<38:57, 21.64s/it]

Epoch: 92 | Time: 0m 21s
	Train Loss: 0.173 | Train PPL:   1.189
	 Val. Loss: 2.881 |  Val. PPL:  17.832


Started Training:  46%|████▋     | 93/200 [33:24<38:17, 21.47s/it]

Epoch: 93 | Time: 0m 21s
	Train Loss: 0.172 | Train PPL:   1.187
	 Val. Loss: 2.840 |  Val. PPL:  17.115


Started Training:  47%|████▋     | 94/200 [33:45<37:44, 21.36s/it]

Epoch: 94 | Time: 0m 21s
	Train Loss: 0.170 | Train PPL:   1.185
	 Val. Loss: 2.870 |  Val. PPL:  17.644


Started Training:  48%|████▊     | 95/200 [34:06<37:14, 21.28s/it]

Epoch: 95 | Time: 0m 21s
	Train Loss: 0.169 | Train PPL:   1.185
	 Val. Loss: 2.903 |  Val. PPL:  18.229
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an four displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! ! is is is is is is is is is is is


Started Training:  48%|████▊     | 96/200 [34:29<37:59, 21.92s/it]

Epoch: 96 | Time: 0m 23s
	Train Loss: 0.161 | Train PPL:   1.175
	 Val. Loss: 2.881 |  Val. PPL:  17.838


Started Training:  48%|████▊     | 97/200 [34:50<37:13, 21.68s/it]

Epoch: 97 | Time: 0m 21s
	Train Loss: 0.161 | Train PPL:   1.175
	 Val. Loss: 2.872 |  Val. PPL:  17.664


Started Training:  49%|████▉     | 98/200 [35:11<36:33, 21.51s/it]

Epoch: 98 | Time: 0m 21s
	Train Loss: 0.160 | Train PPL:   1.173
	 Val. Loss: 2.861 |  Val. PPL:  17.475


Started Training:  50%|████▉     | 99/200 [35:32<36:00, 21.39s/it]

Epoch: 99 | Time: 0m 21s
	Train Loss: 0.160 | Train PPL:   1.173
	 Val. Loss: 2.850 |  Val. PPL:  17.287


Started Training:  50%|█████     | 100/200 [35:54<35:28, 21.28s/it]

Epoch: 100 | Time: 0m 21s
	Train Loss: 0.160 | Train PPL:   1.173
	 Val. Loss: 2.858 |  Val. PPL:  17.432
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! ! is is is is is is is is is is is


Started Training:  50%|█████     | 101/200 [36:17<36:08, 21.91s/it]

Epoch: 101 | Time: 0m 23s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.863 |  Val. PPL:  17.509


Started Training:  51%|█████     | 102/200 [36:38<35:27, 21.71s/it]

Epoch: 102 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.866 |  Val. PPL:  17.570


Started Training:  52%|█████▏    | 103/200 [36:59<34:49, 21.54s/it]

Epoch: 103 | Time: 0m 21s
	Train Loss: 0.160 | Train PPL:   1.173
	 Val. Loss: 2.875 |  Val. PPL:  17.721


Started Training:  52%|█████▏    | 104/200 [37:20<34:15, 21.41s/it]

Epoch: 104 | Time: 0m 21s
	Train Loss: 0.158 | Train PPL:   1.171
	 Val. Loss: 2.869 |  Val. PPL:  17.615


Started Training:  52%|█████▎    | 105/200 [37:42<33:46, 21.33s/it]

Epoch: 105 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.168
	 Val. Loss: 2.867 |  Val. PPL:  17.584
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  53%|█████▎    | 106/200 [38:05<34:22, 21.94s/it]

Epoch: 106 | Time: 0m 23s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.862 |  Val. PPL:  17.498


Started Training:  54%|█████▎    | 107/200 [38:26<33:37, 21.70s/it]

Epoch: 107 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.862 |  Val. PPL:  17.503


Started Training:  54%|█████▍    | 108/200 [38:47<33:00, 21.52s/it]

Epoch: 108 | Time: 0m 21s
	Train Loss: 0.164 | Train PPL:   1.179
	 Val. Loss: 2.861 |  Val. PPL:  17.474


Started Training:  55%|█████▍    | 109/200 [39:08<32:27, 21.40s/it]

Epoch: 109 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.864 |  Val. PPL:  17.529


Started Training:  55%|█████▌    | 110/200 [39:29<31:58, 21.31s/it]

Epoch: 110 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.169
	 Val. Loss: 2.862 |  Val. PPL:  17.489
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  56%|█████▌    | 111/200 [39:53<32:30, 21.91s/it]

Epoch: 111 | Time: 0m 23s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.862 |  Val. PPL:  17.495


Started Training:  56%|█████▌    | 112/200 [40:14<31:46, 21.67s/it]

Epoch: 112 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.564


Started Training:  56%|█████▋    | 113/200 [40:35<31:10, 21.50s/it]

Epoch: 113 | Time: 0m 21s
	Train Loss: 0.154 | Train PPL:   1.167
	 Val. Loss: 2.866 |  Val. PPL:  17.562


Started Training:  57%|█████▋    | 114/200 [40:56<30:40, 21.40s/it]

Epoch: 114 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.575


Started Training:  57%|█████▊    | 115/200 [41:17<30:13, 21.33s/it]

Epoch: 115 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.866 |  Val. PPL:  17.574
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  58%|█████▊    | 116/200 [41:41<30:41, 21.93s/it]

Epoch: 116 | Time: 0m 23s
	Train Loss: 0.158 | Train PPL:   1.171
	 Val. Loss: 2.868 |  Val. PPL:  17.593


Started Training:  58%|█████▊    | 117/200 [42:02<29:58, 21.67s/it]

Epoch: 117 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.866 |  Val. PPL:  17.572


Started Training:  59%|█████▉    | 118/200 [42:23<29:23, 21.51s/it]

Epoch: 118 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.568


Started Training:  60%|█████▉    | 119/200 [42:44<28:53, 21.40s/it]

Epoch: 119 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.168
	 Val. Loss: 2.867 |  Val. PPL:  17.578


Started Training:  60%|██████    | 120/200 [43:05<28:24, 21.31s/it]

Epoch: 120 | Time: 0m 21s
	Train Loss: 0.160 | Train PPL:   1.174
	 Val. Loss: 2.867 |  Val. PPL:  17.581
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  60%|██████    | 121/200 [43:28<28:50, 21.90s/it]

Epoch: 121 | Time: 0m 23s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.584


Started Training:  61%|██████    | 122/200 [43:49<28:10, 21.68s/it]

Epoch: 122 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.578


Started Training:  62%|██████▏   | 123/200 [44:11<27:36, 21.51s/it]

Epoch: 123 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.577


Started Training:  62%|██████▏   | 124/200 [44:32<27:05, 21.39s/it]

Epoch: 124 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.171
	 Val. Loss: 2.867 |  Val. PPL:  17.584


Started Training:  62%|██████▎   | 125/200 [44:53<26:38, 21.31s/it]

Epoch: 125 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.577
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  63%|██████▎   | 126/200 [45:16<27:02, 21.92s/it]

Epoch: 126 | Time: 0m 23s
	Train Loss: 0.158 | Train PPL:   1.172
	 Val. Loss: 2.866 |  Val. PPL:  17.574


Started Training:  64%|██████▎   | 127/200 [45:37<26:22, 21.68s/it]

Epoch: 127 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.574


Started Training:  64%|██████▍   | 128/200 [45:58<25:48, 21.51s/it]

Epoch: 128 | Time: 0m 21s
	Train Loss: 0.154 | Train PPL:   1.167
	 Val. Loss: 2.866 |  Val. PPL:  17.572


Started Training:  64%|██████▍   | 129/200 [46:19<25:18, 21.38s/it]

Epoch: 129 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.866 |  Val. PPL:  17.572


Started Training:  65%|██████▌   | 130/200 [46:41<24:50, 21.30s/it]

Epoch: 130 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.575
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  66%|██████▌   | 131/200 [47:04<25:10, 21.90s/it]

Epoch: 131 | Time: 0m 23s
	Train Loss: 0.159 | Train PPL:   1.172
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  66%|██████▌   | 132/200 [47:25<24:32, 21.66s/it]

Epoch: 132 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  66%|██████▋   | 133/200 [47:46<23:59, 21.49s/it]

Epoch: 133 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  67%|██████▋   | 134/200 [48:07<23:31, 21.39s/it]

Epoch: 134 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  68%|██████▊   | 135/200 [48:28<23:04, 21.30s/it]

Epoch: 135 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  68%|██████▊   | 136/200 [48:52<23:21, 21.89s/it]

Epoch: 136 | Time: 0m 23s
	Train Loss: 0.154 | Train PPL:   1.167
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  68%|██████▊   | 137/200 [49:13<22:44, 21.66s/it]

Epoch: 137 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  69%|██████▉   | 138/200 [49:34<22:13, 21.50s/it]

Epoch: 138 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  70%|██████▉   | 139/200 [49:55<21:45, 21.41s/it]

Epoch: 139 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.167
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  70%|███████   | 140/200 [50:16<21:19, 21.33s/it]

Epoch: 140 | Time: 0m 21s
	Train Loss: 0.154 | Train PPL:   1.166
	 Val. Loss: 2.867 |  Val. PPL:  17.576
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  70%|███████   | 141/200 [50:39<21:33, 21.93s/it]

Epoch: 141 | Time: 0m 23s
	Train Loss: 0.158 | Train PPL:   1.172
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  71%|███████   | 142/200 [51:01<20:59, 21.71s/it]

Epoch: 142 | Time: 0m 21s
	Train Loss: 0.158 | Train PPL:   1.171
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  72%|███████▏  | 143/200 [51:22<20:27, 21.54s/it]

Epoch: 143 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  72%|███████▏  | 144/200 [51:43<19:58, 21.40s/it]

Epoch: 144 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  72%|███████▎  | 145/200 [52:04<19:32, 21.31s/it]

Epoch: 145 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  73%|███████▎  | 146/200 [52:27<19:43, 21.92s/it]

Epoch: 146 | Time: 0m 23s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.577


Started Training:  74%|███████▎  | 147/200 [52:48<19:09, 21.68s/it]

Epoch: 147 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  74%|███████▍  | 148/200 [53:10<18:38, 21.50s/it]

Epoch: 148 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.866 |  Val. PPL:  17.575


Started Training:  74%|███████▍  | 149/200 [53:31<18:10, 21.38s/it]

Epoch: 149 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.866 |  Val. PPL:  17.575


Started Training:  75%|███████▌  | 150/200 [53:52<17:44, 21.30s/it]

Epoch: 150 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.576
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  76%|███████▌  | 151/200 [54:15<17:53, 21.91s/it]

Epoch: 151 | Time: 0m 23s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.575


Started Training:  76%|███████▌  | 152/200 [54:36<17:19, 21.66s/it]

Epoch: 152 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.167
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  76%|███████▋  | 153/200 [54:57<16:50, 21.51s/it]

Epoch: 153 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  77%|███████▋  | 154/200 [55:18<16:24, 21.41s/it]

Epoch: 154 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  78%|███████▊  | 155/200 [55:40<15:59, 21.33s/it]

Epoch: 155 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.867 |  Val. PPL:  17.576
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  78%|███████▊  | 156/200 [56:03<16:05, 21.93s/it]

Epoch: 156 | Time: 0m 23s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.575


Started Training:  78%|███████▊  | 157/200 [56:24<15:33, 21.70s/it]

Epoch: 157 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  79%|███████▉  | 158/200 [56:45<15:04, 21.54s/it]

Epoch: 158 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  80%|███████▉  | 159/200 [57:06<14:38, 21.42s/it]

Epoch: 159 | Time: 0m 21s
	Train Loss: 0.160 | Train PPL:   1.173
	 Val. Loss: 2.867 |  Val. PPL:  17.577


Started Training:  80%|████████  | 160/200 [57:28<14:13, 21.33s/it]

Epoch: 160 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.168
	 Val. Loss: 2.867 |  Val. PPL:  17.577
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  80%|████████  | 161/200 [57:51<14:14, 21.92s/it]

Epoch: 161 | Time: 0m 23s
	Train Loss: 0.161 | Train PPL:   1.175
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  81%|████████  | 162/200 [58:12<13:44, 21.68s/it]

Epoch: 162 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  82%|████████▏ | 163/200 [58:33<13:15, 21.49s/it]

Epoch: 163 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  82%|████████▏ | 164/200 [58:54<12:49, 21.36s/it]

Epoch: 164 | Time: 0m 21s
	Train Loss: 0.159 | Train PPL:   1.172
	 Val. Loss: 2.867 |  Val. PPL:  17.577


Started Training:  82%|████████▎ | 165/200 [59:15<12:25, 21.29s/it]

Epoch: 165 | Time: 0m 21s
	Train Loss: 0.158 | Train PPL:   1.171
	 Val. Loss: 2.867 |  Val. PPL:  17.577
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  83%|████████▎ | 166/200 [59:39<12:24, 21.91s/it]

Epoch: 166 | Time: 0m 23s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.577


Started Training:  84%|████████▎ | 167/200 [1:00:00<11:55, 21.67s/it]

Epoch: 167 | Time: 0m 21s
	Train Loss: 0.154 | Train PPL:   1.166
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  84%|████████▍ | 168/200 [1:00:21<11:28, 21.50s/it]

Epoch: 168 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  84%|████████▍ | 169/200 [1:00:42<11:04, 21.45s/it]

Epoch: 169 | Time: 0m 21s
	Train Loss: 0.154 | Train PPL:   1.167
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  85%|████████▌ | 170/200 [1:01:03<10:40, 21.36s/it]

Epoch: 170 | Time: 0m 21s
	Train Loss: 0.154 | Train PPL:   1.167
	 Val. Loss: 2.866 |  Val. PPL:  17.575
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  86%|████████▌ | 171/200 [1:01:27<10:37, 21.98s/it]

Epoch: 171 | Time: 0m 23s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.575


Started Training:  86%|████████▌ | 172/200 [1:01:48<10:08, 21.73s/it]

Epoch: 172 | Time: 0m 21s
	Train Loss: 0.154 | Train PPL:   1.167
	 Val. Loss: 2.866 |  Val. PPL:  17.575


Started Training:  86%|████████▋ | 173/200 [1:02:09<09:42, 21.57s/it]

Epoch: 173 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.867 |  Val. PPL:  17.575


Started Training:  87%|████████▋ | 174/200 [1:02:30<09:18, 21.46s/it]

Epoch: 174 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.867 |  Val. PPL:  17.575


Started Training:  88%|████████▊ | 175/200 [1:02:51<08:54, 21.37s/it]

Epoch: 175 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.866 |  Val. PPL:  17.575
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  88%|████████▊ | 176/200 [1:03:15<08:47, 21.99s/it]

Epoch: 176 | Time: 0m 23s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.867 |  Val. PPL:  17.576


Started Training:  88%|████████▊ | 177/200 [1:03:36<08:20, 21.75s/it]

Epoch: 177 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.167
	 Val. Loss: 2.866 |  Val. PPL:  17.575


Started Training:  89%|████████▉ | 178/200 [1:03:57<07:54, 21.58s/it]

Epoch: 178 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.574


Started Training:  90%|████████▉ | 179/200 [1:04:18<07:30, 21.47s/it]

Epoch: 179 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.575


Started Training:  90%|█████████ | 180/200 [1:04:40<07:07, 21.38s/it]

Epoch: 180 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.866 |  Val. PPL:  17.575
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  90%|█████████ | 181/200 [1:05:03<06:57, 21.97s/it]

Epoch: 181 | Time: 0m 23s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.866 |  Val. PPL:  17.574


Started Training:  91%|█████████ | 182/200 [1:05:24<06:30, 21.72s/it]

Epoch: 182 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.574


Started Training:  92%|█████████▏| 183/200 [1:05:45<06:06, 21.54s/it]

Epoch: 183 | Time: 0m 21s
	Train Loss: 0.160 | Train PPL:   1.173
	 Val. Loss: 2.866 |  Val. PPL:  17.574


Started Training:  92%|█████████▏| 184/200 [1:06:06<05:42, 21.40s/it]

Epoch: 184 | Time: 0m 21s
	Train Loss: 0.154 | Train PPL:   1.167
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training:  92%|█████████▎| 185/200 [1:06:27<05:19, 21.33s/it]

Epoch: 185 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.866 |  Val. PPL:  17.573
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  93%|█████████▎| 186/200 [1:06:51<05:07, 21.95s/it]

Epoch: 186 | Time: 0m 23s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.574


Started Training:  94%|█████████▎| 187/200 [1:07:12<04:42, 21.71s/it]

Epoch: 187 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training:  94%|█████████▍| 188/200 [1:07:33<04:18, 21.54s/it]

Epoch: 188 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training:  94%|█████████▍| 189/200 [1:07:54<03:55, 21.40s/it]

Epoch: 189 | Time: 0m 21s
	Train Loss: 0.159 | Train PPL:   1.172
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training:  95%|█████████▌| 190/200 [1:08:15<03:33, 21.32s/it]

Epoch: 190 | Time: 0m 21s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 2.866 |  Val. PPL:  17.573
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  96%|█████████▌| 191/200 [1:08:39<03:17, 21.92s/it]

Epoch: 191 | Time: 0m 23s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training:  96%|█████████▌| 192/200 [1:09:00<02:53, 21.67s/it]

Epoch: 192 | Time: 0m 21s
	Train Loss: 0.154 | Train PPL:   1.166
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training:  96%|█████████▋| 193/200 [1:09:21<02:30, 21.50s/it]

Epoch: 193 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.168
	 Val. Loss: 2.866 |  Val. PPL:  17.574


Started Training:  97%|█████████▋| 194/200 [1:09:42<02:08, 21.39s/it]

Epoch: 194 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training:  98%|█████████▊| 195/200 [1:10:03<01:46, 21.31s/it]

Epoch: 195 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.573
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: , quite an equal displeased , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> ! ! ! is is is is is is is is is is is is


Started Training:  98%|█████████▊| 196/200 [1:10:26<01:27, 21.91s/it]

Epoch: 196 | Time: 0m 23s
	Train Loss: 0.156 | Train PPL:   1.168
	 Val. Loss: 2.866 |  Val. PPL:  17.574


Started Training:  98%|█████████▊| 197/200 [1:10:48<01:05, 21.68s/it]

Epoch: 197 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.167
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training:  99%|█████████▉| 198/200 [1:11:09<00:43, 21.52s/it]

Epoch: 198 | Time: 0m 21s
	Train Loss: 0.153 | Train PPL:   1.166
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training: 100%|█████████▉| 199/200 [1:11:30<00:21, 21.42s/it]

Epoch: 199 | Time: 0m 21s
	Train Loss: 0.155 | Train PPL:   1.167
	 Val. Loss: 2.866 |  Val. PPL:  17.573


Started Training: 100%|██████████| 200/200 [1:11:51<00:00, 21.56s/it]

Epoch: 200 | Time: 0m 21s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.866 |  Val. PPL:  17.573





In [39]:
generate_seq(model, ["Hi, introduce yourself"], bpe_tokenizer, SEQ_LEN, special_ids, gen_len=100)

=== Sample generations ===
[1] INPUT : hi, introduce yourself
OUTPUT: <end> hy ! <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


[('hi, introduce yourself',
  ['<end>',
   'hy',
   '!',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<en