In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import math
import time
import os
import random
import random
import re
import unicodedata
from tqdm import tqdm

2025-11-14 11:26:47.001398: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763119607.212596      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763119607.270692      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [2]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tokenizers.trainers import BpeTrainer

In [3]:
# Config
seed = 1234
DATA_DIR = 'data'
SEQ_LEN = 50
VOCAB_SIZE = 5000
EMB_DIM = 256
ENC_HIDDEN_DIM = 512
DEC_HIDDEN_DIM = 512
DROPOUT = 0.3
N_EPOCHS = 200
LEARNING_RATE = 1e-2
BATCH_SIZE = 64
# Output directories
CHECKPOINT_DIR = 'checkpoints'
LOG_DIR = 'runs'
tokenizer_path = "tknzer_dir"
special_tokens = ["<pad>", "<st>", "<end>", "<unk>"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
seed_everything(seed)

In [5]:
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

In [6]:
def load_txt(txt_file):
    with open(txt_file, "r", encoding="utf-8") as f:
        text = f.read()
    return text

In [7]:
dataset_path = "/kaggle/input/iiith-assignment2-dataset/dataset/Pride_and_Prejudice-Jane_Austen.txt"

In [8]:
def normalize_text(text, lowercase = True):
    if lowercase:
        text = text.lower()
    # Unicode Normalization (e.g., converting fancy quotes to standard ones)
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[\*_`\[\]\^{}]', '', text) # handle _word_ and *word*
    text = re.sub(r'\b(mr|mrs|ms|dr|st)\.', r'\1', text, flags=re.IGNORECASE) # handle mr. mrs. etc
    text = text.strip()
    return text

In [9]:
custom_dataset = normalize_text(load_txt(dataset_path))

In [10]:
len(set(custom_dataset)), len(list(custom_dataset))

(56, 702635)

In [11]:
def get_tokenizer(dataset, vocab_size, save_fldr, name = 'bpe'):
    if os.path.exists(save_fldr):
        save_path = os.path.join(save_fldr, 'tokenizer.json')
        return Tokenizer.from_file(save_path)
    if name == 'bpe':
        tokenizer = Tokenizer(models.BPE())
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
        tokenizer.train_from_iterator([dataset], trainer)
        if save_fldr:
            os.makedirs(save_fldr, exist_ok=True)
            tokenizer.save(os.path.join(save_fldr, 'tokenizer.json'))
            return tokenizer

In [12]:
bpe_tokenizer = get_tokenizer(custom_dataset, VOCAB_SIZE, tokenizer_path, 'bpe')






In [13]:
tokenized_data = bpe_tokenizer.encode(custom_dataset).ids

In [14]:
len(tokenized_data)

154018

In [15]:
def get_data(data, split, train_percent = 0.8):
    n = len(data)
    split_percent = train_percent
    if split == "train":
        return data[ : int((split_percent)*n)]
    elif split == "val":
        return data[int((split_percent)*n) : ]
    else:
        return None

In [16]:
train_data = get_data(tokenized_data,'train', train_percent = 0.8)
val_data = get_data(tokenized_data,'val', train_percent = 0.8)

In [17]:
len(train_data)

123214

In [18]:
# Special token ids
PAD_TOKEN = bpe_tokenizer.token_to_id('<pad>')
ST_TOKEN = bpe_tokenizer.token_to_id('<st>')
END_TOKEN = bpe_tokenizer.token_to_id('<end>')
FULL_STOP = bpe_tokenizer.token_to_id('.')
QUESTION_TOKEN = bpe_tokenizer.token_to_id('?')
EXCLAMATION = bpe_tokenizer.token_to_id('!')
special_ids = PAD_TOKEN, ST_TOKEN, END_TOKEN, FULL_STOP, QUESTION_TOKEN, EXCLAMATION
special_ids

(0, 1, 2, 15, 29, 4)

In [19]:
# Dataset
class SimpleDataset(Dataset):
    def __init__(self, data, special_ids, seq_len):
        super().__init__()
        self.token_ids = data
        self.seq_len = seq_len
        self.pad, self.st, self.end, self.full_stop, self.quest, self.excl= special_ids
        self.sentence_enders = {self.full_stop, self.quest, self.excl}
        
        self.inputs = []
        self.decoder_inputs = []
        self.decoder_targets = []
        
        
        self.create_sequences()
        
    def create_sequences(self):
        tokens = self.token_ids
        n = len(tokens)
        L = self.seq_len
        
        i = 0
        
        while i < n-1:
            enc_seq = []
            seq_end_idx = i 
            for j in range(L):
                current_idx = i + j
                if current_idx >= n:
                    break
                token = tokens[current_idx]
                
                enc_seq.append(token)
                seq_end_idx = current_idx
                
                if token in self.sentence_enders:
                    break
            
            
            if not enc_seq or len(enc_seq) < 2:
                i = seq_end_idx + 1 # Move to the next token
                continue

            dec_target_seq = enc_seq[1:] + [self.end]
            dec_input_seq = [self.st] + dec_target_seq[:-1]
                
            # Pad sequences if shorter
            enc_seq_len = len(enc_seq)
            enc_seq = enc_seq + [self.pad] * max(L - enc_seq_len, 0)
            dec_input_seq = dec_input_seq + [self.pad] * max(L - enc_seq_len, 0)
            dec_target_seq = dec_target_seq + [self.pad] * max(L - enc_seq_len, 0)

            self.inputs.append(enc_seq)
            self.decoder_inputs.append(dec_input_seq)
            self.decoder_targets.append(dec_target_seq)
            
            i = seq_end_idx + 1
    
    def __len__(self):
        return len(self.inputs)
    
    
    def __getitem__(self, idx):
        return (
            torch.tensor(self.inputs[idx], dtype=torch.long), # encoder input 50 timesteps
            torch.tensor(self.decoder_inputs[idx], dtype=torch.long), # decoder inputs shifter from target 50 steps
            torch.tensor(self.decoder_targets[idx], dtype=torch.long) # decoder targets 50 steps
        )

In [20]:
train_dataset = SimpleDataset(train_data, special_ids, SEQ_LEN)
val_dataset = SimpleDataset(val_data, special_ids, SEQ_LEN)

In [21]:
len(train_dataset), len(val_dataset)

(4445, 1350)

In [22]:
# Example
val_dataset[0]

(tensor([ 370, 2784,   13,  406,  876,   15,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0]),
 tensor([   1, 2784,   13,  406,  876,   15,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0]),
 tensor([2784,   13,  406,  876,   15,    2,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0, 

In [23]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [24]:
%%writefile model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

# ENCODER
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.enc_hidden_dim = hidden_dim
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=dropout
        )
    
    def forward(self, x):
        # x: [batch, seq_len]
        emb = self.embedding(x)  # [batch, seq_len, emb_dim]
        outputs, (h, c) = self.lstm(emb)  # outputs: [batch, seq_len, 2*hidden]
        return outputs, (h, c)

# ATTENTION 
class Attention(nn.Module):
    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super().__init__()
        
        # We need to align the dimensions of encoder outputs and decoder hidden state
        # enc_hidden_dim is for one direction, but encoder is bidirectional (2*)
        self.attn = nn.Linear((enc_hidden_dim * 2) + dec_hidden_dim, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias = False)
        
    def forward(self, dec_hidden, enc_outputs):
        # dec_hidden: [batch, dec_hidden_dim] (from the *top layer* of decoder)
        # enc_outputs: [batch, src_len, enc_hidden_dim * 2]
        
        batch_size = enc_outputs.shape[0]
        src_len = enc_outputs.shape[1]
        
        # Repeat decoder hidden state src_len times to concatenate
        # dec_hidden: [batch, src_len, dec_hidden_dim]
        dec_hidden = dec_hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        # energy: [batch, src_len, (enc_hidden * 2) + dec_hidden]
        energy_input = torch.cat((dec_hidden, enc_outputs), dim = 2)
        
        # energy: [batch, src_len, dec_hidden_dim]
        energy = torch.tanh(self.attn(energy_input))
        
        # v(energy): [batch, src_len, 1] -> [batch, src_len]
        attention = self.v(energy).squeeze(2)
        
        # Return softmax'd weights
        return F.softmax(attention, dim=1)

# Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, enc_hidden_dim, dec_hidden_dim, dropout=0.1):
        super().__init__()
        self.dec_hidden_dim = dec_hidden_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        
        self.attention = Attention(enc_hidden_dim, dec_hidden_dim)
        
        self.lstm = nn.LSTM(
            input_size=emb_dim + (enc_hidden_dim * 2),
            hidden_size=dec_hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )
        
        self.fc = nn.Linear(dec_hidden_dim + (enc_hidden_dim * 2), vocab_size)
    
    def forward(self, dec_input, dec_hidden, enc_outputs):
        # dec_input: [batch] -> current token IDs
        # dec_hidden: (h, c) from previous step
        # enc_outputs: [batch, src_len, 2*enc_hidden_dim]
        
        # dec_input [batch] -> [batch, 1]
        dec_input = dec_input.unsqueeze(1)
        
        # emb: [batch, 1, emb_dim]
        emb = self.embedding(dec_input)
        
       
        a = self.attention(dec_hidden[0][-1], enc_outputs)
        
        a = a.unsqueeze(1)
        
        context = torch.bmm(a, enc_outputs)
        
        lstm_input = torch.cat([emb, context], dim=2)
        output, dec_hidden = self.lstm(lstm_input, dec_hidden)
        
        output = output.squeeze(1)
        context = context.squeeze(1)

        concat_output = torch.cat([output, context], dim=1)
        
        logits = self.fc(concat_output)
        
        return logits, dec_hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        self.enc_num_layers = self.encoder.lstm.num_layers
        self.enc_num_directions = 2 if self.encoder.lstm.bidirectional else 1
        self.enc_hidden_dim = self.encoder.enc_hidden_dim
        self.dec_hidden_dim = self.decoder.dec_hidden_dim
        
        self.fc_hidden = nn.Linear(self.enc_hidden_dim * self.enc_num_directions, self.dec_hidden_dim)
        self.fc_cell = nn.Linear(self.enc_hidden_dim * self.enc_num_directions, self.dec_hidden_dim)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.fc.out_features
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        enc_outputs, (h, c) = self.encoder(src)
        
        h = h.view(self.enc_num_layers, self.enc_num_directions, batch_size, self.enc_hidden_dim)
        c = c.view(self.enc_num_layers, self.enc_num_directions, batch_size, self.enc_hidden_dim)
        
        h_cat = torch.cat((h[:, 0, :, :], h[:, 1, :, :]), dim=2)
        c_cat = torch.cat((c[:, 0, :, :], c[:, 1, :, :]), dim=2)
        
        dec_hidden = (torch.tanh(self.fc_hidden(h_cat)), 
                      torch.tanh(self.fc_cell(c_cat)))
        
        dec_input = trg[:, 0]
        
        # Loop from 0, store at t
        for t in range(trg_len):
            
            # The decoder's forward pass now includes the attention mechanism
            logits, dec_hidden = self.decoder(dec_input, dec_hidden, enc_outputs)
            
            outputs[:, t] = logits
            
            use_teacher_force = random.random() < teacher_forcing_ratio
            
            if use_teacher_force:
                if t < trg_len - 1:
                    dec_input = trg[:, t+1]
                else:
                    break
            else:
                top1 = logits.argmax(1)
                dec_input = top1
            
        return outputs

Writing model.py


In [25]:
from model import Encoder, Decoder, Seq2Seq

In [26]:
encoder = Encoder(VOCAB_SIZE, EMB_DIM, ENC_HIDDEN_DIM, dropout=DROPOUT)
decoder = Decoder(VOCAB_SIZE, EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, dropout=DROPOUT)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

In [27]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5000, 256)
    (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(5000, 256)
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (lstm): LSTM(1280, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=1536, out_features=5000, bias=True)
  )
  (fc_hidden): Linear(in_features=1024, out_features=512, bias=True)
  (fc_cell): Linear(in_features=1024, out_features=512, bias=True)
)

In [28]:
def load_ckpt(model):
    if os.path.exists(CHECKPOINT_DIR):
        models = [os.path.join(CHECKPOINT_DIR,ckpt) for ckpt in os.listdir(CHECKPOINT_DIR)]
        if len(models) > 0:
            sorted_models = sorted(models, key=lambda x: x.split("_val")[1].split('.pt')[0])
            model.load_state_dict(torch.load(sorted_models[0], map_location=DEVICE))
            print(f"Loaded checkpoint from {sorted_models[0]}")
            best_loss = float(os.path.basename(sorted_models[0]).split("_val")[1].split('.pt')[0])
            last_epoch = int(os.path.basename(sorted_models[0]).split("_val")[0].split('Epoch')[1])
            return best_loss, last_epoch
    

In [29]:
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)

In [30]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)

In [31]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for i, (src, dec_in, trg) in enumerate(iterator):
        
        src, dec_in, trg = src.to(DEVICE), dec_in.to(DEVICE), trg.to(DEVICE)
        
        optimizer.zero_grad()
        output = model(src, dec_in)
        
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        optimizer.step()
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [32]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, (src, dec_in, trg) in enumerate(iterator):
            src, dec_in, trg = src.to(DEVICE), dec_in.to(DEVICE), trg.to(DEVICE)
            output = model(src, dec_in)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [33]:
def generate_seq(model, texts, tokenizer, seq_len, special_ids, gen_len=100):
    model.eval()
    samples = []
    pad, st, end, full_stop, quest, excl= special_ids

    with torch.no_grad():
        for text in texts:
            # prepare encoder input
            text = normalize_text(text)
            ids = tokenizer.encode(text).ids
            if len(ids) < seq_len:
                ids = ids + [pad] * (seq_len - len(ids))
            else:
                print("Warning: More than seq length -- considering first 50 tokens:")
                ids = ids[:seq_len]

            src = torch.tensor([ids], dtype=torch.long, device=DEVICE)
            # print(src.shape)
            # print(src)

            # start decoder with only the start token (length = 1)
            dec_in = torch.tensor([[st]], dtype=torch.long, device=DEVICE)

            generated = []
            for step in range(gen_len):
                out = model(src, dec_in)          # [1, cur_dec_len, vocab_size]
                next_token = int(out[0, -1].argmax().cpu().item())  # last timestep prediction
                generated.append(next_token)
                
                # if next_token == end:
                #     break

                

                # append predicted token to decoder input for next step
                dec_in = torch.cat(
                    [dec_in, torch.tensor([[next_token]], dtype=torch.long, device=DEVICE)],
                    dim=1
                )
            # convert ids -> tokens
            # print("Out Tokens", generated)
            tokens = [tokenizer.id_to_token(tid) for tid in generated]
            samples.append((text, tokens))

    print("=== Sample generations ===")
    for idx, (inp, toks) in enumerate(samples, 1):
        print(f"[{idx}] INPUT : {inp}")
        print(f"OUTPUT: {' '.join(toks)}")
    print("======================================================")
    return samples

In [34]:
writer = SummaryWriter(LOG_DIR)

In [35]:
# text from internet
"""If I could have ceased what pendulums swung, or wheels turned, 
or water clocks emptied, then, in order to keep the Fates from marching in time,
I would have, for though it is what a boy naturally wishes when he fears change will 
come upon what he loves and take it away, a man remembers it, too, and in his heart 
wishes the same when all around him he feels only loss, loss that has been his 
companion for some time, and promises to remain at his side."""


"""
It is quite an interesting format, if you don't like the opening you are reading,
then you can skip and go to the next book opening. If the opening does capture your attention
you can click at the bottom to reveal the title and author. It reminds me of blind date books 
where bookstores will wrap a random book up in paper (to prevent the title and author from being seen)
and will write brief descriptions about it.


"""

"\nIt is quite an interesting format, if you don't like the opening you are reading,\nthen you can skip and go to the next book opening. If the opening does capture your attention\nyou can click at the bottom to reveal the title and author. It reminds me of blind date books \nwhere bookstores will wrap a random book up in paper (to prevent the title and author from being seen)\nand will write brief descriptions about it.\n\n\n"

In [36]:
CUSTOM_TEXTS = [
                "If I could have ceased",
                "It is quite an interesting format, if you"
        ]

In [37]:
def run_lm(predict_while_train = True, resume_ckpt = True):
    best_valid_loss = float('inf')
    last_epoch = 0
    if resume_ckpt:
        if os.listdir(CHECKPOINT_DIR):
            best_valid_loss, last_epoch = load_ckpt(model)
    
    for epoch in tqdm(range(N_EPOCHS), desc="Started Training"):
        epoch += last_epoch
        start_time = time.time()
        
        if predict_while_train:
            if epoch%5 == 0:
                generate_seq(model, CUSTOM_TEXTS, bpe_tokenizer, SEQ_LEN, special_ids, gen_len=50)

        train_loss = train(model, train_loader, optimizer, criterion)
        valid_loss = evaluate(model, val_loader, criterion)

        scheduler.step(valid_loss)
        
        end_time = time.time()
        
        epoch_mins = int((end_time - start_time) / 60)
        epoch_secs = int((end_time - start_time) - (epoch_mins * 60))

        # Save checkpoint if validation loss has improved
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, f'Epoch{epoch}_val{valid_loss}.pt'))
            print(f"Checkpoint saved: New best validation loss {best_valid_loss:.3f}")

        # TensorBoard logging
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Perplexity/train', math.exp(train_loss), epoch)
        writer.add_scalar('Loss/validation', valid_loss, epoch)
        writer.add_scalar('Perplexity/validation', math.exp(valid_loss), epoch)
        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [38]:
run_lm(predict_while_train = True, resume_ckpt = True)

Started Training:   0%|          | 0/200 [00:00<?, ?it/s]

=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: attra attra attra features <unk> perceived perceived perceived perceived cha attra attra features features <unk> <unk> perceived perceived perceived perceived perceived cha attra features features ra ra ra ra attra attra attra get get get get get get get get get get get get get get get get get get
[2] INPUT : it is quite an interesting format, if you
OUTPUT: attra song sought attra attra attra attra fa attra attra attra fa attra attra attra fa attra attra attra fa attra attra attra fa attra attra attra fa attra attra attra fa attra attra attra fa attra attra attra fa attra attra attra fa attra attra attra fa attra attra


Started Training:   0%|          | 1/200 [00:30<1:40:02, 30.16s/it]

Checkpoint saved: New best validation loss 6.226
Epoch: 01 | Time: 0m 29s
	Train Loss: 6.387 | Train PPL: 594.184
	 Val. Loss: 6.226 |  Val. PPL: 505.697


Started Training:   1%|          | 2/200 [00:57<1:34:31, 28.65s/it]

Checkpoint saved: New best validation loss 5.998
Epoch: 02 | Time: 0m 27s
	Train Loss: 5.870 | Train PPL: 354.408
	 Val. Loss: 5.998 |  Val. PPL: 402.700


Started Training:   2%|▏         | 3/200 [01:25<1:32:27, 28.16s/it]

Checkpoint saved: New best validation loss 5.760
Epoch: 03 | Time: 0m 27s
	Train Loss: 5.559 | Train PPL: 259.529
	 Val. Loss: 5.760 |  Val. PPL: 317.377


Started Training:   2%|▏         | 4/200 [01:52<1:31:19, 27.96s/it]

Checkpoint saved: New best validation loss 5.067
Epoch: 04 | Time: 0m 27s
	Train Loss: 5.068 | Train PPL: 158.791
	 Val. Loss: 5.067 |  Val. PPL: 158.692


Started Training:   2%|▎         | 5/200 [02:20<1:30:26, 27.83s/it]

Checkpoint saved: New best validation loss 4.460
Epoch: 05 | Time: 0m 27s
	Train Loss: 4.326 | Train PPL:  75.656
	 Val. Loss: 4.460 |  Val. PPL:  86.453
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: lived could have the . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: lived quite an faith , if you you ? <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:   3%|▎         | 6/200 [02:50<1:31:52, 28.42s/it]

Checkpoint saved: New best validation loss 4.114
Epoch: 06 | Time: 0m 29s
	Train Loss: 3.622 | Train PPL:  37.411
	 Val. Loss: 4.114 |  Val. PPL:  61.214


Started Training:   4%|▎         | 7/200 [03:17<1:30:32, 28.15s/it]

Checkpoint saved: New best validation loss 3.727
Epoch: 07 | Time: 0m 27s
	Train Loss: 2.990 | Train PPL:  19.890
	 Val. Loss: 3.727 |  Val. PPL:  41.571


Started Training:   4%|▍         | 8/200 [03:45<1:29:30, 27.97s/it]

Checkpoint saved: New best validation loss 3.360
Epoch: 08 | Time: 0m 27s
	Train Loss: 2.504 | Train PPL:  12.235
	 Val. Loss: 3.360 |  Val. PPL:  28.778


Started Training:   4%|▍         | 9/200 [04:12<1:28:39, 27.85s/it]

Checkpoint saved: New best validation loss 3.051
Epoch: 09 | Time: 0m 27s
	Train Loss: 2.042 | Train PPL:   7.705
	 Val. Loss: 3.051 |  Val. PPL:  21.128


Started Training:   5%|▌         | 10/200 [04:40<1:27:56, 27.77s/it]

Checkpoint saved: New best validation loss 2.962
Epoch: 10 | Time: 0m 27s
	Train Loss: 1.640 | Train PPL:   5.156
	 Val. Loss: 2.962 |  Val. PPL:  19.329
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: ! could have spent <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: nonsense quite an flir ful , if you . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:   6%|▌         | 11/200 [05:10<1:29:13, 28.32s/it]

Checkpoint saved: New best validation loss 2.589
Epoch: 11 | Time: 0m 29s
	Train Loss: 1.312 | Train PPL:   3.715
	 Val. Loss: 2.589 |  Val. PPL:  13.319


Started Training:   6%|▌         | 12/200 [05:37<1:28:00, 28.09s/it]

Checkpoint saved: New best validation loss 2.455
Epoch: 12 | Time: 0m 27s
	Train Loss: 1.017 | Train PPL:   2.765
	 Val. Loss: 2.455 |  Val. PPL:  11.648


Started Training:   6%|▋         | 13/200 [06:05<1:26:55, 27.89s/it]

Epoch: 13 | Time: 0m 27s
	Train Loss: 1.244 | Train PPL:   3.469
	 Val. Loss: 3.519 |  Val. PPL:  33.748


Started Training:   7%|▋         | 14/200 [06:32<1:26:10, 27.80s/it]

Checkpoint saved: New best validation loss 2.301
Epoch: 14 | Time: 0m 27s
	Train Loss: 1.108 | Train PPL:   3.030
	 Val. Loss: 2.301 |  Val. PPL:   9.987


Started Training:   8%|▊         | 15/200 [07:00<1:25:32, 27.74s/it]

Checkpoint saved: New best validation loss 2.205
Epoch: 15 | Time: 0m 27s
	Train Loss: 0.719 | Train PPL:   2.051
	 Val. Loss: 2.205 |  Val. PPL:   9.071
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: absurd could have fore <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: absurd quite an interesting appearance , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:   8%|▊         | 16/200 [07:29<1:26:46, 28.29s/it]

Checkpoint saved: New best validation loss 2.198
Epoch: 16 | Time: 0m 29s
	Train Loss: 0.614 | Train PPL:   1.848
	 Val. Loss: 2.198 |  Val. PPL:   9.011


Started Training:   8%|▊         | 17/200 [07:57<1:25:39, 28.08s/it]

Checkpoint saved: New best validation loss 2.108
Epoch: 17 | Time: 0m 27s
	Train Loss: 0.514 | Train PPL:   1.672
	 Val. Loss: 2.108 |  Val. PPL:   8.235


Started Training:   9%|▉         | 18/200 [08:24<1:24:42, 27.93s/it]

Checkpoint saved: New best validation loss 2.030
Epoch: 18 | Time: 0m 27s
	Train Loss: 0.451 | Train PPL:   1.570
	 Val. Loss: 2.030 |  Val. PPL:   7.611


Started Training:  10%|▉         | 19/200 [08:52<1:23:57, 27.83s/it]

Checkpoint saved: New best validation loss 2.019
Epoch: 19 | Time: 0m 27s
	Train Loss: 0.424 | Train PPL:   1.528
	 Val. Loss: 2.019 |  Val. PPL:   7.534


Started Training:  10%|█         | 20/200 [09:20<1:23:08, 27.72s/it]

Epoch: 20 | Time: 0m 27s
	Train Loss: 0.338 | Train PPL:   1.402
	 Val. Loss: 2.161 |  Val. PPL:   8.682
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have formerly <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an marry , , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  10%|█         | 21/200 [09:49<1:24:11, 28.22s/it]

Epoch: 21 | Time: 0m 29s
	Train Loss: 0.404 | Train PPL:   1.497
	 Val. Loss: 2.082 |  Val. PPL:   8.020


Started Training:  11%|█         | 22/200 [10:16<1:23:03, 28.00s/it]

Epoch: 22 | Time: 0m 27s
	Train Loss: 0.373 | Train PPL:   1.452
	 Val. Loss: 2.028 |  Val. PPL:   7.602


Started Training:  12%|█▏        | 23/200 [10:44<1:22:15, 27.89s/it]

Checkpoint saved: New best validation loss 1.979
Epoch: 23 | Time: 0m 27s
	Train Loss: 0.325 | Train PPL:   1.384
	 Val. Loss: 1.979 |  Val. PPL:   7.235


Started Training:  12%|█▏        | 24/200 [11:12<1:21:27, 27.77s/it]

Epoch: 24 | Time: 0m 27s
	Train Loss: 0.293 | Train PPL:   1.341
	 Val. Loss: 1.982 |  Val. PPL:   7.261


Started Training:  12%|█▎        | 25/200 [11:39<1:20:46, 27.69s/it]

Epoch: 25 | Time: 0m 27s
	Train Loss: 0.282 | Train PPL:   1.325
	 Val. Loss: 2.288 |  Val. PPL:   9.855
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: absurd could have some num . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ago , if you were miserable <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  13%|█▎        | 26/200 [12:09<1:21:52, 28.23s/it]

Epoch: 26 | Time: 0m 29s
	Train Loss: 0.308 | Train PPL:   1.361
	 Val. Loss: 2.261 |  Val. PPL:   9.594


Started Training:  14%|█▎        | 27/200 [12:36<1:20:49, 28.03s/it]

Checkpoint saved: New best validation loss 1.889
Epoch: 27 | Time: 0m 27s
	Train Loss: 0.291 | Train PPL:   1.337
	 Val. Loss: 1.889 |  Val. PPL:   6.614


Started Training:  14%|█▍        | 28/200 [13:04<1:19:49, 27.84s/it]

Epoch: 28 | Time: 0m 27s
	Train Loss: 0.226 | Train PPL:   1.254
	 Val. Loss: 2.094 |  Val. PPL:   8.121


Started Training:  14%|█▍        | 29/200 [13:31<1:19:02, 27.74s/it]

Epoch: 29 | Time: 0m 27s
	Train Loss: 0.212 | Train PPL:   1.236
	 Val. Loss: 2.018 |  Val. PPL:   7.524


Started Training:  15%|█▌        | 30/200 [13:58<1:18:20, 27.65s/it]

Epoch: 30 | Time: 0m 27s
	Train Loss: 0.210 | Train PPL:   1.233
	 Val. Loss: 1.891 |  Val. PPL:   6.625
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: of could have been ort <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting added , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  16%|█▌        | 31/200 [14:28<1:19:23, 28.18s/it]

Epoch: 31 | Time: 0m 29s
	Train Loss: 0.215 | Train PPL:   1.240
	 Val. Loss: 2.135 |  Val. PPL:   8.454


Started Training:  16%|█▌        | 32/200 [14:55<1:18:17, 27.96s/it]

Epoch: 32 | Time: 0m 27s
	Train Loss: 0.221 | Train PPL:   1.248
	 Val. Loss: 1.909 |  Val. PPL:   6.748


Started Training:  16%|█▋        | 33/200 [15:23<1:17:24, 27.81s/it]

Epoch: 33 | Time: 0m 27s
	Train Loss: 0.247 | Train PPL:   1.280
	 Val. Loss: 1.968 |  Val. PPL:   7.154


Started Training:  17%|█▋        | 34/200 [15:50<1:16:45, 27.74s/it]

Checkpoint saved: New best validation loss 1.792
Epoch: 34 | Time: 0m 27s
	Train Loss: 0.186 | Train PPL:   1.205
	 Val. Loss: 1.792 |  Val. PPL:   6.002


Started Training:  18%|█▊        | 35/200 [16:18<1:16:08, 27.69s/it]

Checkpoint saved: New best validation loss 1.773
Epoch: 35 | Time: 0m 27s
	Train Loss: 0.096 | Train PPL:   1.101
	 Val. Loss: 1.773 |  Val. PPL:   5.886
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: length could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: length quite an interesting tendency , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  18%|█▊        | 36/200 [16:47<1:17:12, 28.25s/it]

Checkpoint saved: New best validation loss 1.745
Epoch: 36 | Time: 0m 29s
	Train Loss: 0.067 | Train PPL:   1.069
	 Val. Loss: 1.745 |  Val. PPL:   5.724


Started Training:  18%|█▊        | 37/200 [17:15<1:16:12, 28.05s/it]

Checkpoint saved: New best validation loss 1.715
Epoch: 37 | Time: 0m 27s
	Train Loss: 0.067 | Train PPL:   1.069
	 Val. Loss: 1.715 |  Val. PPL:   5.557


Started Training:  19%|█▉        | 38/200 [17:43<1:15:23, 27.93s/it]

Checkpoint saved: New best validation loss 1.672
Epoch: 38 | Time: 0m 27s
	Train Loss: 0.057 | Train PPL:   1.058
	 Val. Loss: 1.672 |  Val. PPL:   5.323


Started Training:  20%|█▉        | 39/200 [18:10<1:14:42, 27.84s/it]

Checkpoint saved: New best validation loss 1.658
Epoch: 39 | Time: 0m 27s
	Train Loss: 0.044 | Train PPL:   1.045
	 Val. Loss: 1.658 |  Val. PPL:   5.251


Started Training:  20%|██        | 40/200 [18:38<1:14:07, 27.80s/it]

Epoch: 40 | Time: 0m 27s
	Train Loss: 0.039 | Train PPL:   1.040
	 Val. Loss: 1.714 |  Val. PPL:   5.553
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: length could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  20%|██        | 41/200 [19:08<1:14:59, 28.30s/it]

Epoch: 41 | Time: 0m 29s
	Train Loss: 0.036 | Train PPL:   1.037
	 Val. Loss: 1.702 |  Val. PPL:   5.482


Started Training:  21%|██        | 42/200 [19:35<1:13:51, 28.05s/it]

Epoch: 42 | Time: 0m 27s
	Train Loss: 0.034 | Train PPL:   1.034
	 Val. Loss: 1.667 |  Val. PPL:   5.294


Started Training:  22%|██▏       | 43/200 [20:02<1:12:55, 27.87s/it]

Epoch: 43 | Time: 0m 27s
	Train Loss: 0.034 | Train PPL:   1.034
	 Val. Loss: 1.665 |  Val. PPL:   5.288


Started Training:  22%|██▏       | 44/200 [20:30<1:12:09, 27.76s/it]

Epoch: 44 | Time: 0m 27s
	Train Loss: 0.029 | Train PPL:   1.029
	 Val. Loss: 1.663 |  Val. PPL:   5.275


Started Training:  22%|██▎       | 45/200 [20:57<1:11:28, 27.67s/it]

Epoch: 45 | Time: 0m 27s
	Train Loss: 0.030 | Train PPL:   1.031
	 Val. Loss: 1.736 |  Val. PPL:   5.677
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: length could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  23%|██▎       | 46/200 [21:27<1:12:21, 28.19s/it]

Epoch: 46 | Time: 0m 29s
	Train Loss: 0.035 | Train PPL:   1.036
	 Val. Loss: 1.671 |  Val. PPL:   5.315


Started Training:  24%|██▎       | 47/200 [21:54<1:11:18, 27.97s/it]

Epoch: 47 | Time: 0m 27s
	Train Loss: 0.024 | Train PPL:   1.024
	 Val. Loss: 1.674 |  Val. PPL:   5.332


Started Training:  24%|██▍       | 48/200 [22:22<1:10:33, 27.85s/it]

Checkpoint saved: New best validation loss 1.612
Epoch: 48 | Time: 0m 27s
	Train Loss: 0.026 | Train PPL:   1.027
	 Val. Loss: 1.612 |  Val. PPL:   5.011


Started Training:  24%|██▍       | 49/200 [22:49<1:09:48, 27.74s/it]

Epoch: 49 | Time: 0m 27s
	Train Loss: 0.024 | Train PPL:   1.024
	 Val. Loss: 1.628 |  Val. PPL:   5.095


Started Training:  25%|██▌       | 50/200 [23:17<1:09:13, 27.69s/it]

Epoch: 50 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.624 |  Val. PPL:   5.073
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: length could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  26%|██▌       | 51/200 [23:46<1:10:07, 28.24s/it]

Epoch: 51 | Time: 0m 29s
	Train Loss: 0.026 | Train PPL:   1.026
	 Val. Loss: 1.650 |  Val. PPL:   5.209


Started Training:  26%|██▌       | 52/200 [24:14<1:09:09, 28.04s/it]

Epoch: 52 | Time: 0m 27s
	Train Loss: 0.026 | Train PPL:   1.026
	 Val. Loss: 1.645 |  Val. PPL:   5.182


Started Training:  26%|██▋       | 53/200 [24:42<1:08:20, 27.89s/it]

Epoch: 53 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.644 |  Val. PPL:   5.175


Started Training:  27%|██▋       | 54/200 [25:09<1:07:35, 27.78s/it]

Epoch: 54 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.678 |  Val. PPL:   5.354


Started Training:  28%|██▊       | 55/200 [25:36<1:06:54, 27.69s/it]

Epoch: 55 | Time: 0m 27s
	Train Loss: 0.023 | Train PPL:   1.023
	 Val. Loss: 1.626 |  Val. PPL:   5.082
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: length could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  28%|██▊       | 56/200 [26:06<1:07:44, 28.23s/it]

Epoch: 56 | Time: 0m 29s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.625 |  Val. PPL:   5.076


Started Training:  28%|██▊       | 57/200 [26:33<1:06:45, 28.01s/it]

Epoch: 57 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.627 |  Val. PPL:   5.091


Started Training:  29%|██▉       | 58/200 [27:01<1:05:56, 27.86s/it]

Epoch: 58 | Time: 0m 27s
	Train Loss: 0.020 | Train PPL:   1.020
	 Val. Loss: 1.706 |  Val. PPL:   5.507


Started Training:  30%|██▉       | 59/200 [27:28<1:05:12, 27.75s/it]

Epoch: 59 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.644 |  Val. PPL:   5.176


Started Training:  30%|███       | 60/200 [27:56<1:04:33, 27.67s/it]

Epoch: 60 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.678 |  Val. PPL:   5.352
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  30%|███       | 61/200 [28:25<1:05:20, 28.20s/it]

Epoch: 61 | Time: 0m 29s
	Train Loss: 0.018 | Train PPL:   1.019
	 Val. Loss: 1.632 |  Val. PPL:   5.114


Started Training:  31%|███       | 62/200 [28:53<1:04:22, 27.99s/it]

Epoch: 62 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.613 |  Val. PPL:   5.018


Started Training:  32%|███▏      | 63/200 [29:21<1:03:38, 27.87s/it]

Checkpoint saved: New best validation loss 1.605
Epoch: 63 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.605 |  Val. PPL:   4.978


Started Training:  32%|███▏      | 64/200 [29:48<1:02:52, 27.74s/it]

Epoch: 64 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.018
	 Val. Loss: 1.610 |  Val. PPL:   5.003


Started Training:  32%|███▎      | 65/200 [30:15<1:02:13, 27.66s/it]

Epoch: 65 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.613 |  Val. PPL:   5.019
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  33%|███▎      | 66/200 [30:45<1:02:59, 28.21s/it]

Epoch: 66 | Time: 0m 29s
	Train Loss: 0.015 | Train PPL:   1.016
	 Val. Loss: 1.636 |  Val. PPL:   5.137


Started Training:  34%|███▎      | 67/200 [31:12<1:02:01, 27.98s/it]

Epoch: 67 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.646 |  Val. PPL:   5.184


Started Training:  34%|███▍      | 68/200 [31:40<1:01:13, 27.83s/it]

Epoch: 68 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.636 |  Val. PPL:   5.135


Started Training:  34%|███▍      | 69/200 [32:07<1:00:31, 27.72s/it]

Epoch: 69 | Time: 0m 27s
	Train Loss: 0.021 | Train PPL:   1.021
	 Val. Loss: 1.615 |  Val. PPL:   5.027


Started Training:  35%|███▌      | 70/200 [32:35<59:53, 27.64s/it]  

Epoch: 70 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.676 |  Val. PPL:   5.346
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  36%|███▌      | 71/200 [33:04<1:00:34, 28.18s/it]

Epoch: 71 | Time: 0m 29s
	Train Loss: 0.021 | Train PPL:   1.021
	 Val. Loss: 1.668 |  Val. PPL:   5.303


Started Training:  36%|███▌      | 72/200 [33:32<59:39, 27.97s/it]  

Epoch: 72 | Time: 0m 27s
	Train Loss: 0.014 | Train PPL:   1.015
	 Val. Loss: 1.696 |  Val. PPL:   5.452


Started Training:  36%|███▋      | 73/200 [33:59<58:52, 27.82s/it]

Epoch: 73 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.656 |  Val. PPL:   5.239


Started Training:  37%|███▋      | 74/200 [34:27<58:11, 27.71s/it]

Epoch: 74 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.768 |  Val. PPL:   5.861


Started Training:  38%|███▊      | 75/200 [34:54<57:34, 27.63s/it]

Epoch: 75 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.634 |  Val. PPL:   5.126
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  38%|███▊      | 76/200 [35:23<58:13, 28.17s/it]

Epoch: 76 | Time: 0m 29s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.634 |  Val. PPL:   5.127


Started Training:  38%|███▊      | 77/200 [35:51<57:19, 27.96s/it]

Epoch: 77 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.018
	 Val. Loss: 1.675 |  Val. PPL:   5.341


Started Training:  39%|███▉      | 78/200 [36:18<56:32, 27.81s/it]

Epoch: 78 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.646 |  Val. PPL:   5.188


Started Training:  40%|███▉      | 79/200 [36:46<55:56, 27.74s/it]

Checkpoint saved: New best validation loss 1.604
Epoch: 79 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.604 |  Val. PPL:   4.973


Started Training:  40%|████      | 80/200 [37:13<55:18, 27.66s/it]

Epoch: 80 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.650 |  Val. PPL:   5.205
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  40%|████      | 81/200 [37:43<55:53, 28.18s/it]

Epoch: 81 | Time: 0m 29s
	Train Loss: 0.021 | Train PPL:   1.021
	 Val. Loss: 1.621 |  Val. PPL:   5.057


Started Training:  41%|████      | 82/200 [38:10<55:00, 27.97s/it]

Epoch: 82 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.623 |  Val. PPL:   5.070


Started Training:  42%|████▏     | 83/200 [38:38<54:12, 27.80s/it]

Epoch: 83 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.718 |  Val. PPL:   5.573


Started Training:  42%|████▏     | 84/200 [39:05<53:32, 27.70s/it]

Epoch: 84 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.018
	 Val. Loss: 1.657 |  Val. PPL:   5.245


Started Training:  42%|████▎     | 85/200 [39:33<52:56, 27.62s/it]

Epoch: 85 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.644 |  Val. PPL:   5.175
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  43%|████▎     | 86/200 [40:02<53:32, 28.18s/it]

Epoch: 86 | Time: 0m 29s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.628 |  Val. PPL:   5.096


Started Training:  44%|████▎     | 87/200 [40:30<52:41, 27.97s/it]

Epoch: 87 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.619 |  Val. PPL:   5.048


Started Training:  44%|████▍     | 88/200 [40:57<51:56, 27.83s/it]

Epoch: 88 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.643 |  Val. PPL:   5.168


Started Training:  44%|████▍     | 89/200 [41:25<51:18, 27.73s/it]

Epoch: 89 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.666 |  Val. PPL:   5.289


Started Training:  45%|████▌     | 90/200 [41:52<50:41, 27.65s/it]

Epoch: 90 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.681 |  Val. PPL:   5.370
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  46%|████▌     | 91/200 [42:22<51:14, 28.21s/it]

Epoch: 91 | Time: 0m 29s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.665 |  Val. PPL:   5.284


Started Training:  46%|████▌     | 92/200 [42:49<50:23, 28.00s/it]

Epoch: 92 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.017
	 Val. Loss: 1.668 |  Val. PPL:   5.301


Started Training:  46%|████▋     | 93/200 [43:17<49:39, 27.84s/it]

Epoch: 93 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.617 |  Val. PPL:   5.036


Started Training:  47%|████▋     | 94/200 [43:44<49:01, 27.75s/it]

Epoch: 94 | Time: 0m 27s
	Train Loss: 0.020 | Train PPL:   1.020
	 Val. Loss: 1.645 |  Val. PPL:   5.180


Started Training:  48%|████▊     | 95/200 [44:12<48:25, 27.67s/it]

Epoch: 95 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.681 |  Val. PPL:   5.372
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  48%|████▊     | 96/200 [44:41<48:54, 28.22s/it]

Epoch: 96 | Time: 0m 29s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.615 |  Val. PPL:   5.027


Started Training:  48%|████▊     | 97/200 [45:09<48:05, 28.01s/it]

Epoch: 97 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.644 |  Val. PPL:   5.178


Started Training:  49%|████▉     | 98/200 [45:36<47:22, 27.87s/it]

Epoch: 98 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.695 |  Val. PPL:   5.445


Started Training:  50%|████▉     | 99/200 [46:04<46:43, 27.76s/it]

Epoch: 99 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.643 |  Val. PPL:   5.171


Started Training:  50%|█████     | 100/200 [46:31<46:08, 27.69s/it]

Epoch: 100 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.672 |  Val. PPL:   5.320
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  50%|█████     | 101/200 [47:01<46:32, 28.21s/it]

Epoch: 101 | Time: 0m 29s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.677 |  Val. PPL:   5.347


Started Training:  51%|█████     | 102/200 [47:28<45:42, 27.98s/it]

Epoch: 102 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.654 |  Val. PPL:   5.229


Started Training:  52%|█████▏    | 103/200 [47:55<44:59, 27.83s/it]

Epoch: 103 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.611 |  Val. PPL:   5.009


Started Training:  52%|█████▏    | 104/200 [48:23<44:25, 27.76s/it]

Checkpoint saved: New best validation loss 1.601
Epoch: 104 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.601 |  Val. PPL:   4.958


Started Training:  52%|█████▎    | 105/200 [48:51<43:48, 27.67s/it]

Epoch: 105 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.020
	 Val. Loss: 1.658 |  Val. PPL:   5.250
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  53%|█████▎    | 106/200 [49:20<44:11, 28.20s/it]

Epoch: 106 | Time: 0m 29s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.691 |  Val. PPL:   5.426


Started Training:  54%|█████▎    | 107/200 [49:47<43:21, 27.98s/it]

Epoch: 107 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.712 |  Val. PPL:   5.540


Started Training:  54%|█████▍    | 108/200 [50:15<42:40, 27.83s/it]

Epoch: 108 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.676 |  Val. PPL:   5.346


Started Training:  55%|█████▍    | 109/200 [50:42<42:02, 27.72s/it]

Epoch: 109 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.670 |  Val. PPL:   5.314


Started Training:  55%|█████▌    | 110/200 [51:10<41:31, 27.68s/it]

Checkpoint saved: New best validation loss 1.601
Epoch: 110 | Time: 0m 27s
	Train Loss: 0.020 | Train PPL:   1.020
	 Val. Loss: 1.601 |  Val. PPL:   4.958
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  56%|█████▌    | 111/200 [51:39<41:51, 28.22s/it]

Epoch: 111 | Time: 0m 29s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.640 |  Val. PPL:   5.154


Started Training:  56%|█████▌    | 112/200 [52:07<41:02, 27.99s/it]

Epoch: 112 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.639 |  Val. PPL:   5.151


Started Training:  56%|█████▋    | 113/200 [52:34<40:21, 27.83s/it]

Epoch: 113 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.017
	 Val. Loss: 1.670 |  Val. PPL:   5.313


Started Training:  57%|█████▋    | 114/200 [53:02<39:44, 27.72s/it]

Epoch: 114 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.630 |  Val. PPL:   5.102


Started Training:  57%|█████▊    | 115/200 [53:29<39:09, 27.65s/it]

Epoch: 115 | Time: 0m 27s
	Train Loss: 0.014 | Train PPL:   1.015
	 Val. Loss: 1.613 |  Val. PPL:   5.020
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  58%|█████▊    | 116/200 [53:59<39:27, 28.18s/it]

Epoch: 116 | Time: 0m 29s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.662 |  Val. PPL:   5.270


Started Training:  58%|█████▊    | 117/200 [54:26<38:42, 27.98s/it]

Epoch: 117 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.020
	 Val. Loss: 1.715 |  Val. PPL:   5.557


Started Training:  59%|█████▉    | 118/200 [54:54<38:02, 27.83s/it]

Epoch: 118 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.629 |  Val. PPL:   5.096


Started Training:  60%|█████▉    | 119/200 [55:21<37:28, 27.76s/it]

Checkpoint saved: New best validation loss 1.599
Epoch: 119 | Time: 0m 27s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.599 |  Val. PPL:   4.950


Started Training:  60%|██████    | 120/200 [55:49<36:54, 27.68s/it]

Epoch: 120 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.020
	 Val. Loss: 1.631 |  Val. PPL:   5.111
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  60%|██████    | 121/200 [56:18<37:08, 28.21s/it]

Epoch: 121 | Time: 0m 29s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.620 |  Val. PPL:   5.055


Started Training:  61%|██████    | 122/200 [56:46<36:23, 27.99s/it]

Epoch: 122 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.613 |  Val. PPL:   5.017


Started Training:  62%|██████▏   | 123/200 [57:13<35:42, 27.83s/it]

Epoch: 123 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.626 |  Val. PPL:   5.085


Started Training:  62%|██████▏   | 124/200 [57:41<35:07, 27.73s/it]

Epoch: 124 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.018
	 Val. Loss: 1.608 |  Val. PPL:   4.991


Started Training:  62%|██████▎   | 125/200 [58:08<34:33, 27.64s/it]

Epoch: 125 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.636 |  Val. PPL:   5.134
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  63%|██████▎   | 126/200 [58:38<34:44, 28.17s/it]

Epoch: 126 | Time: 0m 29s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.631 |  Val. PPL:   5.111


Started Training:  64%|██████▎   | 127/200 [59:05<34:00, 27.96s/it]

Epoch: 127 | Time: 0m 27s
	Train Loss: 0.013 | Train PPL:   1.013
	 Val. Loss: 1.648 |  Val. PPL:   5.198


Started Training:  64%|██████▍   | 128/200 [59:32<33:22, 27.81s/it]

Epoch: 128 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.683 |  Val. PPL:   5.379


Started Training:  64%|██████▍   | 129/200 [1:00:00<32:47, 27.71s/it]

Epoch: 129 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.020
	 Val. Loss: 1.635 |  Val. PPL:   5.127


Started Training:  65%|██████▌   | 130/200 [1:00:27<32:14, 27.63s/it]

Epoch: 130 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.612 |  Val. PPL:   5.013
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  66%|██████▌   | 131/200 [1:00:57<32:24, 28.18s/it]

Epoch: 131 | Time: 0m 29s
	Train Loss: 0.020 | Train PPL:   1.021
	 Val. Loss: 1.669 |  Val. PPL:   5.304


Started Training:  66%|██████▌   | 132/200 [1:01:24<31:42, 27.98s/it]

Epoch: 132 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.647 |  Val. PPL:   5.191


Started Training:  66%|██████▋   | 133/200 [1:01:52<31:04, 27.82s/it]

Epoch: 133 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.614 |  Val. PPL:   5.025


Started Training:  67%|██████▋   | 134/200 [1:02:19<30:29, 27.72s/it]

Epoch: 134 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.626 |  Val. PPL:   5.084


Started Training:  68%|██████▊   | 135/200 [1:02:47<29:56, 27.64s/it]

Epoch: 135 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.649 |  Val. PPL:   5.203
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  68%|██████▊   | 136/200 [1:03:16<30:03, 28.18s/it]

Epoch: 136 | Time: 0m 29s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.605 |  Val. PPL:   4.977


Started Training:  68%|██████▊   | 137/200 [1:03:44<29:22, 27.97s/it]

Epoch: 137 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.666 |  Val. PPL:   5.293


Started Training:  69%|██████▉   | 138/200 [1:04:11<28:44, 27.82s/it]

Epoch: 138 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.614 |  Val. PPL:   5.022


Started Training:  70%|██████▉   | 139/200 [1:04:39<28:10, 27.72s/it]

Epoch: 139 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.671 |  Val. PPL:   5.317


Started Training:  70%|███████   | 140/200 [1:05:06<27:38, 27.64s/it]

Epoch: 140 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.712 |  Val. PPL:   5.538
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  70%|███████   | 141/200 [1:05:35<27:42, 28.18s/it]

Epoch: 141 | Time: 0m 29s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.648 |  Val. PPL:   5.198


Started Training:  71%|███████   | 142/200 [1:06:03<27:02, 27.97s/it]

Epoch: 142 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.672 |  Val. PPL:   5.320


Started Training:  72%|███████▏  | 143/200 [1:06:30<26:25, 27.82s/it]

Epoch: 143 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.017
	 Val. Loss: 1.651 |  Val. PPL:   5.212


Started Training:  72%|███████▏  | 144/200 [1:06:58<25:52, 27.72s/it]

Epoch: 144 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.652 |  Val. PPL:   5.218


Started Training:  72%|███████▎  | 145/200 [1:07:25<25:20, 27.65s/it]

Epoch: 145 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.604 |  Val. PPL:   4.972
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  73%|███████▎  | 146/200 [1:07:55<25:22, 28.19s/it]

Epoch: 146 | Time: 0m 29s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.657 |  Val. PPL:   5.246


Started Training:  74%|███████▎  | 147/200 [1:08:22<24:42, 27.97s/it]

Epoch: 147 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.629 |  Val. PPL:   5.100


Started Training:  74%|███████▍  | 148/200 [1:08:50<24:07, 27.83s/it]

Epoch: 148 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.663 |  Val. PPL:   5.274


Started Training:  74%|███████▍  | 149/200 [1:09:17<23:33, 27.72s/it]

Epoch: 149 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.668 |  Val. PPL:   5.300


Started Training:  75%|███████▌  | 150/200 [1:09:45<23:02, 27.65s/it]

Epoch: 150 | Time: 0m 27s
	Train Loss: 0.020 | Train PPL:   1.020
	 Val. Loss: 1.658 |  Val. PPL:   5.249
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  76%|███████▌  | 151/200 [1:10:14<23:00, 28.17s/it]

Epoch: 151 | Time: 0m 29s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.649 |  Val. PPL:   5.201


Started Training:  76%|███████▌  | 152/200 [1:10:42<22:21, 27.95s/it]

Epoch: 152 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.681 |  Val. PPL:   5.369


Started Training:  76%|███████▋  | 153/200 [1:11:09<21:48, 27.84s/it]

Epoch: 153 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.647 |  Val. PPL:   5.192


Started Training:  77%|███████▋  | 154/200 [1:11:37<21:16, 27.76s/it]

Epoch: 154 | Time: 0m 27s
	Train Loss: 0.013 | Train PPL:   1.013
	 Val. Loss: 1.644 |  Val. PPL:   5.174


Started Training:  78%|███████▊  | 155/200 [1:12:04<20:45, 27.69s/it]

Epoch: 155 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.627 |  Val. PPL:   5.090
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  78%|███████▊  | 156/200 [1:12:34<20:41, 28.22s/it]

Epoch: 156 | Time: 0m 29s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.640 |  Val. PPL:   5.153


Started Training:  78%|███████▊  | 157/200 [1:13:01<20:03, 27.99s/it]

Epoch: 157 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.622 |  Val. PPL:   5.062


Started Training:  79%|███████▉  | 158/200 [1:13:29<19:29, 27.84s/it]

Epoch: 158 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.019
	 Val. Loss: 1.634 |  Val. PPL:   5.125


Started Training:  80%|███████▉  | 159/200 [1:13:56<18:57, 27.73s/it]

Epoch: 159 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.649 |  Val. PPL:   5.204


Started Training:  80%|████████  | 160/200 [1:14:24<18:26, 27.67s/it]

Epoch: 160 | Time: 0m 27s
	Train Loss: 0.020 | Train PPL:   1.021
	 Val. Loss: 1.639 |  Val. PPL:   5.152
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  80%|████████  | 161/200 [1:14:53<18:20, 28.21s/it]

Epoch: 161 | Time: 0m 29s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.686 |  Val. PPL:   5.397


Started Training:  81%|████████  | 162/200 [1:15:21<17:43, 27.99s/it]

Epoch: 162 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.603 |  Val. PPL:   4.967


Started Training:  82%|████████▏ | 163/200 [1:15:48<17:09, 27.83s/it]

Epoch: 163 | Time: 0m 27s
	Train Loss: 0.021 | Train PPL:   1.021
	 Val. Loss: 1.629 |  Val. PPL:   5.099


Started Training:  82%|████████▏ | 164/200 [1:16:16<16:38, 27.73s/it]

Epoch: 164 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.631 |  Val. PPL:   5.107


Started Training:  82%|████████▎ | 165/200 [1:16:43<16:07, 27.65s/it]

Epoch: 165 | Time: 0m 27s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.672 |  Val. PPL:   5.324
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  83%|████████▎ | 166/200 [1:17:12<15:58, 28.19s/it]

Epoch: 166 | Time: 0m 29s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.658 |  Val. PPL:   5.248


Started Training:  84%|████████▎ | 167/200 [1:17:40<15:23, 27.98s/it]

Epoch: 167 | Time: 0m 27s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.633 |  Val. PPL:   5.117


Started Training:  84%|████████▍ | 168/200 [1:18:07<14:50, 27.83s/it]

Epoch: 168 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.643 |  Val. PPL:   5.170


Started Training:  84%|████████▍ | 169/200 [1:18:35<14:19, 27.73s/it]

Epoch: 169 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.631 |  Val. PPL:   5.107


Started Training:  85%|████████▌ | 170/200 [1:19:02<13:49, 27.65s/it]

Epoch: 170 | Time: 0m 27s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.632 |  Val. PPL:   5.116
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  86%|████████▌ | 171/200 [1:19:32<13:37, 28.20s/it]

Epoch: 171 | Time: 0m 29s
	Train Loss: 0.013 | Train PPL:   1.013
	 Val. Loss: 1.647 |  Val. PPL:   5.193


Started Training:  86%|████████▌ | 172/200 [1:19:59<13:03, 27.98s/it]

Epoch: 172 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.017
	 Val. Loss: 1.648 |  Val. PPL:   5.199


Started Training:  86%|████████▋ | 173/200 [1:20:27<12:31, 27.84s/it]

Epoch: 173 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.639 |  Val. PPL:   5.152


Started Training:  87%|████████▋ | 174/200 [1:20:54<12:01, 27.76s/it]

Epoch: 174 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.625 |  Val. PPL:   5.079


Started Training:  88%|████████▊ | 175/200 [1:21:22<11:32, 27.69s/it]

Epoch: 175 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.648 |  Val. PPL:   5.198
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  88%|████████▊ | 176/200 [1:21:52<11:18, 28.25s/it]

Epoch: 176 | Time: 0m 29s
	Train Loss: 0.013 | Train PPL:   1.013
	 Val. Loss: 1.644 |  Val. PPL:   5.173


Started Training:  88%|████████▊ | 177/200 [1:22:19<10:45, 28.05s/it]

Epoch: 177 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.675 |  Val. PPL:   5.336


Started Training:  89%|████████▉ | 178/200 [1:22:47<10:14, 27.91s/it]

Epoch: 178 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.599 |  Val. PPL:   4.950


Started Training:  90%|████████▉ | 179/200 [1:23:14<09:43, 27.80s/it]

Epoch: 179 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.638 |  Val. PPL:   5.146


Started Training:  90%|█████████ | 180/200 [1:23:42<09:14, 27.72s/it]

Epoch: 180 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.629 |  Val. PPL:   5.097
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  90%|█████████ | 181/200 [1:24:11<08:56, 28.26s/it]

Epoch: 181 | Time: 0m 29s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.651 |  Val. PPL:   5.211


Started Training:  91%|█████████ | 182/200 [1:24:39<08:24, 28.04s/it]

Epoch: 182 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.017
	 Val. Loss: 1.681 |  Val. PPL:   5.372


Started Training:  92%|█████████▏| 183/200 [1:25:06<07:53, 27.87s/it]

Epoch: 183 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.604 |  Val. PPL:   4.972


Started Training:  92%|█████████▏| 184/200 [1:25:34<07:24, 27.77s/it]

Epoch: 184 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.648 |  Val. PPL:   5.196


Started Training:  92%|█████████▎| 185/200 [1:26:01<06:55, 27.70s/it]

Epoch: 185 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.017
	 Val. Loss: 1.664 |  Val. PPL:   5.282
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  93%|█████████▎| 186/200 [1:26:31<06:35, 28.22s/it]

Epoch: 186 | Time: 0m 29s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.656 |  Val. PPL:   5.241


Started Training:  94%|█████████▎| 187/200 [1:26:58<06:04, 28.00s/it]

Epoch: 187 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.694 |  Val. PPL:   5.440


Started Training:  94%|█████████▍| 188/200 [1:27:26<05:34, 27.86s/it]

Epoch: 188 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.613 |  Val. PPL:   5.018


Started Training:  94%|█████████▍| 189/200 [1:27:53<05:05, 27.76s/it]

Epoch: 189 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.619 |  Val. PPL:   5.046


Started Training:  95%|█████████▌| 190/200 [1:28:21<04:36, 27.68s/it]

Epoch: 190 | Time: 0m 27s
	Train Loss: 0.019 | Train PPL:   1.019
	 Val. Loss: 1.619 |  Val. PPL:   5.048
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  96%|█████████▌| 191/200 [1:28:50<04:14, 28.23s/it]

Epoch: 191 | Time: 0m 29s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.698 |  Val. PPL:   5.461


Started Training:  96%|█████████▌| 192/200 [1:29:18<03:44, 28.02s/it]

Epoch: 192 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.642 |  Val. PPL:   5.167


Started Training:  96%|█████████▋| 193/200 [1:29:45<03:15, 27.88s/it]

Epoch: 193 | Time: 0m 27s
	Train Loss: 0.016 | Train PPL:   1.016
	 Val. Loss: 1.624 |  Val. PPL:   5.072


Started Training:  97%|█████████▋| 194/200 [1:30:13<02:46, 27.77s/it]

Epoch: 194 | Time: 0m 27s
	Train Loss: 0.018 | Train PPL:   1.018
	 Val. Loss: 1.688 |  Val. PPL:   5.411


Started Training:  98%|█████████▊| 195/200 [1:30:40<02:18, 27.69s/it]

Epoch: 195 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.634 |  Val. PPL:   5.125
=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: fixed could have some <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: fixed quite an interesting ence , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  98%|█████████▊| 196/200 [1:31:10<01:52, 28.23s/it]

Epoch: 196 | Time: 0m 29s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.664 |  Val. PPL:   5.282


Started Training:  98%|█████████▊| 197/200 [1:31:37<01:24, 28.01s/it]

Epoch: 197 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.630 |  Val. PPL:   5.105


Started Training:  99%|█████████▉| 198/200 [1:32:05<00:55, 27.85s/it]

Epoch: 198 | Time: 0m 27s
	Train Loss: 0.014 | Train PPL:   1.014
	 Val. Loss: 1.628 |  Val. PPL:   5.095


Started Training: 100%|█████████▉| 199/200 [1:32:32<00:27, 27.74s/it]

Epoch: 199 | Time: 0m 27s
	Train Loss: 0.015 | Train PPL:   1.015
	 Val. Loss: 1.625 |  Val. PPL:   5.077


Started Training: 100%|██████████| 200/200 [1:33:00<00:00, 27.90s/it]

Epoch: 200 | Time: 0m 27s
	Train Loss: 0.017 | Train PPL:   1.017
	 Val. Loss: 1.649 |  Val. PPL:   5.202





In [39]:
generate_seq(model, ["Hi, introduce yourself"], bpe_tokenizer, SEQ_LEN, special_ids, gen_len=100)

=== Sample generations ===
[1] INPUT : hi, introduce yourself
OUTPUT: length exceedingly yourself . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


[('hi, introduce yourself',
  ['length',
   'exceedingly',
   'yourself',
   '.',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<e