In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import math
import time
import os
import random
import random
import re
import unicodedata
from tqdm import tqdm

2025-11-14 16:31:23.530518: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763137883.738580      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763137883.795155      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [2]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tokenizers.trainers import BpeTrainer

In [3]:
# Config
seed = 1234
DATA_DIR = 'data'
SEQ_LEN = 100
VOCAB_SIZE = 5000
EMB_DIM = 256
ENC_HIDDEN_DIM = 512
DEC_HIDDEN_DIM = 512
DROPOUT = 0.3
N_EPOCHS = 200
LEARNING_RATE = 1e-2
BATCH_SIZE = 64
# Output directories
CHECKPOINT_DIR = 'checkpoints'
LOG_DIR = 'runs'
tokenizer_path = "tknzer_dir"
special_tokens = ["<pad>", "<st>", "<end>", "<unk>"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
seed_everything(seed)

In [5]:
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

In [6]:
def load_txt(txt_file):
    with open(txt_file, "r", encoding="utf-8") as f:
        text = f.read()
    return text

In [7]:
dataset_path = "/kaggle/input/iiith-assignment2-dataset/dataset/Pride_and_Prejudice-Jane_Austen.txt"

In [8]:
def normalize_text(text, lowercase = True):
    if lowercase:
        text = text.lower()
    # Unicode Normalization (e.g., converting fancy quotes to standard ones)
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[\*_`\[\]\^{}]', '', text) # handle _word_ and *word*
    text = re.sub(r'\b(mr|mrs|ms|dr|st)\.', r'\1', text, flags=re.IGNORECASE) # handle mr. mrs. etc
    text = text.strip()
    return text

In [9]:
custom_dataset = normalize_text(load_txt(dataset_path))

In [10]:
len(set(custom_dataset)), len(list(custom_dataset))

(56, 702635)

In [11]:
def get_tokenizer(dataset, vocab_size, save_fldr, name = 'bpe'):
    if os.path.exists(save_fldr):
        save_path = os.path.join(save_fldr, 'tokenizer.json')
        return Tokenizer.from_file(save_path)
    if name == 'bpe':
        tokenizer = Tokenizer(models.BPE())
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
        tokenizer.train_from_iterator([dataset], trainer)
        if save_fldr:
            os.makedirs(save_fldr, exist_ok=True)
            tokenizer.save(os.path.join(save_fldr, 'tokenizer.json'))
            return tokenizer

In [12]:
bpe_tokenizer = get_tokenizer(custom_dataset, VOCAB_SIZE, tokenizer_path, 'bpe')






In [13]:
tokenized_data = bpe_tokenizer.encode(custom_dataset).ids

In [14]:
len(tokenized_data)

154018

In [15]:
def get_data(data, split, train_percent = 0.8):
    n = len(data)
    split_percent = train_percent
    if split == "train":
        return data[ : int((split_percent)*n)]
    elif split == "val":
        return data[int((split_percent)*n) : ]
    else:
        return None

In [16]:
train_data = get_data(tokenized_data,'train', train_percent = 0.8)
val_data = get_data(tokenized_data,'val', train_percent = 0.8)

In [17]:
len(train_data)

123214

In [18]:
# Special token ids
PAD_TOKEN = bpe_tokenizer.token_to_id('<pad>')
ST_TOKEN = bpe_tokenizer.token_to_id('<st>')
END_TOKEN = bpe_tokenizer.token_to_id('<end>')
FULL_STOP = bpe_tokenizer.token_to_id('.')
QUESTION_TOKEN = bpe_tokenizer.token_to_id('?')
EXCLAMATION = bpe_tokenizer.token_to_id('!')
special_ids = PAD_TOKEN, ST_TOKEN, END_TOKEN, FULL_STOP, QUESTION_TOKEN, EXCLAMATION
special_ids

(0, 1, 2, 15, 29, 4)

In [19]:
# Dataset
class SimpleDataset(Dataset):
    def __init__(self, data, special_ids, seq_len):
        super().__init__()
        self.token_ids = data
        self.seq_len = seq_len
        self.pad, self.st, self.end, self.full_stop, self.quest, self.excl= special_ids
        self.sentence_enders = {self.full_stop, self.quest, self.excl}
        
        self.inputs = []
        self.decoder_inputs = []
        self.decoder_targets = []
        
        
        self.create_sequences()
        
    def create_sequences(self):
        tokens = self.token_ids
        n = len(tokens)
        L = self.seq_len
        
        i = 0
        
        while i < n-1:
            enc_seq = []
            seq_end_idx = i 
            for j in range(L):
                current_idx = i + j
                if current_idx >= n:
                    break
                token = tokens[current_idx]
                
                enc_seq.append(token)
                seq_end_idx = current_idx
                
                if token in self.sentence_enders:
                    break
            
            
            if not enc_seq or len(enc_seq) < 2:
                i = seq_end_idx + 1 # Move to the next token
                continue

            dec_target_seq = enc_seq[1:] + [self.end]
            dec_input_seq = [self.st] + dec_target_seq[:-1]
                
            # Pad sequences if shorter
            enc_seq_len = len(enc_seq)
            enc_seq = enc_seq + [self.pad] * max(L - enc_seq_len, 0)
            dec_input_seq = dec_input_seq + [self.pad] * max(L - enc_seq_len, 0)
            dec_target_seq = dec_target_seq + [self.pad] * max(L - enc_seq_len, 0)

            self.inputs.append(enc_seq)
            self.decoder_inputs.append(dec_input_seq)
            self.decoder_targets.append(dec_target_seq)
            
            i = seq_end_idx + 1
    
    def __len__(self):
        return len(self.inputs)
    
    
    def __getitem__(self, idx):
        return (
            torch.tensor(self.inputs[idx], dtype=torch.long), # encoder input 50 timesteps
            torch.tensor(self.decoder_inputs[idx], dtype=torch.long), # decoder inputs shifter from target 50 steps
            torch.tensor(self.decoder_targets[idx], dtype=torch.long) # decoder targets 50 steps
        )

In [20]:
train_dataset = SimpleDataset(train_data, special_ids, SEQ_LEN)
val_dataset = SimpleDataset(val_data, special_ids, SEQ_LEN)

In [21]:
len(train_dataset), len(val_dataset)

(3667, 1225)

In [22]:
# Example
val_dataset[0]

(tensor([ 370, 2784,   13,  406,  876,   15,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]),
 tensor([   1, 2784,   13,  406,  876,   15,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,

In [23]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [24]:
%%writefile model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

# ENCODER
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.enc_hidden_dim = hidden_dim
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=dropout
        )
    
    def forward(self, x):
        # x: [batch, seq_len]
        emb = self.embedding(x)  # [batch, seq_len, emb_dim]
        outputs, (h, c) = self.lstm(emb)  # outputs: [batch, seq_len, 2*hidden]
        return outputs, (h, c)

# ATTENTION 
class Attention(nn.Module):
    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super().__init__()
        
        # We need to align the dimensions of encoder outputs and decoder hidden state
        # enc_hidden_dim is for one direction, but encoder is bidirectional (2*)
        self.attn = nn.Linear((enc_hidden_dim * 2) + dec_hidden_dim, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias = False)
        
    def forward(self, dec_hidden, enc_outputs):
        # dec_hidden: [batch, dec_hidden_dim] (from the *top layer* of decoder)
        # enc_outputs: [batch, src_len, enc_hidden_dim * 2]
        
        batch_size = enc_outputs.shape[0]
        src_len = enc_outputs.shape[1]
        
        # Repeat decoder hidden state src_len times to concatenate
        # dec_hidden: [batch, src_len, dec_hidden_dim]
        dec_hidden = dec_hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        # energy: [batch, src_len, (enc_hidden * 2) + dec_hidden]
        energy_input = torch.cat((dec_hidden, enc_outputs), dim = 2)
        
        # energy: [batch, src_len, dec_hidden_dim]
        energy = torch.tanh(self.attn(energy_input))
        
        # v(energy): [batch, src_len, 1] -> [batch, src_len]
        attention = self.v(energy).squeeze(2)
        
        # Return softmax'd weights
        return F.softmax(attention, dim=1)

# Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, enc_hidden_dim, dec_hidden_dim, dropout=0.1):
        super().__init__()
        self.dec_hidden_dim = dec_hidden_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        
        self.attention = Attention(enc_hidden_dim, dec_hidden_dim)
        
        self.lstm = nn.LSTM(
            input_size=emb_dim + (enc_hidden_dim * 2),
            hidden_size=dec_hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )
        
        self.fc = nn.Linear(dec_hidden_dim + (enc_hidden_dim * 2), vocab_size)
    
    def forward(self, dec_input, dec_hidden, enc_outputs):
        # dec_input: [batch] -> current token IDs
        # dec_hidden: (h, c) from previous step
        # enc_outputs: [batch, src_len, 2*enc_hidden_dim]
        
        # dec_input [batch] -> [batch, 1]
        dec_input = dec_input.unsqueeze(1)
        
        # emb: [batch, 1, emb_dim]
        emb = self.embedding(dec_input)
        
       
        a = self.attention(dec_hidden[0][-1], enc_outputs)
        
        a = a.unsqueeze(1)
        
        context = torch.bmm(a, enc_outputs)
        
        lstm_input = torch.cat([emb, context], dim=2)
        output, dec_hidden = self.lstm(lstm_input, dec_hidden)
        
        output = output.squeeze(1)
        context = context.squeeze(1)

        concat_output = torch.cat([output, context], dim=1)
        
        logits = self.fc(concat_output)
        
        return logits, dec_hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        self.enc_num_layers = self.encoder.lstm.num_layers
        self.enc_num_directions = 2 if self.encoder.lstm.bidirectional else 1
        self.enc_hidden_dim = self.encoder.enc_hidden_dim
        self.dec_hidden_dim = self.decoder.dec_hidden_dim
        
        self.fc_hidden = nn.Linear(self.enc_hidden_dim * self.enc_num_directions, self.dec_hidden_dim)
        self.fc_cell = nn.Linear(self.enc_hidden_dim * self.enc_num_directions, self.dec_hidden_dim)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.fc.out_features
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        enc_outputs, (h, c) = self.encoder(src)
        
        h = h.view(self.enc_num_layers, self.enc_num_directions, batch_size, self.enc_hidden_dim)
        c = c.view(self.enc_num_layers, self.enc_num_directions, batch_size, self.enc_hidden_dim)
        
        h_cat = torch.cat((h[:, 0, :, :], h[:, 1, :, :]), dim=2)
        c_cat = torch.cat((c[:, 0, :, :], c[:, 1, :, :]), dim=2)
        
        dec_hidden = (torch.tanh(self.fc_hidden(h_cat)), 
                      torch.tanh(self.fc_cell(c_cat)))
        
        dec_input = trg[:, 0]
        
        # Loop from 0, store at t
        for t in range(trg_len):
            
            # The decoder's forward pass now includes the attention mechanism
            logits, dec_hidden = self.decoder(dec_input, dec_hidden, enc_outputs)
            
            outputs[:, t] = logits
            
            use_teacher_force = random.random() < teacher_forcing_ratio
            
            if use_teacher_force:
                if t < trg_len - 1:
                    dec_input = trg[:, t+1]
                else:
                    break
            else:
                top1 = logits.argmax(1)
                dec_input = top1
            
        return outputs

Writing model.py


In [25]:
from model import Encoder, Decoder, Seq2Seq

In [26]:
encoder = Encoder(VOCAB_SIZE, EMB_DIM, ENC_HIDDEN_DIM, dropout=DROPOUT)
decoder = Decoder(VOCAB_SIZE, EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, dropout=DROPOUT)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

In [27]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5000, 256)
    (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(5000, 256)
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (lstm): LSTM(1280, 512, num_layers=2, batch_first=True, dropout=0.3)
    (fc): Linear(in_features=1536, out_features=5000, bias=True)
  )
  (fc_hidden): Linear(in_features=1024, out_features=512, bias=True)
  (fc_cell): Linear(in_features=1024, out_features=512, bias=True)
)

In [28]:
def load_ckpt(model):
    if os.path.exists(CHECKPOINT_DIR):
        models = [os.path.join(CHECKPOINT_DIR,ckpt) for ckpt in os.listdir(CHECKPOINT_DIR)]
        if len(models) > 0:
            sorted_models = sorted(models, key=lambda x: x.split("_val")[1].split('.pt')[0])
            model.load_state_dict(torch.load(sorted_models[0], map_location=DEVICE))
            print(f"Loaded checkpoint from {sorted_models[0]}")
            best_loss = float(os.path.basename(sorted_models[0]).split("_val")[1].split('.pt')[0])
            last_epoch = int(os.path.basename(sorted_models[0]).split("_val")[0].split('Epoch')[1])
            return best_loss, last_epoch
    

In [29]:
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)

In [30]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)

In [31]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for i, (src, dec_in, trg) in enumerate(iterator):
        
        src, dec_in, trg = src.to(DEVICE), dec_in.to(DEVICE), trg.to(DEVICE)
        
        optimizer.zero_grad()
        output = model(src, dec_in)
        
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        optimizer.step()
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [32]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, (src, dec_in, trg) in enumerate(iterator):
            src, dec_in, trg = src.to(DEVICE), dec_in.to(DEVICE), trg.to(DEVICE)
            output = model(src, dec_in)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [33]:
def generate_seq(model, texts, tokenizer, seq_len, special_ids, gen_len=100):
    model.eval()
    samples = []
    pad, st, end, full_stop, quest, excl= special_ids

    with torch.no_grad():
        for text in texts:
            # prepare encoder input
            text = normalize_text(text)
            ids = tokenizer.encode(text).ids
            if len(ids) < seq_len:
                ids = ids + [pad] * (seq_len - len(ids))
            else:
                print("Warning: More than seq length -- considering first 50 tokens:")
                ids = ids[:seq_len]

            src = torch.tensor([ids], dtype=torch.long, device=DEVICE)
            # print(src.shape)
            # print(src)

            # start decoder with only the start token (length = 1)
            dec_in = torch.tensor([[st]], dtype=torch.long, device=DEVICE)

            generated = []
            for step in range(gen_len):
                out = model(src, dec_in)          # [1, cur_dec_len, vocab_size]
                next_token = int(out[0, -1].argmax().cpu().item())  # last timestep prediction
                generated.append(next_token)
                
                # if next_token == end:
                #     break

                

                # append predicted token to decoder input for next step
                dec_in = torch.cat(
                    [dec_in, torch.tensor([[next_token]], dtype=torch.long, device=DEVICE)],
                    dim=1
                )
            # convert ids -> tokens
            # print("Out Tokens", generated)
            tokens = [tokenizer.id_to_token(tid) for tid in generated]
            samples.append((text, tokens))

    print("=== Sample generations ===")
    for idx, (inp, toks) in enumerate(samples, 1):
        print(f"[{idx}] INPUT : {inp}")
        print(f"OUTPUT: {' '.join(toks)}")
    print("======================================================")
    return samples

In [34]:
writer = SummaryWriter(LOG_DIR)

In [35]:
# text from internet
"""If I could have ceased what pendulums swung, or wheels turned, 
or water clocks emptied, then, in order to keep the Fates from marching in time,
I would have, for though it is what a boy naturally wishes when he fears change will 
come upon what he loves and take it away, a man remembers it, too, and in his heart 
wishes the same when all around him he feels only loss, loss that has been his 
companion for some time, and promises to remain at his side."""


"""
It is quite an interesting format, if you don't like the opening you are reading,
then you can skip and go to the next book opening. If the opening does capture your attention
you can click at the bottom to reveal the title and author. It reminds me of blind date books 
where bookstores will wrap a random book up in paper (to prevent the title and author from being seen)
and will write brief descriptions about it.


"""

"\nIt is quite an interesting format, if you don't like the opening you are reading,\nthen you can skip and go to the next book opening. If the opening does capture your attention\nyou can click at the bottom to reveal the title and author. It reminds me of blind date books \nwhere bookstores will wrap a random book up in paper (to prevent the title and author from being seen)\nand will write brief descriptions about it.\n\n\n"

In [36]:
CUSTOM_TEXTS = [
                "If I could have ceased",
                "It is quite an interesting format, if you"
        ]

In [37]:
def run_lm(predict_while_train = True, resume_ckpt = True):
    best_valid_loss = float('inf')
    last_epoch = 0
    if resume_ckpt:
        if os.listdir(CHECKPOINT_DIR):
            best_valid_loss, last_epoch = load_ckpt(model)
    
    for epoch in tqdm(range(N_EPOCHS), desc="Started Training"):
        epoch += last_epoch
        start_time = time.time()
        
        if predict_while_train:
            if epoch%5 == 0:
                generate_seq(model, CUSTOM_TEXTS, bpe_tokenizer, SEQ_LEN, special_ids, gen_len=50)

        train_loss = train(model, train_loader, optimizer, criterion)
        valid_loss = evaluate(model, val_loader, criterion)

        scheduler.step(valid_loss)
        
        end_time = time.time()
        
        epoch_mins = int((end_time - start_time) / 60)
        epoch_secs = int((end_time - start_time) - (epoch_mins * 60))

        # Save checkpoint if validation loss has improved
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, f'Epoch{epoch}_val{valid_loss}.pt'))
            print(f"Checkpoint saved: New best validation loss {best_valid_loss:.3f}")

        # TensorBoard logging
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Perplexity/train', math.exp(train_loss), epoch)
        writer.add_scalar('Loss/validation', valid_loss, epoch)
        writer.add_scalar('Perplexity/validation', math.exp(valid_loss), epoch)
        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [38]:
run_lm(predict_while_train = True, resume_ckpt = True)

Started Training:   0%|          | 0/200 [00:00<?, ?it/s]

=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra attra
[2] INPUT : it is quite an interesting format, if you
OUTPUT: attra attra attra attra attra attra attra because attra attra attra attra attra attra attra attra fa attra attra attra attra attra attra attra attra fa attra attra attra attra attra attra attra attra fa attra attra attra attra attra attra attra attra fa attra attra attra attra attra attra


Started Training:   0%|          | 1/200 [01:05<3:36:57, 65.41s/it]

Checkpoint saved: New best validation loss 6.211
Epoch: 01 | Time: 1m 5s
	Train Loss: 6.408 | Train PPL: 606.449
	 Val. Loss: 6.211 |  Val. PPL: 498.298


Started Training:   1%|          | 2/200 [02:08<3:30:27, 63.78s/it]

Checkpoint saved: New best validation loss 5.949
Epoch: 02 | Time: 1m 2s
	Train Loss: 5.862 | Train PPL: 351.543
	 Val. Loss: 5.949 |  Val. PPL: 383.467


Started Training:   2%|▏         | 3/200 [03:10<3:27:42, 63.26s/it]

Checkpoint saved: New best validation loss 5.653
Epoch: 03 | Time: 1m 2s
	Train Loss: 5.515 | Train PPL: 248.295
	 Val. Loss: 5.653 |  Val. PPL: 285.013


Started Training:   2%|▏         | 4/200 [04:13<3:25:49, 63.01s/it]

Checkpoint saved: New best validation loss 5.253
Epoch: 04 | Time: 1m 2s
	Train Loss: 5.072 | Train PPL: 159.571
	 Val. Loss: 5.253 |  Val. PPL: 191.178


Started Training:   2%|▎         | 5/200 [05:15<3:24:18, 62.86s/it]

Checkpoint saved: New best validation loss 4.812
Epoch: 05 | Time: 1m 2s
	Train Loss: 4.469 | Train PPL:  87.245
	 Val. Loss: 4.812 |  Val. PPL: 122.962


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have have l ! <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: <end> by an rou t , if you must <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:   3%|▎         | 6/200 [06:20<3:25:23, 63.52s/it]

Checkpoint saved: New best validation loss 4.427
Epoch: 06 | Time: 1m 4s
	Train Loss: 3.940 | Train PPL:  51.441
	 Val. Loss: 4.427 |  Val. PPL:  83.692


Started Training:   4%|▎         | 7/200 [07:23<3:23:17, 63.20s/it]

Checkpoint saved: New best validation loss 4.175
Epoch: 07 | Time: 1m 2s
	Train Loss: 3.352 | Train PPL:  28.571
	 Val. Loss: 4.175 |  Val. PPL:  65.020


Started Training:   4%|▍         | 8/200 [08:25<3:21:36, 63.00s/it]

Checkpoint saved: New best validation loss 3.857
Epoch: 08 | Time: 1m 2s
	Train Loss: 2.913 | Train PPL:  18.414
	 Val. Loss: 3.857 |  Val. PPL:  47.312


Started Training:   4%|▍         | 9/200 [09:28<3:20:08, 62.87s/it]

Checkpoint saved: New best validation loss 3.710
Epoch: 09 | Time: 1m 2s
	Train Loss: 2.520 | Train PPL:  12.423
	 Val. Loss: 3.710 |  Val. PPL:  40.859


Started Training:   5%|▌         | 10/200 [10:30<3:18:48, 62.78s/it]

Checkpoint saved: New best validation loss 3.634
Epoch: 10 | Time: 1m 2s
	Train Loss: 2.251 | Train PPL:   9.494
	 Val. Loss: 3.634 |  Val. PPL:  37.877


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: <end> could have provo <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: <end> quite an amiable niece , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:   6%|▌         | 11/200 [11:35<3:19:41, 63.39s/it]

Checkpoint saved: New best validation loss 3.519
Epoch: 11 | Time: 1m 4s
	Train Loss: 2.043 | Train PPL:   7.717
	 Val. Loss: 3.519 |  Val. PPL:  33.753


Started Training:   6%|▌         | 12/200 [12:38<3:17:49, 63.13s/it]

Checkpoint saved: New best validation loss 3.243
Epoch: 12 | Time: 1m 2s
	Train Loss: 1.792 | Train PPL:   6.002
	 Val. Loss: 3.243 |  Val. PPL:  25.601


Started Training:   6%|▋         | 13/200 [13:40<3:16:03, 62.90s/it]

Epoch: 13 | Time: 1m 2s
	Train Loss: 1.595 | Train PPL:   4.930
	 Val. Loss: 3.784 |  Val. PPL:  44.002


Started Training:   7%|▋         | 14/200 [14:43<3:14:44, 62.82s/it]

Checkpoint saved: New best validation loss 3.006
Epoch: 14 | Time: 1m 2s
	Train Loss: 1.422 | Train PPL:   4.146
	 Val. Loss: 3.006 |  Val. PPL:  20.203


Started Training:   8%|▊         | 15/200 [15:45<3:13:15, 62.68s/it]

Epoch: 15 | Time: 1m 2s
	Train Loss: 1.300 | Train PPL:   3.668
	 Val. Loss: 3.270 |  Val. PPL:  26.299


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: stretch could have died <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: <end> quite an greatest , , you you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:   8%|▊         | 16/200 [16:50<3:14:07, 63.30s/it]

Checkpoint saved: New best validation loss 2.981
Epoch: 16 | Time: 1m 4s
	Train Loss: 1.168 | Train PPL:   3.217
	 Val. Loss: 2.981 |  Val. PPL:  19.716


Started Training:   8%|▊         | 17/200 [17:53<3:12:29, 63.11s/it]

Checkpoint saved: New best validation loss 2.835
Epoch: 17 | Time: 1m 2s
	Train Loss: 1.059 | Train PPL:   2.885
	 Val. Loss: 2.835 |  Val. PPL:  17.032


Started Training:   9%|▉         | 18/200 [18:55<3:10:53, 62.93s/it]

Checkpoint saved: New best validation loss 2.740
Epoch: 18 | Time: 1m 2s
	Train Loss: 0.883 | Train PPL:   2.419
	 Val. Loss: 2.740 |  Val. PPL:  15.480


Started Training:  10%|▉         | 19/200 [19:58<3:09:31, 62.83s/it]

Checkpoint saved: New best validation loss 2.688
Epoch: 19 | Time: 1m 2s
	Train Loss: 0.812 | Train PPL:   2.251
	 Val. Loss: 2.688 |  Val. PPL:  14.706


Started Training:  10%|█         | 20/200 [21:00<3:08:11, 62.73s/it]

Checkpoint saved: New best validation loss 2.688
Epoch: 20 | Time: 1m 2s
	Train Loss: 0.814 | Train PPL:   2.256
	 Val. Loss: 2.688 |  Val. PPL:  14.702


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: afforded could have cast your guard <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: afforded quite an rou , , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  10%|█         | 21/200 [22:05<3:08:46, 63.28s/it]

Epoch: 21 | Time: 1m 4s
	Train Loss: 0.725 | Train PPL:   2.065
	 Val. Loss: 2.715 |  Val. PPL:  15.101


Started Training:  11%|█         | 22/200 [23:07<3:07:03, 63.05s/it]

Checkpoint saved: New best validation loss 2.543
Epoch: 22 | Time: 1m 2s
	Train Loss: 0.677 | Train PPL:   1.969
	 Val. Loss: 2.543 |  Val. PPL:  12.719


Started Training:  12%|█▏        | 23/200 [24:10<3:05:27, 62.87s/it]

Epoch: 23 | Time: 1m 2s
	Train Loss: 0.575 | Train PPL:   1.778
	 Val. Loss: 2.592 |  Val. PPL:  13.353


Started Training:  12%|█▏        | 24/200 [25:12<3:04:01, 62.74s/it]

Epoch: 24 | Time: 1m 2s
	Train Loss: 0.578 | Train PPL:   1.783
	 Val. Loss: 2.611 |  Val. PPL:  13.615


Started Training:  12%|█▎        | 25/200 [26:15<3:02:44, 62.65s/it]

Epoch: 25 | Time: 1m 2s
	Train Loss: 0.502 | Train PPL:   1.652
	 Val. Loss: 2.581 |  Val. PPL:  13.216


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: overlooked could have liberal <end> <end> <end> philosophy <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: overlooked quite an interesting meanwhile , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  13%|█▎        | 26/200 [27:19<3:03:33, 63.29s/it]

Checkpoint saved: New best validation loss 2.518
Epoch: 26 | Time: 1m 4s
	Train Loss: 0.548 | Train PPL:   1.729
	 Val. Loss: 2.518 |  Val. PPL:  12.406


Started Training:  14%|█▎        | 27/200 [28:22<3:01:44, 63.03s/it]

Epoch: 27 | Time: 1m 2s
	Train Loss: 0.456 | Train PPL:   1.577
	 Val. Loss: 2.584 |  Val. PPL:  13.249


Started Training:  14%|█▍        | 28/200 [29:24<3:00:08, 62.84s/it]

Epoch: 28 | Time: 1m 2s
	Train Loss: 0.439 | Train PPL:   1.552
	 Val. Loss: 2.709 |  Val. PPL:  15.018


Started Training:  14%|█▍        | 29/200 [30:27<2:58:49, 62.74s/it]

Checkpoint saved: New best validation loss 2.443
Epoch: 29 | Time: 1m 2s
	Train Loss: 0.508 | Train PPL:   1.662
	 Val. Loss: 2.443 |  Val. PPL:  11.506


Started Training:  15%|█▌        | 30/200 [31:29<2:57:27, 62.63s/it]

Epoch: 30 | Time: 1m 2s
	Train Loss: 0.421 | Train PPL:   1.523
	 Val. Loss: 2.484 |  Val. PPL:  11.990


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: lord could have wood nove <end> journey journey <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: lord quite an rou ously , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  16%|█▌        | 31/200 [32:34<2:58:14, 63.28s/it]

Checkpoint saved: New best validation loss 2.422
Epoch: 31 | Time: 1m 4s
	Train Loss: 0.433 | Train PPL:   1.542
	 Val. Loss: 2.422 |  Val. PPL:  11.268


Started Training:  16%|█▌        | 32/200 [33:36<2:56:27, 63.02s/it]

Epoch: 32 | Time: 1m 2s
	Train Loss: 0.414 | Train PPL:   1.513
	 Val. Loss: 2.446 |  Val. PPL:  11.537


Started Training:  16%|█▋        | 33/200 [34:39<2:54:53, 62.84s/it]

Epoch: 33 | Time: 1m 2s
	Train Loss: 0.341 | Train PPL:   1.406
	 Val. Loss: 2.486 |  Val. PPL:  12.017


Started Training:  17%|█▋        | 34/200 [35:41<2:53:34, 62.74s/it]

Checkpoint saved: New best validation loss 2.417
Epoch: 34 | Time: 1m 2s
	Train Loss: 0.355 | Train PPL:   1.426
	 Val. Loss: 2.417 |  Val. PPL:  11.208


Started Training:  18%|█▊        | 35/200 [36:44<2:52:17, 62.65s/it]

Epoch: 35 | Time: 1m 2s
	Train Loss: 0.321 | Train PPL:   1.379
	 Val. Loss: 2.661 |  Val. PPL:  14.315


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: missed could have mary <end> <end> <end> shaken <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: missed quite an strange concern , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  18%|█▊        | 36/200 [37:48<2:52:52, 63.24s/it]

Epoch: 36 | Time: 1m 4s
	Train Loss: 0.386 | Train PPL:   1.471
	 Val. Loss: 2.442 |  Val. PPL:  11.502


Started Training:  18%|█▊        | 37/200 [38:51<2:51:14, 63.04s/it]

Checkpoint saved: New best validation loss 2.376
Epoch: 37 | Time: 1m 2s
	Train Loss: 0.339 | Train PPL:   1.404
	 Val. Loss: 2.376 |  Val. PPL:  10.757


Started Training:  19%|█▉        | 38/200 [39:53<2:49:47, 62.88s/it]

Checkpoint saved: New best validation loss 2.369
Epoch: 38 | Time: 1m 2s
	Train Loss: 0.299 | Train PPL:   1.348
	 Val. Loss: 2.369 |  Val. PPL:  10.690


Started Training:  20%|█▉        | 39/200 [40:56<2:48:22, 62.75s/it]

Epoch: 39 | Time: 1m 2s
	Train Loss: 0.317 | Train PPL:   1.373
	 Val. Loss: 2.408 |  Val. PPL:  11.107


Started Training:  20%|██        | 40/200 [41:58<2:47:00, 62.63s/it]

Epoch: 40 | Time: 1m 2s
	Train Loss: 0.316 | Train PPL:   1.372
	 Val. Loss: 2.485 |  Val. PPL:  12.002


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: tranqu could have shut <end> <end> <end> <end> ption <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: tranqu quite an interesting niece , if you <end> <end> <end> <end> <end> <end> <end> <end> outh <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  20%|██        | 41/200 [43:03<2:47:30, 63.21s/it]

Epoch: 41 | Time: 1m 4s
	Train Loss: 0.312 | Train PPL:   1.366
	 Val. Loss: 2.411 |  Val. PPL:  11.142


Started Training:  21%|██        | 42/200 [44:05<2:45:55, 63.01s/it]

Checkpoint saved: New best validation loss 2.366
Epoch: 42 | Time: 1m 2s
	Train Loss: 0.320 | Train PPL:   1.377
	 Val. Loss: 2.366 |  Val. PPL:  10.652


Started Training:  22%|██▏       | 43/200 [45:08<2:44:26, 62.84s/it]

Epoch: 43 | Time: 1m 2s
	Train Loss: 0.361 | Train PPL:   1.435
	 Val. Loss: 2.586 |  Val. PPL:  13.273


Started Training:  22%|██▏       | 44/200 [46:10<2:43:04, 62.72s/it]

Epoch: 44 | Time: 1m 2s
	Train Loss: 0.382 | Train PPL:   1.465
	 Val. Loss: 2.420 |  Val. PPL:  11.241


Started Training:  22%|██▎       | 45/200 [47:13<2:41:46, 62.62s/it]

Epoch: 45 | Time: 1m 2s
	Train Loss: 0.352 | Train PPL:   1.422
	 Val. Loss: 2.420 |  Val. PPL:  11.241


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: bu could have observed ased <end> <end> <end> bu if if <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: bu quite an griev ager , if you <end> <end> <end> <end> bu omy amusing <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  23%|██▎       | 46/200 [48:17<2:42:15, 63.22s/it]

Epoch: 46 | Time: 1m 4s
	Train Loss: 0.375 | Train PPL:   1.455
	 Val. Loss: 2.486 |  Val. PPL:  12.018


Started Training:  24%|██▎       | 47/200 [49:20<2:40:35, 62.98s/it]

Epoch: 47 | Time: 1m 2s
	Train Loss: 0.340 | Train PPL:   1.405
	 Val. Loss: 2.478 |  Val. PPL:  11.918


Started Training:  24%|██▍       | 48/200 [50:22<2:39:06, 62.81s/it]

Epoch: 48 | Time: 1m 2s
	Train Loss: 0.342 | Train PPL:   1.408
	 Val. Loss: 2.464 |  Val. PPL:  11.754


Started Training:  24%|██▍       | 49/200 [51:25<2:38:03, 62.80s/it]

Checkpoint saved: New best validation loss 2.266
Epoch: 49 | Time: 1m 2s
	Train Loss: 0.229 | Train PPL:   1.257
	 Val. Loss: 2.266 |  Val. PPL:   9.639


Started Training:  25%|██▌       | 50/200 [52:27<2:36:50, 62.73s/it]

Checkpoint saved: New best validation loss 2.249
Epoch: 50 | Time: 1m 2s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 2.249 |  Val. PPL:   9.482


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: collected could have keep <end> <end> <end> <end> sex ? <end> dependence <end> dependence feels <end> omy six son <end> <end> omy six son <end> <end> <end> omy six son <end> <end> <end> omy six <end> <end> omy <end> <end> omy <end> <end> omy <end> <end> omy six <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: collected quite an interesting connection , if you <end> <end> <end> <end> feels <end> feels . <end> <end> <end> <end> <end> omy six son . <end> <end> <end> <end> omy six son . <end> <end> <end> <end> omy six <end> <end> omy <end> <end> omy <end> <end> <end> omy <end>


Started Training:  26%|██▌       | 51/200 [53:32<2:37:21, 63.36s/it]

Checkpoint saved: New best validation loss 2.211
Epoch: 51 | Time: 1m 4s
	Train Loss: 0.119 | Train PPL:   1.126
	 Val. Loss: 2.211 |  Val. PPL:   9.126


Started Training:  26%|██▌       | 52/200 [54:35<2:35:39, 63.10s/it]

Epoch: 52 | Time: 1m 2s
	Train Loss: 0.102 | Train PPL:   1.107
	 Val. Loss: 2.222 |  Val. PPL:   9.226


Started Training:  26%|██▋       | 53/200 [55:37<2:34:08, 62.92s/it]

Epoch: 53 | Time: 1m 2s
	Train Loss: 0.096 | Train PPL:   1.101
	 Val. Loss: 2.226 |  Val. PPL:   9.261


Started Training:  27%|██▋       | 54/200 [56:40<2:32:52, 62.83s/it]

Checkpoint saved: New best validation loss 2.169
Epoch: 54 | Time: 1m 2s
	Train Loss: 0.075 | Train PPL:   1.078
	 Val. Loss: 2.169 |  Val. PPL:   8.748


Started Training:  28%|██▊       | 55/200 [57:42<2:31:43, 62.78s/it]

Checkpoint saved: New best validation loss 2.121
Epoch: 55 | Time: 1m 2s
	Train Loss: 0.080 | Train PPL:   1.083
	 Val. Loss: 2.121 |  Val. PPL:   8.341


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: bu could have abilities <end> <end> <end> <end> <end> dependence . <end> <end> omy six son . <end> <end> omy six omy ery <end> <end> <end> omy <end> <end> omy <end> <end> omy <end> <end> omy <end> <end> omy <end> <end> omy <end> <end> omy <end> <end> omy <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: bu quite an rou tone , if you <end> <end> <end> <end> <end> <end> <end> <end> dependence . <end> <end> omy . <end> <end> omy <end> <end> omy <end> <end> omy <end> <end> <end> omy <end> <end> <end> omy <end> <end> <end> omy <end> <end> <end> omy <end> <end> omy


Started Training:  28%|██▊       | 56/200 [58:47<2:32:04, 63.37s/it]

Epoch: 56 | Time: 1m 4s
	Train Loss: 0.063 | Train PPL:   1.065
	 Val. Loss: 2.199 |  Val. PPL:   9.020


Started Training:  28%|██▊       | 57/200 [59:50<2:30:20, 63.08s/it]

Epoch: 57 | Time: 1m 2s
	Train Loss: 0.064 | Train PPL:   1.066
	 Val. Loss: 2.228 |  Val. PPL:   9.281


Started Training:  29%|██▉       | 58/200 [1:00:52<2:28:55, 62.93s/it]

Checkpoint saved: New best validation loss 2.121
Epoch: 58 | Time: 1m 2s
	Train Loss: 0.059 | Train PPL:   1.061
	 Val. Loss: 2.121 |  Val. PPL:   8.338


Started Training:  30%|██▉       | 59/200 [1:01:55<2:27:31, 62.78s/it]

Epoch: 59 | Time: 1m 2s
	Train Loss: 0.062 | Train PPL:   1.064
	 Val. Loss: 2.145 |  Val. PPL:   8.538


Started Training:  30%|███       | 60/200 [1:02:57<2:26:14, 62.68s/it]

Epoch: 60 | Time: 1m 2s
	Train Loss: 0.052 | Train PPL:   1.054
	 Val. Loss: 2.134 |  Val. PPL:   8.446


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: collected could have ? <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: bu quite an rou ug , if you <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  30%|███       | 61/200 [1:04:02<2:26:39, 63.30s/it]

Epoch: 61 | Time: 1m 4s
	Train Loss: 0.060 | Train PPL:   1.062
	 Val. Loss: 2.159 |  Val. PPL:   8.663


Started Training:  31%|███       | 62/200 [1:05:04<2:25:00, 63.05s/it]

Epoch: 62 | Time: 1m 2s
	Train Loss: 0.051 | Train PPL:   1.052
	 Val. Loss: 2.136 |  Val. PPL:   8.462


Started Training:  32%|███▏      | 63/200 [1:06:07<2:23:33, 62.87s/it]

Epoch: 63 | Time: 1m 2s
	Train Loss: 0.051 | Train PPL:   1.052
	 Val. Loss: 2.126 |  Val. PPL:   8.383


Started Training:  32%|███▏      | 64/200 [1:07:09<2:22:12, 62.74s/it]

Epoch: 64 | Time: 1m 2s
	Train Loss: 0.051 | Train PPL:   1.052
	 Val. Loss: 2.177 |  Val. PPL:   8.818


Started Training:  32%|███▎      | 65/200 [1:08:12<2:20:58, 62.65s/it]

Epoch: 65 | Time: 1m 2s
	Train Loss: 0.049 | Train PPL:   1.050
	 Val. Loss: 2.133 |  Val. PPL:   8.439


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: collected could have discovered <end> perfect nature ached acutely :// ached acutely . <end> <end> <end> <end> <end> omy . <end> <end> omy . <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: collected quite an interesting laughing , if you need <end> <end> <end> acutely & omy . <end> <end> <end> <end> <end> <end> <end> <end> <end> omy . <end> <end> <end> <end> <end> <end> <end> <end> <end> omy . <end> <end> <end> <end> <end> omy . <end> <end> <end> <end> <end>


Started Training:  33%|███▎      | 66/200 [1:09:16<2:21:23, 63.31s/it]

Checkpoint saved: New best validation loss 2.100
Epoch: 66 | Time: 1m 4s
	Train Loss: 0.045 | Train PPL:   1.046
	 Val. Loss: 2.100 |  Val. PPL:   8.168


Started Training:  34%|███▎      | 67/200 [1:10:19<2:19:44, 63.04s/it]

Epoch: 67 | Time: 1m 2s
	Train Loss: 0.033 | Train PPL:   1.034
	 Val. Loss: 2.132 |  Val. PPL:   8.429


Started Training:  34%|███▍      | 68/200 [1:11:21<2:18:20, 62.89s/it]

Epoch: 68 | Time: 1m 2s
	Train Loss: 0.031 | Train PPL:   1.031
	 Val. Loss: 2.104 |  Val. PPL:   8.203


Started Training:  34%|███▍      | 69/200 [1:12:24<2:17:03, 62.78s/it]

Epoch: 69 | Time: 1m 2s
	Train Loss: 0.034 | Train PPL:   1.035
	 Val. Loss: 2.102 |  Val. PPL:   8.184


Started Training:  35%|███▌      | 70/200 [1:13:26<2:15:50, 62.69s/it]

Epoch: 70 | Time: 1m 2s
	Train Loss: 0.030 | Train PPL:   1.031
	 Val. Loss: 2.159 |  Val. PPL:   8.665


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: collected could have discovered <end> perfect nature ached acutely :// ached acutely :// acutely . <end> <end> <end> omy :// <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: collected quite an interesting laughing , if you need <end> <end> <end> acutely . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> omy . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> omy . <end> <end> <end> <end> <end> <end> omy . <end> <end> <end>


Started Training:  36%|███▌      | 71/200 [1:14:31<2:16:10, 63.34s/it]

Checkpoint saved: New best validation loss 2.097
Epoch: 71 | Time: 1m 4s
	Train Loss: 0.033 | Train PPL:   1.034
	 Val. Loss: 2.097 |  Val. PPL:   8.138


Started Training:  36%|███▌      | 72/200 [1:15:34<2:14:32, 63.07s/it]

Epoch: 72 | Time: 1m 2s
	Train Loss: 0.028 | Train PPL:   1.028
	 Val. Loss: 2.120 |  Val. PPL:   8.335


Started Training:  36%|███▋      | 73/200 [1:16:36<2:13:09, 62.91s/it]

Epoch: 73 | Time: 1m 2s
	Train Loss: 0.029 | Train PPL:   1.029
	 Val. Loss: 2.132 |  Val. PPL:   8.428


Started Training:  37%|███▋      | 74/200 [1:17:39<2:11:51, 62.79s/it]

Epoch: 74 | Time: 1m 2s
	Train Loss: 0.029 | Train PPL:   1.029
	 Val. Loss: 2.182 |  Val. PPL:   8.860


Started Training:  38%|███▊      | 75/200 [1:18:41<2:10:37, 62.70s/it]

Epoch: 75 | Time: 1m 2s
	Train Loss: 0.023 | Train PPL:   1.023
	 Val. Loss: 2.098 |  Val. PPL:   8.146


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: collected could have discovered <end> <end> <end> <end> acutely ached :// feels . <end> acutely . <end> <end> omy :// <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: collected quite an interesting laughing , if you need <end> <end> <end> acutely . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> omy . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> omy . <end> <end> <end> <end> <end> <end> omy . <end>


Started Training:  38%|███▊      | 76/200 [1:19:46<2:10:49, 63.30s/it]

Epoch: 76 | Time: 1m 4s
	Train Loss: 0.025 | Train PPL:   1.025
	 Val. Loss: 2.145 |  Val. PPL:   8.542


Started Training:  38%|███▊      | 77/200 [1:20:48<2:09:13, 63.04s/it]

Epoch: 77 | Time: 1m 2s
	Train Loss: 0.023 | Train PPL:   1.023
	 Val. Loss: 2.104 |  Val. PPL:   8.196


Started Training:  39%|███▉      | 78/200 [1:21:51<2:07:52, 62.89s/it]

Epoch: 78 | Time: 1m 2s
	Train Loss: 0.027 | Train PPL:   1.028
	 Val. Loss: 2.125 |  Val. PPL:   8.374


Started Training:  40%|███▉      | 79/200 [1:22:53<2:06:35, 62.77s/it]

Epoch: 79 | Time: 1m 2s
	Train Loss: 0.032 | Train PPL:   1.033
	 Val. Loss: 2.108 |  Val. PPL:   8.228


Started Training:  40%|████      | 80/200 [1:23:56<2:05:31, 62.76s/it]

Checkpoint saved: New best validation loss 2.074
Epoch: 80 | Time: 1m 2s
	Train Loss: 0.027 | Train PPL:   1.027
	 Val. Loss: 2.074 |  Val. PPL:   7.960


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: collected could have discovered <end> <end> <end> <end> acutely ached :// feels . <end> <end> omy :// <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> omy .
[2] INPUT : it is quite an interesting format, if you
OUTPUT: collected quite an interesting delightful , if you need <end> <end> <end> acutely . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> omy . <end> <end> <end> <end> <end> <end> <end> <end>


Started Training:  40%|████      | 81/200 [1:25:01<2:05:42, 63.38s/it]

Epoch: 81 | Time: 1m 4s
	Train Loss: 0.031 | Train PPL:   1.031
	 Val. Loss: 2.146 |  Val. PPL:   8.551


Started Training:  41%|████      | 82/200 [1:26:04<2:04:11, 63.15s/it]

Epoch: 82 | Time: 1m 2s
	Train Loss: 0.028 | Train PPL:   1.028
	 Val. Loss: 2.099 |  Val. PPL:   8.158


Started Training:  42%|████▏     | 83/200 [1:27:06<2:02:49, 62.98s/it]

Epoch: 83 | Time: 1m 2s
	Train Loss: 0.037 | Train PPL:   1.038
	 Val. Loss: 2.159 |  Val. PPL:   8.658


Started Training:  42%|████▏     | 84/200 [1:28:09<2:01:31, 62.86s/it]

Epoch: 84 | Time: 1m 2s
	Train Loss: 0.029 | Train PPL:   1.030
	 Val. Loss: 2.136 |  Val. PPL:   8.469


Started Training:  42%|████▎     | 85/200 [1:29:11<2:00:18, 62.77s/it]

Epoch: 85 | Time: 1m 2s
	Train Loss: 0.025 | Train PPL:   1.026
	 Val. Loss: 2.149 |  Val. PPL:   8.575


=== Sample generations ===
[1] INPUT : if i could have ceased
OUTPUT: collected could have discovered <end> <end> <end> <end> acutely ached :// feels . <end> <end> omy :// <end> omy . <end> <end> <end> omy . <end> <end> <end> omy . <end> <end> <end> <end> omy . <end> <end> <end> <end> omy . <end> <end> <end> <end> omy . <end> <end>
[2] INPUT : it is quite an interesting format, if you
OUTPUT: collected quite an interesting delightful , if you need <end> <end> <end> acutely . <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> omy . <end> <end> <end> <end> <end> <end> <end>


Started Training:  43%|████▎     | 86/200 [1:30:16<2:00:27, 63.40s/it]

In [None]:
generate_seq(model, ["Hi, introduce yourself"], bpe_tokenizer, SEQ_LEN, special_ids, gen_len=100)