In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset

import math
from pathlib import Path
import copy

In [21]:
import torchtext.datasets as datasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import LambdaLR

import warnings
from tqdm import tqdm
import os
from pathlib import Path

# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

import torchmetrics
from torch.utils.tensorboard import SummaryWriter

Matplotlib is building the font cache; this may take a moment.


# Creating Model Architecture

In [2]:
class InputEmbeddings(nn.Module):
    """Token-id to vector lookup whose output is scaled by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        # One d_model-sized row per vocabulary id.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Look up the embeddings for the ids in `x` and multiply them by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return scale * looked_up

In [3]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    A table of shape (1, seq, d_model) is precomputed once and stored as the
    non-trainable buffer ``pe``; even columns hold sines and odd columns hold
    cosines of ``position * div_term``.

    Args:
        d_model: size of each embedding vector (odd values are supported).
        seq: number of positions to precompute.
        dropout: dropout probability applied after the table is added.
    """

    def __init__(self, d_model: int, seq: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq = seq
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq, d_model)
        position = torch.arange(0, seq, dtype=torch.float).unsqueeze(1) # (seq, 1)

        # exp(-2i * ln(10000) / d_model) for every even column index 2i -> (ceil(d_model / 2),)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term # (seq, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd columns; for an odd d_model that is one
        # fewer than the even columns, so the surplus angle column is dropped.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0) # (1, seq, d_model)
        # A buffer follows the module across devices and is saved in the
        # state dict, but is not a parameter and receives no gradient.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.shape[1]`` rows of the table to `x`, then apply dropout."""
        x = x + self.pe[:, :x.shape[1], :] # broadcast over the batch dimension
        return self.dropout(x)

In [5]:
class LayerNormalization(nn.Module):
    """Normalizes the last dimension, then applies a learned scale and shift."""

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        # Learned per-feature scale (starts at 1) and shift (starts at 0).
        self.alpha = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Return alpha * (x - mean) / (std + eps) + bias, with statistics over the last dim."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # eps is added to the standard deviation itself to avoid division by zero.
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

In [6]:
class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between."""

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff) # expand to d_ff
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model) # project back to d_model

    def forward(self, x):
        """Map (..., d_model) -> (..., d_ff) -> (..., d_model)."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

In [7]:
class MultiHeadAttentionBlock(nn.Module):
    """Scaled dot-product attention computed over `h` heads of size d_model // h."""

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model # width of each embedding vector
        self.h = h # head count
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h  # per-head width
        # Bias-free projections for queries, keys, values and the merged output.
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Return (weighted values, attention weights) for already-split heads.

        Positions where `mask == 0` get a score of -1e9 before the softmax;
        `mask` and `dropout` may each be None to skip that step.
        """
        head_dim = query.shape[-1]
        # (batch, h, seq, d_k) @ (batch, h, d_k, seq) -> (batch, h, seq, seq)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(head_dim)
        if mask is not None:
            # A very large negative score stands in for -inf.
            scores.masked_fill_(mask == 0, -1e9)
        weights = torch.softmax(scores, dim=-1) # (batch, h, seq, seq)
        if dropout is not None:
            weights = dropout(weights)
        # (batch, h, seq, seq) @ (batch, h, seq, d_k) -> (batch, h, seq, d_k)
        return torch.matmul(weights, value), weights

    def _split_heads(self, projected):
        """(batch, seq, d_model) -> (batch, seq, h, d_k) -> (batch, h, seq, d_k)."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Project q/k/v, attend per head, merge the heads and apply the output projection.

        The attention weights of the latest call are kept on `self.attention_scores`.
        """
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        per_head, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h, d_k) -> (batch, seq, d_model)
        batch = per_head.shape[0]
        merged = per_head.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)

        return self.w_o(merged)

In [8]:
class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: x + dropout(sublayer(norm(x)))."""

    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        """Normalize `x`, run `sublayer` on it, apply dropout and add the result back to `x`."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [9]:
class EncoderBlock(nn.Module):
    """Self-attention followed by a feed-forward block, each inside a residual connection."""

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps self-attention, index 1 wraps the feed-forward block.
        attention_residual = ResidualConnection(features, dropout)
        feed_forward_residual = ResidualConnection(features, dropout)
        self.residual_connections = nn.ModuleList([attention_residual, feed_forward_residual])

    def forward(self, x, src_mask):
        """Run masked self-attention and then the feed-forward block over `x`."""
        attention_residual, feed_forward_residual = self.residual_connections

        def self_attend(normed):
            # Queries, keys and values all come from the same normalized input.
            return self.self_attention_block(normed, normed, normed, src_mask)

        attended = attention_residual(x, self_attend)
        return feed_forward_residual(attended, self.feed_forward_block)

In [10]:
class Encoder(nn.Module):
    """Applies a stack of encoder layers in order, then a final layer normalization."""

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        """Feed `x` through every layer with the same `mask` and normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        normalized = self.norm(hidden)
        return normalized

In [11]:
class DecoderBlock(nn.Module):
    """Self-attention, cross-attention over the encoder output, then feed-forward.

    Each of the three sublayers runs inside its own residual connection.
    """

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        # Cross-attention takes its keys and values from the encoder output.
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        wrappers = []
        for _ in range(3):
            wrappers.append(ResidualConnection(features, dropout))
        self.residual_connections = nn.ModuleList(wrappers)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the three sublayers in order over the decoder input `x`."""
        self_residual, cross_residual, feed_forward_residual = self.residual_connections

        def masked_self_attention(normed):
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def attend_to_encoder(normed):
            # Queries come from the decoder; keys and values from the encoder output.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        hidden = self_residual(x, masked_self_attention)
        hidden = cross_residual(hidden, attend_to_encoder)
        return feed_forward_residual(hidden, self.feed_forward_block)

In [12]:
class Decoder(nn.Module):
    """Applies a stack of decoder layers in order, then a final layer normalization."""

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed `x` through every layer with the same encoder output and masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        normalized = self.norm(hidden)
        return normalized

In [13]:
class ProjectionLayer(nn.Module):
    """Linear map from model features to one raw score per vocabulary entry.

    Args:
        d_model: size of the incoming feature vectors.
        vocab_size: number of output scores per position.
    """

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        """Return the projected scores (no softmax is applied here)."""
        # (batch, seq, d_model) --> (batch, seq, vocab_size)
        return self.proj(x)

In [14]:
class Transformer(nn.Module):
    """Bundles the embeddings, positional encodings, encoder, decoder and output projection.

    The three stages are exposed as separate methods (`encode`, `decode`,
    `project`) rather than a single `forward`.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        # Registration order is kept as-is: it fixes the order of parameters().
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed `src`, add positional encoding and run the encoder."""
        embedded = self.src_pos(self.src_embed(src)) # (batch, seq, d_model)
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        """Embed `tgt`, add positional encoding and run the decoder against `encoder_output`."""
        embedded = self.tgt_pos(self.tgt_embed(tgt)) # (batch, seq, d_model)
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the output projection to `x`."""
        vocab_scores = self.projection_layer(x) # (batch, seq, vocab_size)
        return vocab_scores

In [15]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq: int, tgt_seq: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    """Assemble a Transformer with N encoder and N decoder blocks.

    Every parameter with more than one dimension is re-initialized with
    Xavier-uniform before the model is returned.
    """

    def make_encoder_block():
        attention = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward = FeedForwardBlock(d_model, d_ff, dropout)
        return EncoderBlock(d_model, attention, feed_forward, dropout)

    def make_decoder_block():
        self_attention = MultiHeadAttentionBlock(d_model, h, dropout)
        cross_attention = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward = FeedForwardBlock(d_model, d_ff, dropout)
        return DecoderBlock(d_model, self_attention, cross_attention, feed_forward, dropout)

    # Token embeddings, then positional encodings, for source and target.
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq, dropout)

    # All encoder blocks are created before any decoder block.
    encoder_stack = [make_encoder_block() for _ in range(N)]
    decoder_stack = [make_decoder_block() for _ in range(N)]

    encoder = Encoder(d_model, nn.ModuleList(encoder_stack))
    decoder = Decoder(d_model, nn.ModuleList(decoder_stack))

    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Only matrices are re-initialized; 1-D parameters keep their construction values.
    for weight in transformer.parameters():
        if weight.dim() <= 1:
            continue
        nn.init.xavier_uniform_(weight)

    return transformer

# Dataset

In [16]:
class TranslationDataset(Dataset):
    """Wraps a translation dataset and yields fixed-length tensors for training.

    Each item of `ds` is read as ``item['translation'][lang]``. Every returned
    tensor is exactly `seq` long; sentences that do not fit raise ValueError.

    Args:
        ds: indexable dataset of ``{'translation': {lang: text}}`` items.
        tokenizer_src: tokenizer used to encode the source text.
        tokenizer_tgt: tokenizer used to encode the target text; the [SOS],
            [EOS] and [PAD] ids are looked up in this tokenizer.
        src_lang: key of the source text inside ``item['translation']``.
        tgt_lang: key of the target text inside ``item['translation']``.
        seq: fixed length of every returned sequence.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq):
        super().__init__()
        self.seq = seq

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # The special-token ids come from the target tokenizer and are used
        # for the encoder input as well.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        return len(self.ds)

    def _padding(self, count: int) -> torch.Tensor:
        """Return an int64 tensor of `count` [PAD] ids (empty when `count` is 0)."""
        # torch.full fills the tensor in one call instead of converting a
        # Python list of one-element tensors element by element.
        return torch.full((count,), self.pad_token.item(), dtype=torch.int64)

    def __getitem__(self, idx):
        """Build the encoder input, decoder input, masks and label for item `idx`.

        Raises:
            ValueError: if either sentence plus its special tokens exceeds `seq`.
        """
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries [SOS] and [EOS]; the decoder input carries
        # only [SOS] and the label only [EOS].
        enc_num_padding_tokens = self.seq - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq - len(dec_input_tokens) - 1

        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        enc_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        enc_padding = self._padding(enc_num_padding_tokens)
        dec_padding = self._padding(dec_num_padding_tokens)

        # [SOS] source tokens [EOS] [PAD]...
        encoder_input = torch.cat([self.sos_token, enc_ids, self.eos_token, enc_padding], dim=0)

        # [SOS] target tokens [PAD]...
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_padding], dim=0)

        # target tokens [EOS] [PAD]...  (the decoder input shifted left by one)
        label = torch.cat([dec_ids, self.eos_token, dec_padding], dim=0)

        # Double check the size of the tensors to make sure they are all seq long
        assert encoder_input.size(0) == self.seq
        assert decoder_input.size(0) == self.seq
        assert label.size(0) == self.seq

        return {
            "encoder_input": encoder_input,  # (seq)
            "decoder_input": decoder_input,  # (seq)
            # 1 where the position is not padding
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq)
            # non-padding positions combined with the causal mask
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq) & (1, seq, seq)
            "label": label,  # (seq)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

def causal_mask(size):
    """Return a (1, size, size) bool mask that is True on and below the diagonal."""
    lower_triangle = torch.tril(torch.ones((1, size, size), dtype=torch.int))
    return lower_triangle.bool()

In [17]:
def get_config():
    """Return a fresh dict holding the training hyper-parameters and file locations."""
    config = dict(
        batch_size=8,
        num_epochs=20,
        lr=10**-4,
        seq=350,
        d_model=512,
        datasource='opus_books',
        lang_src="en",
        lang_tgt="it",
        model_folder="weights",
        model_basename="tmodel_",
        preload="latest",
        tokenizer_file="tokenizer_{0}.json",  # {0} is filled with the language code
        experiment_name="runs/tmodel",
    )
    return config

def get_weights_file_path(config, epoch: str):
    """Return the checkpoint path for `epoch`: <datasource>_<model_folder>/<model_basename><epoch>.pt."""
    folder_name = "{}_{}".format(config['datasource'], config['model_folder'])
    file_name = "{}{}.pt".format(config['model_basename'], epoch)
    return str(Path('.').joinpath(folder_name, file_name))

# Find the latest weights file in the weights folder
def latest_weights_file_path(config):
    """Return the checkpoint path that sorts last in the weights folder, or None if there is none."""
    weights_dir = Path(f"{config['datasource']}_{config['model_folder']}")
    pattern = f"{config['model_basename']}*"
    # Paths are compared as paths, so "latest" means last in sort order.
    candidates = sorted(weights_dir.glob(pattern))
    if not candidates:
        return None
    return str(candidates[-1])

In [22]:
def get_all_sentences(ds, lang):
    """Lazily yield the `lang` text of every item in `ds`."""
    yield from (entry['translation'][lang] for entry in ds)

In [23]:
def get_or_build_tokenizer(config, ds, lang):
    """Load the tokenizer for `lang` from disk, or train and save one if the file is missing.

    The file location is ``config['tokenizer_file']`` formatted with `lang`.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously saved tokenizer when one exists.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
    trainer = WordLevelTrainer(special_tokens=special_tokens, min_frequency=2)
    sentences = get_all_sentences(ds, lang)
    tokenizer.train_from_iterator(sentences, trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

In [24]:
def get_ds(config):
    """Load the raw dataset, build both tokenizers and return the train/validation loaders.

    Returns:
        (train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)
    """
    source_name = config['datasource']
    src_lang = config['lang_src']
    tgt_lang = config['lang_tgt']

    ds_raw = load_dataset(f"{source_name}", f"{src_lang}-{tgt_lang}", split='train')

    tokenizer_src = get_or_build_tokenizer(config, ds_raw, src_lang)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, tgt_lang)

    # 90% of the items go to training, the remainder to validation.
    total = len(ds_raw)
    train_ds_size = int(0.9 * total)
    split_sizes = [train_ds_size, total - train_ds_size]
    train_ds_raw, val_ds_raw = random_split(ds_raw, split_sizes)

    def wrap(subset):
        return TranslationDataset(subset, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, config['seq'])

    train_ds = wrap(train_ds_raw)
    val_ds = wrap(val_ds_raw)

    # Report the longest tokenized sentence on each side of the full dataset.
    longest_src = 0
    longest_tgt = 0
    for item in ds_raw:
        pair = item['translation']
        longest_src = max(longest_src, len(tokenizer_src.encode(pair[src_lang]).ids))
        longest_tgt = max(longest_tgt, len(tokenizer_tgt.encode(pair[tgt_lang]).ids))

    print(f'Max length of source sentence: {longest_src}')
    print(f'Max length of target sentence: {longest_tgt}')

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Validation is decoded one sentence at a time.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

In [25]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build a transformer sized from `config`; source and target share the same sequence length."""
    src_positions = config["seq"]
    tgt_positions = config['seq']
    return build_transformer(vocab_src_len, vocab_tgt_len, src_positions, tgt_positions, d_model=config['d_model'])

# Training

In [29]:
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate target ids one at a time, always picking the highest-scoring token.

    Decoding starts from [SOS] and stops once [EOS] is produced or the
    sequence reaches `max_len` ids. Returns a 1-D tensor including [SOS].
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The encoder runs once; its output is reused at every decoding step.
    encoder_output = model.encode(source, source_mask)
    generated = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Causal mask over the tokens generated so far.
        step_mask = causal_mask(generated.size(1)).type_as(source_mask).to(device)

        hidden = model.decode(encoder_output, source_mask, generated, step_mask)

        # Only the last position is projected to obtain the next token.
        scores = model.project(hidden[:, -1])
        next_word = torch.max(scores, dim=1).indices
        next_id = next_word.item()

        appended = torch.empty(1, 1).type_as(source).fill_(next_id).to(device)
        generated = torch.cat([generated, appended], dim=1)

        if next_id == eos_idx:
            break

    return generated.squeeze(0)

In [26]:
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, writer, num_examples=2):
    """Greedy-decode a few validation sentences, print them and log text metrics.

    Args:
        model: model exposing encode/decode/project; it is switched to eval mode.
        validation_ds: iterable of batches with batch size 1.
        tokenizer_src, tokenizer_tgt: tokenizers passed on to greedy_decode;
            tokenizer_tgt also decodes the generated ids back to text.
        max_len: maximum number of ids to generate per sentence.
        device: device the batch tensors are moved to.
        print_msg: callable used for all console output.
        global_step: step value attached to the logged scalars.
        writer: TensorBoard-style writer, or a falsy value to skip metric logging.
        num_examples: stop after this many sentences.
    """
    # Local import: the notebook's import cells do not bring in shutil.
    import shutil

    model.eval()
    count = 0

    source_texts = []
    expected = []
    predicted = []

    # Portable terminal-width lookup; yields 80 columns when the width cannot
    # be determined (for example inside a notebook kernel).
    console_width = shutil.get_terminal_size(fallback=(80, 24)).columns

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device) # (b, seq)
            encoder_mask = batch["encoder_mask"].to(device) # (b, 1, 1, seq)

            # check that the batch size is 1
            assert encoder_input.size(
                0) == 1, "Batch size must be 1 for validation"

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch["src_text"][0]
            target_text = batch["tgt_text"][0]
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)

            print_msg('-'*console_width)
            print_msg(f"{f'SOURCE: ':>12}{source_text}")
            print_msg(f"{f'TARGET: ':>12}{target_text}")
            print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break

    # Nothing to score if the validation iterable was empty.
    if writer and predicted:
        # Character error rate, word error rate and BLEU over the decoded examples.
        metrics = (
            ('validation cer', torchmetrics.CharErrorRate),
            ('validation wer', torchmetrics.WordErrorRate),
            ('validation BLEU', torchmetrics.BLEUScore),
        )
        for tag, metric_cls in metrics:
            score = metric_cls()(predicted, expected)
            writer.add_scalar(tag, score, global_step)
            writer.flush()

In [27]:
def train_model(config):
    """Train the translation model described by `config`, checkpointing after every epoch.

    Steps: pick a device, build the data loaders and model, optionally resume
    from a checkpoint (``config['preload']`` is 'latest', an epoch string, or
    falsy), then for each epoch run the training loop, a short validation
    pass, and save model/optimizer state plus the epoch and global step.
    """
    # Resolve the device as a torch.device up front so the CUDA queries below
    # receive a real device object.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        # torch.backends.mps may be absent on older builds, hence the getattr.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")
    print("Using device:", device)
    if device.type == 'cuda':
        print(f"Device name: {torch.cuda.get_device_name(device)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device).total_memory / 1024 ** 3} GB")
    elif device.type == 'mps':
        print("Device name: <mps>")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")
        print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
        print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")

    # Make sure the weights folder exists
    Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # If the user specified a model to preload before training, load it
    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    if preload == 'latest':
        model_filename = latest_weights_file_path(config)
    elif preload:
        model_filename = get_weights_file_path(config, preload)
    else:
        model_filename = None

    if model_filename:
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on one device be resumed on another.
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload, starting from scratch')

    # Labels are target-language ids, so the padding id to ignore is looked up
    # in the target tokenizer.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            torch.cuda.empty_cache()
            # run_validation leaves the model in eval mode; switch back every epoch.
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
            for batch in batch_iterator:

                encoder_input = batch['encoder_input'].to(device) # (b, seq)
                decoder_input = batch['decoder_input'].to(device) # (B, seq)
                encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq)
                decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq, seq)

                encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq, d_model)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq, d_model)
                proj_output = model.project(decoder_output) # (B, seq, vocab_size)

                label = batch['label'].to(device) # (B, seq)

                # Flatten to (B * seq, vocab_size) vs (B * seq) for the loss.
                loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
                batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()

                loss.backward()

                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

                global_step += 1

            run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq'], device, lambda msg: batch_iterator.write(msg), global_step, writer)

            # Save everything needed to resume from the next epoch.
            model_filename = get_weights_file_path(config, f"{epoch:02d}")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)
    finally:
        # Release the event-file handle even if training is interrupted.
        writer.close()

In [30]:
# Entry point: silence all warnings, build the configuration and start training.
if __name__ == '__main__':
    warnings.filterwarnings("ignore")
    # `config` stays bound at module level after this cell runs.
    config = get_config()
    train_model(config)

Using device: cuda
Device name: NVIDIA GeForce RTX 3060 Laptop GPU
Device memory: 5.99951171875 GB
Max length of source sentence: 309
Max length of target sentence: 274
No model to preload, starting from scratch


Processing Epoch 00: 100%|██████████| 3638/3638 [1:09:58<00:00,  1.15s/it, loss=5.109]


--------------------------------------------------------------------------------
    SOURCE: 'Go with this note to the Countess Vronskaya's country house; do you know it?
    TARGET: — Va’ con questo stesso biglietto, in campagna, dalla contessa Vronskaja, sai?
 PREDICTED: — Non è vero , ma non è vero , ma non è vero ?
--------------------------------------------------------------------------------
    SOURCE: It was long since the farm work had seemed so important to him as it did that evening.
    TARGET: Da tempo gli affari dell’azienda non gli apparivano così importanti come quel giorno.
 PREDICTED: a un ’ altra , ma non era più stato , ma non si .
--------------------------------------------------------------------------------


Processing Epoch 01: 100%|██████████| 3638/3638 [1:04:31<00:00,  1.06s/it, loss=5.993]


--------------------------------------------------------------------------------
    SOURCE: When they were still a quarter of a verst from the house, Levin saw Grisha and Tanya running toward him.
    TARGET: Circa un quarto di versta prima di giungere a casa, Levin scorse Tanja e Griša che gli correvano incontro.
 PREDICTED: Quando si , , , , Levin , Levin si fermò e Levin si fermò , e Levin si fermò .
--------------------------------------------------------------------------------
    SOURCE: Through two luminous windows saw
    TARGET: attraverso a due finestre luminose,
 PREDICTED: .
--------------------------------------------------------------------------------


Processing Epoch 02: 100%|██████████| 3638/3638 [1:14:12<00:00,  1.22s/it, loss=4.736]


--------------------------------------------------------------------------------
    SOURCE: And of all princes, it is impossible for the new prince to avoid the imputation of cruelty, owing to new states being full of dangers.
    TARGET: Et intra tutti e' principi, al principe nuovo è impossibile fuggire el nome di crudele, per essere li stati nuovi pieni di pericoli.
 PREDICTED: E che , per ' tempi è stato di non essere , et , et , et , et .
--------------------------------------------------------------------------------
    SOURCE: You can never rouse Harris.
    TARGET: Noi non possiamo mai scuotere Harris.
 PREDICTED: Non vi avete detto nulla di Harris .
--------------------------------------------------------------------------------


Processing Epoch 03: 100%|██████████| 3638/3638 [1:12:26<00:00,  1.19s/it, loss=5.624]


--------------------------------------------------------------------------------
    SOURCE: They prayed: 'That they may live in chastity for the good of the fruits of the womb, and find joy in their sons and daughters.'
    TARGET: Si pregava “perché fosse loro donata la purezza, e il frutto delle viscere per il loro bene, perché si rallegrassero della vista dei figli e delle figlie”.
 PREDICTED: : “ è sempre sempre più di , di , e di , e di e di .
--------------------------------------------------------------------------------
    SOURCE: She was dressed in pure white; an amber-coloured scarf was passed over her shoulder and across her breast, tied at the side, and descending in long, fringed ends below her knee.
    TARGET: Aveva un vestito bianco, una sciarpa color ambra gettata sulle spalle.
 PREDICTED: Ella era un ' espressione di capelli e , e la mano era , e la mano si alzò e si mise a guardare il viso , e si , , si , si , il capo .
---------------------------------------------

Processing Epoch 04: 100%|██████████| 3638/3638 [1:04:49<00:00,  1.07s/it, loss=4.891]


--------------------------------------------------------------------------------
    SOURCE: The Countess Lydia Ivanovna knew very well that it was one of his greatest pleasures, though he would never confess it.
    TARGET: La contessa Lidija Ivanovna sapeva bene che questa era una delle sue più grandi gioie, anche se egli non l’avrebbe mai confessato.
 PREDICTED: La contessa Lidija Ivanovna aveva una volta che era una più grande , ma non poteva mai mai .
--------------------------------------------------------------------------------
    SOURCE: He had been to Spain, where he arranged serenades and became intimate with a Spanish woman who played the mandoline.
    TARGET: Era stato in Spagna, e là aveva fatto serenate e stretto amicizia con una spagnola che sonava il mandolino.
 PREDICTED: Era stato per , dove si con uno di , e di con un di .
--------------------------------------------------------------------------------


Processing Epoch 05: 100%|██████████| 3638/3638 [1:14:14<00:00,  1.22s/it, loss=4.308]


--------------------------------------------------------------------------------
    SOURCE: It trotted quietly on until its would-be assassin was within a yard of it, and then it turned round and sat down in the middle of the road, and looked at Montmorency with a gentle, inquiring expression, that said:
    TARGET: Trotterellò tranquillamente finchè l’eventuale assassino non si trovò a un metro da lui; e poi si voltò e si sedette in mezzo alla strada, guardando Montmorency con una soave, interrogativa espressione, che diceva:
 PREDICTED: Era un po ’ di sollievo , se fosse stato più difficile di più , e poi si alzò in piedi , e si fermò a piedi , e si fermò con un ’ espressione di gioia , che diceva :
--------------------------------------------------------------------------------
    SOURCE: She did not doubt that she had acted rightly, yet for a long time she lay in bed unable to sleep.
    TARGET: Non aveva nessun dubbio di non essersi regolata così come conveniva. Ma a letto, per 

Processing Epoch 06: 100%|██████████| 3638/3638 [1:11:52<00:00,  1.19s/it, loss=4.403]


--------------------------------------------------------------------------------
    SOURCE: And this being an ancient custom, it cannot be called a new principality, because there are none of those difficulties in it that are met with in new ones; for although the prince is new, the constitution of the state is old, and it is framed so as to receive him as if he were its hereditary lord.
    TARGET: Et essendo questo ordine antiquato, non si può chiamare principato nuovo, perché in quello non sono alcune di quelle difficultà che sono ne' nuovi; perché, se bene el principe è nuovo, li ordini di quello stato sono vecchi et ordinati a riceverlo come se fussi loro signore ereditario.
 PREDICTED: E questa è una grande , non può essere , perché non sono mai stati che quelli che sono stati in Italia , e che con le cose sono , come si , e come se non si , e che , come se non si .
--------------------------------------------------------------------------------
    SOURCE: His softened voice an

Processing Epoch 07: 100%|██████████| 3638/3638 [1:04:31<00:00,  1.06s/it, loss=3.874]


--------------------------------------------------------------------------------
    SOURCE: In that short time the centre of the cloud had already so moved over the sun that it was as dark as during an eclipse.
    TARGET: In quel breve spazio di tempo la nuvola s’era già tanto avanzata col suo centro sul sole, che s’era fatto buio come in una eclissi.
 PREDICTED: In quel momento il più piccolo della luce , si era nel sole che il sole era come un matrimonio .
--------------------------------------------------------------------------------
    SOURCE: But when I had done this, I was unable to stir it up again, or to get under it, much less to move it forward towards the water; so I was forced to give it over; and yet, though I gave over the hopes of the boat, my desire to venture over for the main increased, rather than decreased, as the means for it seemed impossible.
    TARGET: Ma raggiunta questa meta, mi trovai nuovamente inabile a moverla, a mettermici sotto, tanto più poi a spin

Processing Epoch 08: 100%|██████████| 3638/3638 [1:04:40<00:00,  1.07s/it, loss=3.666]


--------------------------------------------------------------------------------
    SOURCE: He had been sitting, without knowing it, on the very verge of a small gully, the long grass hiding it from view; and in leaning a little back he had shot over, pie and all.
    TARGET: Egli s’era seduto, senza accorgersene, sul ciglio d’un fosso che la lunga erba nascondeva alla vista, e, nel tirarsi indietro, v’era precipitato col pasticcio e tutto.
 PREDICTED: Aveva appena appena seduto , senza capire , in alto , in un piccolo , un po ’ di , di sotto gli alberi , e di un pezzo , lo fece in fretta , e tutto il pasticcio .
--------------------------------------------------------------------------------
    SOURCE: It is unpleasant for me to enter this house.
    TARGET: “Viene lei da me — pensò Vronskij — e sarebbe meglio.
 PREDICTED: È vero che mi è accaduto questo a casa .
--------------------------------------------------------------------------------


Processing Epoch 09: 100%|██████████| 3638/3638 [1:04:45<00:00,  1.07s/it, loss=3.113]


--------------------------------------------------------------------------------
    SOURCE: This was hardly what I intended.
    TARGET: Veramente io non la intendevo così.
 PREDICTED: Non avevo nulla da fare .
--------------------------------------------------------------------------------
    SOURCE: He loves (as he _can_ love, and that is not as you love) a beautiful young lady called Rosamond.
    TARGET: "Egli è innamorato (come lui può esserlo) di una bella ragazza, per nome Rosmunda.
 PREDICTED: Egli ha detto che fare amore , e che non è amore , come è giovane , ragazza di ragazza .
--------------------------------------------------------------------------------


Processing Epoch 10: 100%|██████████| 3638/3638 [1:14:27<00:00,  1.23s/it, loss=3.349]


--------------------------------------------------------------------------------
    SOURCE: "Go to the devil!" was his brother-in-law's recommendation.
    TARGET: — Andate al diavolo, — gli disse il cognato.
 PREDICTED: — il diavolo ! — esclamò il cognato .
--------------------------------------------------------------------------------
    SOURCE: "What are you going to do?" asked George's father.
    TARGET: — E che decisione prendi? — chiese il padre di Giorgio.
 PREDICTED: — Che volete fare ? — domandò il padre di Giorgio .
--------------------------------------------------------------------------------


Processing Epoch 11: 100%|██████████| 3638/3638 [1:14:27<00:00,  1.23s/it, loss=3.293]


--------------------------------------------------------------------------------
    SOURCE: I never thought it would be so interesting.
    TARGET: Non avrei mai pensato che la cosa fosse così interessante.
 PREDICTED: Non avevo mai pensato a questo .
--------------------------------------------------------------------------------
    SOURCE: 'I wrote both to you and to Sergius Ivanich that I do not know you and do not wish to know you.
    TARGET: — Io ho scritto a voi e a Sergej Ivanyc che non vi conosco e non voglio conoscervi.
 PREDICTED: — Io ti ho scritto e a Sergej Ivanovic , a questo non so che io e non lo voglio .
--------------------------------------------------------------------------------


Processing Epoch 12: 100%|██████████| 3638/3638 [1:12:11<00:00,  1.19s/it, loss=3.357]


--------------------------------------------------------------------------------
    SOURCE: She had not expected such cruelty from her, and was angry with her.
    TARGET: Non si aspettava tanta crudeltà da lei e ne provò sdegno.
 PREDICTED: Ella non aspettava in lei un essere simile , e si era arrabbiato con lei .
--------------------------------------------------------------------------------
    SOURCE: The doctor...'
    TARGET: Il dottore... e poi...
 PREDICTED: Il dottore ...
--------------------------------------------------------------------------------


Processing Epoch 13: 100%|██████████| 3638/3638 [1:14:19<00:00,  1.23s/it, loss=3.487]


--------------------------------------------------------------------------------
    SOURCE: When the Countess Nordston took the liberty of hinting that she had hoped for something better, Kitty got so heated and proved so convincingly that no one on earth could be better than Levin, that the Countess had to admit it, and thereafter never encountered Levin in Kitty's presence without a smile of delight.
    TARGET: Quando la contessa Nordston si permise di accennare al fatto che avrebbe desiderato per lei qualcosa di meglio, Kitty si accalorò tanto e dimostrò con tanta convinzione che non poteva esservi al mondo alcuno migliore di Levin, che la contessa Nordston dovette riconoscerlo e da allora in poi, in presenza di Kitty, accolse Levin con un sorriso di ammirazione.
 PREDICTED: Quando la contessa Nordston aveva preso la libertà che aveva cercato di trovare qualcosa , Kitty s ’ era preparato a trovare una così e che non si poteva trovare sulla terra , Levin che la contessa Nordston av

Processing Epoch 14: 100%|██████████| 3638/3638 [1:12:19<00:00,  1.19s/it, loss=3.097]


--------------------------------------------------------------------------------
    SOURCE: And now, lest my good resolutions should continue, my companion, who had enticed me away, comes to me; “Well, Bob,” says he, clapping me upon the shoulder, “how do you do after it?
    TARGET: Allora il mio compagno per paura che continuassero le mie buone risoluzioni, perchè era stato egli che m’avea sedotto a fuggire di casa, mi si accosto battendomi amichevolmente con una mano la spalla e dicendomi: — «Ebbene, come vi sentite adesso, bell’uomo?
 PREDICTED: E ora , se il mio consiglio fosse andato a male , m ’ avesse voluto dire , , anch ’ io : , ! — « , padrone ! me stare a dire !»
--------------------------------------------------------------------------------
    SOURCE: 'Why, is it a philanthropic undertaking?'
    TARGET: — Be’, qualcosa di filantropico?
 PREDICTED: — Ma perché è un ’ impresa tale ?
--------------------------------------------------------------------------------


Processing Epoch 15: 100%|██████████| 3638/3638 [1:12:02<00:00,  1.19s/it, loss=3.672]


--------------------------------------------------------------------------------
    SOURCE: Some years older than I, she knew more of the world, and could tell me many things I liked to hear: with her my curiosity found gratification: to my faults also she gave ample indulgence, never imposing curb or rein on anything I said.
    TARGET: Aveva qualche anno più di me, e, conoscendo il mondo, poteva narrarmi cose che mi dilettavano. Anna era indulgente per i miei difetti e non metteva mai un freno alle mie parole.
 PREDICTED: Qualche anno fa più che mai il mondo di me conosceva o molte cose , che potei il mio dovere . La mia lettera , con la mia lettera , il mio pensiero non era più nulla , né pronta a nessuna forma .
--------------------------------------------------------------------------------
    SOURCE: "You know--and perhaps think well of."
    TARGET: — Voi pensate forse....
 PREDICTED: — Sapete , e credete di esser bene .
--------------------------------------------------------

Processing Epoch 16: 100%|██████████| 3638/3638 [1:12:31<00:00,  1.20s/it, loss=2.408]


--------------------------------------------------------------------------------
    SOURCE: The forest transaction was completed, he had the money in his pocket, the shooting had been fine, Oblonsky was in the best of spirits, and therefore all the more anxious to dispel Levin's ill-humour.
    TARGET: L’affare del bosco era concluso, il denaro era in tasca, la caccia era stata magnifica, e Stepan Arkad’ic si trovava nella più amena disposizione d’animo; voleva perciò in particolar modo disperdere il cattivo umore che era piombato su Levin.
 PREDICTED: Il bosco era passato , fatto il denaro in tasca , il denaro era bello , Stepan Arkad ’ ic era stato ottimo umore e , perciò , tutti , si faceva caldo per Levin .
--------------------------------------------------------------------------------
    SOURCE: CHAPTER XX
    TARGET: XX.
 PREDICTED: XX
--------------------------------------------------------------------------------


Processing Epoch 17: 100%|██████████| 3638/3638 [1:12:39<00:00,  1.20s/it, loss=2.459]


--------------------------------------------------------------------------------
    SOURCE: Only a few drops...
    TARGET: Appena poche gocce.
 PREDICTED: Solo alcuni pezzi ...
--------------------------------------------------------------------------------
    SOURCE: For the rest, whether trite or novel, it is short.
    TARGET: "Del resto è corta.
 PREDICTED: Per lo stesso , se la signorina o il tempo è breve e breve .
--------------------------------------------------------------------------------


Processing Epoch 18: 100%|██████████| 3638/3638 [1:07:59<00:00,  1.12s/it, loss=1.896]


--------------------------------------------------------------------------------
    SOURCE: The old man complained that his affairs were in a bad way.
    TARGET: Il vecchio si lamentava che gli affari andavano male.
 PREDICTED: Il vecchio che gli altri si alzarono .
--------------------------------------------------------------------------------
    SOURCE: 'You insist too much on your devotion, for me to value it greatly,' she replied in the same playful tone, while she involuntarily listened to the sound of Vronsky's footsteps following them.
    TARGET: — Tu insisti troppo su questa tua tenerezza, perché io possa apprezzarla — disse lei con lo stesso tono scherzoso, prestando involontariamente orecchio al suono dei passi di Vronskij che camminava dietro di loro.
 PREDICTED: — Voi dovete volerle bene , per me vi sta bene — ella disse con la stessa maniera sottile nel tono sottile di riso , mentre lei , con un tono tale che guardava Vronskij nel suo silenzio .
----------------------

Processing Epoch 19: 100%|██████████| 3638/3638 [1:16:48<00:00,  1.27s/it, loss=2.320]


--------------------------------------------------------------------------------
    SOURCE: The feeling was not like an electric shock, but it was quite as sharp, as strange, as startling: it acted on my senses as if their utmost activity hitherto had been but torpor, from which they were now summoned and forced to wake.
    TARGET: Quella sensazione non somigliava a un urto elettrico, ma era altrettanto acuto, quanto strano e violento. Pareva che fino a quel momento la mia maggior attività fosse stata soltanto torpore, dal quale mi s'imponeva d'uscire.
 PREDICTED: Il sentimento non era come un , ma nel mio strano , così strano , come si mostrò i miei segni . Dopo aver stabilito , a quanto i miei desideri , si erano messi a vedere prima i miei pensieri .
--------------------------------------------------------------------------------
    SOURCE: 'You don't care much for oysters?' said Oblonsky, emptying his champagne glass – 'or perhaps you're thinking of something else.
    TARGET: —