In [1]:
# Importing libraries

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

# Math
import math

# HuggingFace libraries 
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# Pathlib 
from pathlib import Path

# typing
from typing import Any

# Library for progress bars in loops
from tqdm import tqdm

# Importing library of warnings
import warnings

In [2]:
# Creating Input Embeddings
class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is multiplied by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        # Lookup table turning integer token ids into d_model-sized vectors.
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embeddings of the ids in ``x`` scaled by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale

In [3]:
# Creating the Positional Encoding
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to its input, then applies dropout.

    The table is computed once in the constructor and stored as the buffer
    ``pe`` of shape (1, seq_len, d_model), so it is not a trainable parameter.

    Args:
        d_model: Size of the last dimension of the table. Both even and odd
            values are supported.
        seq_len: Number of positions stored in the table.
        dropout: Probability passed to the ``nn.Dropout`` applied in ``forward``.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Table of shape (seq_len, d_model), filled in below.
        pe = torch.zeros(seq_len, d_model)

        # Column vector of positions 0 .. seq_len - 1, shape (seq_len, 1).
        position = torch.arange(0, seq_len, dtype = torch.float).unsqueeze(1)

        # One frequency per even column index: exp(-i * ln(10000) / d_model) for i = 0, 2, 4, ...
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Shape (seq_len, ceil(d_model / 2)).
        angles = position * div_term

        # Sine goes to the even columns.
        pe[:, 0::2] = torch.sin(angles)
        # Cosine goes to the odd columns. When d_model is odd there is one fewer
        # odd column than even columns, so the last frequency is dropped here;
        # for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading dimension of size 1 so the table broadcasts over the batch.
        pe = pe.unsqueeze(0)

        # Buffers are saved in the state dict and moved with the module, but are not parameters.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.shape[1]`` rows of the table to ``x`` and apply dropout."""
        # The buffer is not a parameter, so no gradient flows into the table.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

In [4]:
# Creating Layer Normalization
class LayerNormalization(nn.Module):
    """Normalizes the last dimension with one learnable scale and one learnable shift.

    Args:
        eps: Small constant added to the standard deviation so the division
            never hits zero.
    """

    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps
        # Single-element trainable scale, starts at 1.
        self.alpha = nn.Parameter(torch.ones(1))
        # Single-element trainable shift, starts at 0.
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` along the last dimension."""
        # keepdim keeps the reduced axis so the statistics broadcast against x.
        centered = x - x.mean(dim = -1, keepdim = True)
        spread = x.std(dim = -1, keepdim = True)
        return self.alpha * centered / (spread + self.eps) + self.bias

In [5]:
# Creating Feed Forward Layers
class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: Size of the input and output last dimension.
        d_ff: Size of the hidden last dimension.
        dropout: Probability for the dropout applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # expands d_model -> d_ff
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)  # projects d_ff -> d_model

    def forward(self, x):
        """Apply linear_1, ReLU, dropout and linear_2 to the last dimension of ``x``."""
        hidden = self.linear_1(x).relu()
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

In [6]:
# Creating the Multi-Head Attention block
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Size of the last dimension of the inputs and of the output.
        h: Number of heads; must divide ``d_model``.
        dropout: Probability for the dropout applied to the attention weights.

    After each ``forward`` call the attention weights that were used are
    available as ``self.attention_scores``.
    """

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h

        # Every head receives an equal slice of the model dimension.
        assert d_model % h == 0, 'd_model is not divisible by h'
        self.d_k = d_model // h

        # Projections for queries, keys, values and the final output.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            query, key, value: Tensors whose last dimension is the per-head size.
            mask: Optional tensor; positions where it equals 0 are filled with
                -1e9 before the softmax. ``None`` disables masking.
            dropout: Optional dropout module applied to the softmax output.

        Returns:
            A tuple ``(weighted_values, attention_weights)``; the weights are the
            ones after dropout when a dropout module is given.
        """
        head_dim = query.shape[-1]
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(head_dim)

        if mask is not None:
            # A very negative score becomes a zero weight after the softmax.
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = torch.softmax(scores, dim = -1)
        if dropout is not None:
            weights = dropout(weights)

        return torch.matmul(weights, value), weights

    def _split_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, h, seq, d_k)."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Project q, k and v, attend per head, merge the heads and apply ``w_o``."""
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        # Keep the weights on the module so they can be inspected afterwards.
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k); contiguous() is required before view.
        batch = x.shape[0]
        merged = x.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)

        return self.w_o(merged)

In [7]:
# Building Residual Connection
class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: ``x + dropout(sublayer(norm(x)))``.

    Args:
        dropout: Probability for the dropout applied to the sublayer output.
    """

    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Normalize ``x``, run ``sublayer`` on it, apply dropout and add ``x`` back.

        Args:
            x: Input tensor.
            sublayer: Callable taking the normalized tensor and returning a
                tensor that can be added to ``x``.
        """
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [8]:
# Building Encoder Block
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual connection.

    Args:
        self_attention_block: Attention module called with the same tensor as
            query, key and value.
        feed_forward_block: Feed-forward module applied after attention.
        dropout: Dropout probability for both residual connections.
    """

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps the attention sublayer, index 1 the feed-forward sublayer.
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Run both sublayers on ``x``; ``src_mask`` is forwarded to the attention block."""

        def _self_attend(normed):
            # Self-attention: the same tensor is used as query, key and value.
            return self.self_attention_block(normed, normed, normed, src_mask)

        attention_residual, feed_forward_residual = self.residual_connections
        x = attention_residual(x, _self_attend)
        return feed_forward_residual(x, self.feed_forward_block)

In [9]:
# Building Encoder 
# An Encoder can have several Encoder Blocks
class Encoder(nn.Module):
    """Stack of encoder layers followed by a final layer normalization.

    Args:
        layers: Modules applied in order; each is called as ``layer(x, mask)``.
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Feed ``x`` through every layer in order, then normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [10]:
# Building Decoder Block
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention and feed-forward.

    Each of the three sublayers is wrapped in its own residual connection.

    Args:
        self_attention_block: Attention module called with the decoder tensor
            as query, key and value.
        cross_attention_block: Attention module whose query is the decoder
            tensor and whose key and value are the encoder output.
        feed_forward_block: Feed-forward module applied last.
        dropout: Dropout probability for the three residual connections.
    """

    def __init__(self,  self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        # One residual wrapper per sublayer, in the order they are applied.
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply the three sublayers in order and return the resulting tensor."""

        def _self_attend(normed):
            # The target mask is passed to the decoder's self-attention.
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def _cross_attend(normed):
            # Queries come from the decoder; keys and values from the encoder output.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        sublayers = (_self_attend, _cross_attend, self.feed_forward_block)
        for residual, sublayer in zip(self.residual_connections, sublayers):
            x = residual(x, sublayer)
        return x

In [11]:
# Building Decoder
# A Decoder can have several Decoder Blocks
class Decoder(nn.Module):
    """Stack of decoder layers followed by a final layer normalization.

    Args:
        layers: Modules applied in order; each is called as
            ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every layer in order, then normalize the result."""
        hidden = x
        for block in self.layers:
            # Every layer receives the same encoder output and the same masks.
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

In [12]:
# Building the projection (linear) layer
class ProjectionLayer(nn.Module):
    """Linear map from ``d_model`` to ``vocab_size`` followed by log-softmax.

    Args:
        d_model: Size of the input last dimension.
        vocab_size: Size of the output last dimension.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return log-probabilities over the last dimension of ``proj(x)``."""
        logits = self.proj(x)
        return logits.log_softmax(dim = -1)

In [13]:
# Creating the Transformer Architecture
class Transformer(nn.Module):
    """Container that wires embeddings, positional encodings, encoder, decoder and projection.

    Args:
        encoder: Module called as ``encoder(src, src_mask)``.
        decoder: Module called as ``decoder(tgt, encoder_output, src_mask, tgt_mask)``.
        src_embed: Embedding applied to the source ids.
        tgt_embed: Embedding applied to the target ids.
        src_pos: Positional encoding applied to the source embeddings.
        tgt_pos: Positional encoding applied to the target embeddings.
        projection_layer: Module applied by ``project``.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed ``src``, add positional encoding and run the encoder with ``src_mask``."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed ``tgt``, add positional encoding and run the decoder.

        The decoder receives the encoder output together with both masks.
        """
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to ``x``."""
        return self.projection_layer(x)

In [14]:
# Building & Initializing Transformer

# Defining the function and its parameters, including the model dimension, the number of encoder and decoder blocks, the number of heads, etc.
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int = 512, N: int = 6, h: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> Transformer:
    """Assemble a Transformer and Xavier-initialize its weight matrices.

    Args:
        src_vocab_size: Vocabulary size for the source embedding.
        tgt_vocab_size: Vocabulary size for the target embedding and the projection.
        src_seq_len: Number of positions in the source positional encoding.
        tgt_seq_len: Number of positions in the target positional encoding.
        d_model: Model dimension.
        N: Number of encoder blocks and of decoder blocks.
        h: Number of attention heads.
        dropout: Dropout probability used by every sub-module.
        d_ff: Hidden size of the feed-forward blocks.

    Returns:
        The assembled ``Transformer``.
    """
    # Embeddings for both languages.
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Positional encodings for both languages.
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # N encoder blocks: self-attention + feed-forward each.
    encoder_blocks = [
        EncoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # N decoder blocks: self-attention + cross-attention + feed-forward each.
    decoder_blocks = [
        DecoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    # Maps decoder output to the target vocabulary.
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Xavier-uniform init for every parameter with more than one dimension;
    # one-dimensional parameters keep their default initialization.
    for weight in filter(lambda p: p.dim() > 1, transformer.parameters()):
        nn.init.xavier_uniform_(weight)

    return transformer

In [15]:
# Defining Tokenizer
def build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for ``lang`` from disk, or train and save it.

    Args:
        config: Mapping whose ``'tokenizer_file'`` entry is a format string
            that receives ``lang`` and yields the tokenizer file path.
        ds: Dataset handed to ``get_all_sentences`` when training is needed.
        lang: Language code.

    Returns:
        The loaded or freshly trained tokenizer.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously saved tokenizer when there is one.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Otherwise train a whitespace-split, word-level tokenizer from scratch.
    tokenizer = Tokenizer(WordLevel(unk_token = '[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()

    special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
    trainer = WordLevelTrainer(special_tokens = special_tokens, min_frequency = 2)

    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer = trainer)
    tokenizer.save(str(tokenizer_path))  # persisted so the next call takes the load branch
    return tokenizer

In [16]:
# Iterating through the dataset and yielding, for each sentence pair, the sentence in the requested language
def get_all_sentences(ds, lang):
    """Lazily yield ``pair['translation'][lang]`` for every ``pair`` in ``ds``."""
    yield from (pair['translation'][lang] for pair in ds)

In [17]:
def get_ds(config):
    """Load the dataset, build both tokenizers and return train/validation loaders.

    Args:
        config: Mapping read for ``'lang_src'``, ``'lang_tgt'``, ``'seq_len'``
            and ``'batch_size'``; it is also forwarded to ``build_tokenizer``.

    Returns:
        A tuple ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
    """
    # Train split of OpusBooks for the configured language pair.
    ds_raw = load_dataset('opus_books', f'{config["lang_src"]}-{config["lang_tgt"]}', split = 'train')

    # Build or load one tokenizer per language.
    tokenizer_src = build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = build_tokenizer(config, ds_raw, config['lang_tgt'])

    # Random 90% / 10% split into training and validation subsets.
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    # Wrap both subsets so they produce model-ready items.
    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

    # Report the longest tokenized sentence per language over the whole dataset.
    max_len_src = 0
    max_len_tgt = 0
    for pair in ds_raw:
        src_ids = tokenizer_src.encode(pair['translation'][config['lang_src']]).ids
        # Target sentences must be measured with the target tokenizer; using the
        # source tokenizer here would report a wrong maximum target length.
        tgt_ids = tokenizer_tgt.encode(pair['translation'][config['lang_tgt']]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    # Training uses the configured batch size; validation yields one pair at a time.
    train_dataloader = DataLoader(train_ds, batch_size = config['batch_size'], shuffle = True)
    val_dataloader = DataLoader(val_ds, batch_size = 1, shuffle = True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

In [18]:
def casual_mask(size):
    """Return a boolean mask of shape (1, size, size) that hides future positions.

    Entry ``[0, i, j]`` is True when ``j <= i`` (on or below the main diagonal)
    and False otherwise.
    """
    # Ones on and below the diagonal, zeros strictly above it.
    visible = torch.tril(torch.ones(1, size, size))
    return visible.bool()

In [19]:
class BilingualDataset(Dataset):
    """Dataset turning raw sentence pairs into fixed-length tensors for training.

    Args:
        ds: Indexable collection whose items expose ``item['translation'][lang]``.
        tokenizer_src: Tokenizer used for the source text.
        tokenizer_tgt: Tokenizer used for the target text; the ids of the
            special tokens are also read from it.
        src_lang: Key of the source language inside ``'translation'``.
        tgt_lang: Key of the target language inside ``'translation'``.
        seq_len: Length every returned id tensor is padded to.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        super().__init__()

        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # One-element int64 tensors holding the special-token ids.
        # NOTE(review): the ids come from the target tokenizer and are reused for
        # the encoder input as well — this presumes both tokenizers assign the
        # same ids to the special tokens; confirm against how they are trained.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Number of sentence pairs in the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self, index: Any) -> Any:
        """Build the tensors for the pair at ``index``.

        Returns:
            A dict with ``encoder_input``, ``decoder_input`` and ``label``
            (int64, length ``seq_len``), ``encoder_mask`` of shape
            (1, 1, seq_len), ``decoder_mask`` of shape (1, seq_len, seq_len),
            plus the raw ``src_text`` and ``tgt_text``.

        Raises:
            ValueError: If either sentence does not fit into ``seq_len`` once
                its special tokens are added.
        """
        src_target_pair = self.ds[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Token ids of both sentences.
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries [SOS] and [EOS]; the decoder input carries only
        # [SOS] and the label only [EOS], hence -2 and -1.
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')

        # Padding built directly as tensors; repeat(0) yields an empty tensor.
        enc_padding = self.pad_token.repeat(enc_num_padding_tokens)
        dec_padding = self.pad_token.repeat(dec_num_padding_tokens)
        dec_tokens = torch.tensor(dec_input_tokens, dtype = torch.int64)

        # [SOS] + source ids + [EOS] + padding
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype = torch.int64),
                self.eos_token,
                enc_padding,
            ]
        )

        # [SOS] + target ids + padding
        decoder_input = torch.cat([self.sos_token, dec_tokens, dec_padding])

        # target ids + [EOS] + padding — the decoder input shifted by one position
        label = torch.cat([dec_tokens, self.eos_token, dec_padding])

        # Every id tensor must have exactly seq_len entries.
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            # 1 for real tokens, 0 for padding.
            'encoder_mask': (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            # Padding mask combined with the mask returned by casual_mask.
            'decoder_mask': (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(decoder_input.size(0)),
            'label': label,
            'src_text': src_text,
            'tgt_text': tgt_text
        }

In [20]:
# Define function to obtain the most probable next token
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence by always picking the highest-scoring next token.

    Args:
        model: Object providing ``encode``, ``decode`` and ``project``.
        source: Source id tensor passed to ``model.encode``.
        source_mask: Mask passed to the encoder and the decoder.
        tokenizer_src: Unused here; kept for a uniform call signature.
        tokenizer_tgt: Tokenizer providing the ids of ``[SOS]`` and ``[EOS]``.
        max_len: Generation stops once the sequence has this many tokens.
        device: Device the decoder input is moved to.

    Returns:
        1-D tensor of generated ids, starting with the ``[SOS]`` id.
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The source is encoded once and reused at every decoding step.
    encoder_output = model.encode(source, source_mask)

    # The sequence starts as a single [SOS] token with the dtype of `source`.
    decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while decoder_input.size(1) != max_len:
        # Mask sized to the tokens generated so far.
        decoder_mask = casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # Only the last position is projected to obtain the next-token scores.
        prob = model.project(out[:, -1])
        next_word = torch.max(prob, dim=1).indices

        next_token = torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)
        decoder_input = torch.cat([decoder_input, next_token], dim=1)

        # Stop right after [EOS] has been appended.
        if next_word == eos_idx:
            break

    return decoder_input.squeeze(0)

In [21]:
# Defining function to evaluate the model on the validation dataset
# num_examples = 2, two examples per run
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):
    """Decode a few validation examples greedily and print source, target and prediction.

    Args:
        model: Model switched to evaluation mode and passed to ``greedy_decode``.
        validation_ds: Iterable of batches; every batch must have batch size 1.
        tokenizer_src: Forwarded to ``greedy_decode``.
        tokenizer_tgt: Forwarded to ``greedy_decode`` and used to decode the output ids.
        max_len: Maximum generated length, forwarded to ``greedy_decode``.
        device: Device the batch tensors are moved to.
        print_msg: Callable receiving every output line.
        global_state: Not used by this function.
        writer: Not used by this function.
        num_examples: The loop stops after this many batches have been processed.
    """
    model.eval()

    console_width = 80
    separator = '-' * console_width

    # No gradients are needed for inference.
    with torch.no_grad():
        # `count` is the 1-based number of batches processed so far.
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)

            assert encoder_input.size(0) ==  1, 'Batch size must be 1 for validation.'

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch['src_text'][0]
            target_text = batch['tgt_text'][0]
            # Generated ids turned back into text.
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            report = (
                separator,
                f'SOURCE: {source_text}',
                f'TARGET: {target_text}',
                f'PREDICTED: {model_out_text}',
            )
            for line in report:
                print_msg(line)

            if count == num_examples:
                break

In [22]:
# We pass as parameters the config dictionary and the sizes of the source-language and target-language vocabularies
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build a transformer for the given vocabulary sizes.

    ``config['seq_len']`` is used as both the source and the target sequence
    length, and ``config['d_model']`` as the model dimension; every other
    ``build_transformer`` argument keeps its default.
    """
    seq_len = config['seq_len']
    d_model = config['d_model']
    return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, d_model)

In [23]:
# Define settings for building and training the transformer model
def get_config():
    """Return a fresh dict with the settings used to build and train the model."""
    settings = dict(
        batch_size = 8,
        num_epochs = 20,
        lr = 10**-4,
        seq_len = 350,
        d_model = 512,  # embedding size
        lang_src = 'en',
        lang_tgt = 'it',
        model_folder = 'weights',
        model_basename = 'tmodel_',
        preload = None,  # falsy value: no saved weights are loaded
        tokenizer_file = 'tokenizer_{0}.json',  # '{0}' is filled with the language code
        experiment_name = 'runs/tmodel',
    )
    return settings
    

# Function to construct the path for saving and retrieving model weights
def get_weights_file_path(config, epoch: str):
    """Return the path, as a string, of the weights file for ``epoch``.

    The file name is ``config['model_basename']`` followed by ``epoch`` and
    ``.pt``; it is placed in ``config['model_folder']`` relative to the
    current directory.
    """
    folder = config['model_folder']
    filename = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.', folder, filename))

In [24]:
def train_model(config):
    """Train the Transformer described by 'config', validating and checkpointing after every epoch.

    Args:
        config: Settings dictionary. This function reads 'model_folder',
            'experiment_name', 'lr', 'preload', 'num_epochs' and 'seq_len' directly;
            the whole dictionary is also forwarded to 'get_ds', 'get_model' and
            'get_weights_file_path'.

    Side effects:
        Creates the weights folder, prints progress, writes the training loss to
        TensorBoard, runs 'run_validation' at the end of each epoch and saves one
        checkpoint per epoch containing the epoch number, the model state, the
        optimizer state and the global step.
    """
    # Setting up device to run on GPU to train faster
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device {device}")

    # Creating model directory to store weights
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    # Retrieving dataloaders and tokenizers for source and target languages using the 'get_ds' function
    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

    # Initializing model on the selected device using the 'get_model' function
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    try:
        # Setting up the Adam optimizer with the learning rate from the
        # 'config' dictionary plus an epsilon value
        optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

        # Initializing epoch and global step variables
        initial_epoch = 0
        global_step = 0

        # Checking if there is a saved checkpoint to resume from
        if config['preload']:
            model_filename = get_weights_file_path(config, config['preload'])
            print(f'Preloading model {model_filename}')
            # NOTE: checkpoints are pickle files; only preload files you created yourself.
            # 'map_location' lets a checkpoint saved on GPU be loaded on a CPU-only machine.
            state = torch.load(model_filename, map_location=device)

            # Restoring the model weights; without this, training would resume
            # from randomly initialized parameters
            model.load_state_dict(state['model_state_dict'])
            # Setting the epoch to the saved one plus one, to resume from where it stopped
            initial_epoch = state['epoch'] + 1
            # Loading the optimizer state from the checkpoint
            optimizer.load_state_dict(state['optimizer_state_dict'])
            # Loading the global step from the checkpoint
            global_step = state['global_step']

        # Initializing CrossEntropyLoss function for training
        # We ignore padding tokens when computing loss, as they are not relevant for the learning process.
        # The labels are compared against target-vocabulary logits, so the padding id
        # is taken from the target tokenizer (assumes it defines '[PAD]').
        # We also apply label_smoothing to prevent overfitting
        loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

        # Iterating over each epoch from 'initial_epoch' up to
        # the number of epochs given in the config
        for epoch in range(initial_epoch, config['num_epochs']):

            # Putting the model in training mode once per epoch
            model.train()

            # Initializing an iterator over the training dataloader
            # We also use tqdm to display a progress bar
            batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch:02d}')

            # For each batch...
            for batch in batch_iterator:

                # Loading input data and masks onto the device
                encoder_input = batch['encoder_input'].to(device)
                decoder_input = batch['decoder_input'].to(device)
                encoder_mask = batch['encoder_mask'].to(device)
                decoder_mask = batch['decoder_mask'].to(device)

                # Running tensors through the Transformer
                encoder_output = model.encode(encoder_input, encoder_mask)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
                proj_output = model.project(decoder_output)

                # Loading the target labels onto the device
                label = batch['label'].to(device)

                # Computing loss between model's output and true labels
                loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))

                # Updating progress bar
                batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

                # Logging the training loss to TensorBoard
                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()

                # Performing backpropagation
                loss.backward()

                # Updating parameters based on the gradients
                optimizer.step()

                # Clearing the gradients to prepare for the next batch
                optimizer.zero_grad()

                global_step += 1  # Updating global step count

            # We run the 'run_validation' function at the end of each epoch
            # to evaluate model performance; messages are printed through tqdm
            # so they do not break the progress bar
            run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, batch_iterator.write, global_step, writer)

            # Saving model
            model_filename = get_weights_file_path(config, f'{epoch:02d}')
            # Writing current model state to 'model_filename'
            torch.save({
                'epoch': epoch,  # Current epoch
                'model_state_dict': model.state_dict(),  # Current model state
                'optimizer_state_dict': optimizer.state_dict(),  # Current optimizer state
                'global_step': global_step  # Current global step
            }, model_filename)
    finally:
        # Closing the TensorBoard writer even if training is interrupted
        writer.close()

In [25]:
if __name__ == '__main__':
    # Silencing every warning so the training log stays readable
    warnings.filterwarnings(action='ignore')

    # Building the settings dictionary
    config = get_config()

    # Launching training with those settings
    train_model(config)

Using device cuda
Max length of source sentence: 309
Max length of target sentence: 274


Processing epoch 00: 100%|██████████| 3638/3638 [17:58<00:00,  3.37it/s, loss=5.557]


--------------------------------------------------------------------------------
SOURCE: Would you like to see her?
TARGET: Vuoi vederla?
PREDICTED: E che cosa ?
--------------------------------------------------------------------------------
SOURCE: It was to his interest that every labourer should get through as much work as possible and at the same time give his mind to it, not injuring the winnowing machine, the horse-rake, or the threshing machine, but working intelligently. The labourer wished to work in the pleasantest way possible, with intervals of rest, and especially to think unconcernedly about other things without having to reason.
TARGET: Egli aveva interesse a che ogni lavoratore rendesse quanto più possibile, che non si distraesse, che badasse a non rompere i vagli, che riflettesse a quello che faceva; il lavoratore, invece, aveva interesse a lavorare nel modo più piacevole possibile, con respiro, e soprattutto senza preoccupazione, lasciandosi andare, senza pensare. E 

Processing epoch 01: 100%|██████████| 3638/3638 [16:39<00:00,  3.64it/s, loss=5.348]


--------------------------------------------------------------------------------
SOURCE: By remaining in the service I lose nothing.
TARGET: Rimanendo in servizio non perdo nulla.
PREDICTED: a questo momento , ma non mi .
--------------------------------------------------------------------------------
SOURCE: She glanced at Dolly, but not pausing for a reply continued:
TARGET: Guardò per un attimo Dolly, ma senza aspettare risposta, continuò.
PREDICTED: Ella guardò , ma non era un po ’ di nuovo , ma non era un po ’ di .


Processing epoch 02: 100%|██████████| 3638/3638 [15:45<00:00,  3.85it/s, loss=5.404]


--------------------------------------------------------------------------------
SOURCE: I meditated wonderingly on this incident; but gradually quitting it, as I found it for the present inexplicable, I turned to the consideration of my master's manner to myself.
TARGET: Vi riflettei per un istante, ma poi, trovando inesplicabile quell'incidente, vi rinunziai e mi misi allora a pensare alle maniere del signor Rochester.
PREDICTED: un uomo che mi , ma mi , ma mi , mi la mia vita , mi in quel momento che mi .
--------------------------------------------------------------------------------
SOURCE: First, I had no plough to turn up the earth—no spade or shovel to dig it. Well, this I conquered by making me a wooden spade, as I observed before; but this did my work but in a wooden manner; and though it cost me a great many days to make it, yet, for want of iron, it not only wore out soon, but made my work the harder, and made it be performed much worse.
TARGET: Primieramente io non aveva a

Processing epoch 03: 100%|██████████| 3638/3638 [16:03<00:00,  3.78it/s, loss=5.089]


--------------------------------------------------------------------------------
SOURCE: My eyes were covered and closed: eddying darkness seemed to swim round me, and reflection came in as black and confused a flow.
TARGET: La vista mi si oscurò, mi pareva che le tenebre mi circondassero, i miei pensieri si facevano confusi.
PREDICTED: La mia testa si e mi il capo .
--------------------------------------------------------------------------------
SOURCE: After that, feeling rather weak, he sat down on a bench in the yard and began demonstrating to Yashvin Russia's superiority to Prussia, especially in cavalry charges, and the carousal quieted down for a moment.
TARGET: Dopo, il comandante del reggimento, già infiacchito, sedette su di una panca nel cortile e cominciò a dimostrare a Jašvin la superiorità della Russia sulla Prussia, specie nell’attacco di cavalleria, e per un momento la baldoria si chetò.
PREDICTED: Dopo aver paura , un ’ espressione di cui si , si fermò in un angolo e s

Processing epoch 04: 100%|██████████| 3638/3638 [18:47<00:00,  3.23it/s, loss=3.777]


--------------------------------------------------------------------------------
SOURCE: I'm never sure what I'm going to be, from one minute to another!
TARGET: Non so mai che diventerò da un minuto all'altro!
PREDICTED: Non posso dire che cosa devo fare , ma ora , dopo , a un tratto , un ’ altra .
--------------------------------------------------------------------------------
SOURCE: 'I?
TARGET: — Io?
PREDICTED: — Io ?


Processing epoch 05: 100%|██████████| 3638/3638 [17:17<00:00,  3.51it/s, loss=4.280]


--------------------------------------------------------------------------------
SOURCE: Dolly, who had inherited her father's gift of putting things humorously, made Varenka collapse with laughter when she related for the third or fourth time, with ever fresh humorous additions, how she was just putting on some new ribbons in the visitor's honour, and was about to go into the drawing-room, when suddenly she heard the clatter of the old cart.
TARGET: E Dolly, che aveva ricevuto dal padre l’arte di raccontare le cose con spirito, faceva morir dal ridere Varen’ka, quando per la terza o quarta volta, sempre con nuove aggiunte umoristiche, raccontava come, proprio mentre lei si accingeva a mettersi dei nuovi nastri in omaggio all’ospite, e stava per uscire in salotto, ecco che, all’improvviso, aveva sentito il rumore della carretta.
PREDICTED: Dolly , che aveva il padre di Dar ’ ja Aleksandrovna , aveva preso la moglie di Varen ’ ka , quando Varen ’ ka , quando parlava con lei , o come se 

Processing epoch 06: 100%|██████████| 3638/3638 [18:07<00:00,  3.35it/s, loss=3.987]


--------------------------------------------------------------------------------
SOURCE: In this kind of dress I went my new journey, and was out five or six days.
TARGET: Non avendo questa volta la piroga che mi desse fastidio, presi per terra una via più corta, per giungere all’altura ov’era salito dianzi.
PREDICTED: A questo vestito di seta mi feci tornare a casa , e fui vicino a noi o sei giorni .
--------------------------------------------------------------------------------
SOURCE: It never occurred to him that by this action he was weakening himself, depriving himself of friends and of those who had thrown themselves into his lap, whilst he aggrandized the Church by adding much temporal power to the spiritual, thus giving it greater authority.
TARGET: Né si accorse, con questa deliberazione, che faceva sé debole, togliendosi li amici e quelli che se li erano gittati in grembo, e la Chiesa grande, aggiugnendo allo spirituale, che gli dà tanta autorità, tanto temporale.
PREDICTED

Processing epoch 07: 100%|██████████| 3638/3638 [17:30<00:00,  3.46it/s, loss=3.917]


--------------------------------------------------------------------------------
SOURCE: He understood all the different kinds and was able to draw inspiration from all, but he could not imagine that it is possible to be quite ignorant of the different kinds of art and to be inspired directly by what is in one's own soul, regardless of whether what one paints belongs to any particular school.
TARGET: Intendeva qualsiasi genere, e poteva ispirarsi a questo e a quello; non immaginava che si potesse del tutto ignorare quali generi di pittura esistessero e che ci si potesse ispirare direttamente a quello che c’è nell’anima, senza preoccuparsi se quello che si è dipinto appartiene a un certo determinato genere.
PREDICTED: Sentiva tutte le nuove correnti e di umore , ma per questo non poteva non poter essere altrimenti che si può essere potuto essere della vita e dell ’ arte , e che per un ’ arte e di un ’ azienda è accaduto in un ’ anima di simile a quello che si può comprare un ’ idea di .

Processing epoch 08: 100%|██████████| 3638/3638 [16:33<00:00,  3.66it/s, loss=4.293]


--------------------------------------------------------------------------------
SOURCE: Daily He announces more distinctly,--'Surely I come quickly!' and hourly I more eagerly respond,--'Amen; even so come, Lord Jesus!'"
TARGET: Avanzo rapidamente, e ogni ora che scorre rispondo con maggior ardore: — Amen, venite, Signore Gesù!
PREDICTED: Ogni volta che più egli si considerava , tanto più strana e più spesso mi diceva , con piacere che il colpevole fosse ancora un ' altra cosa : “ Signore !
--------------------------------------------------------------------------------
SOURCE: The wish to acquire is in truth very natural and common, and men always do so when they can, and for this they will be praised not blamed; but when they cannot do so, yet wish to do so by any means, then there is folly and blame.
TARGET: È cosa veramente molto naturale et ordinaria desiderare di acquistare; e sempre, quando li uomini lo fanno che possano, saranno laudati, o non biasimati; ma, quando non possono

Processing epoch 09: 100%|██████████| 3638/3638 [15:43<00:00,  3.86it/s, loss=2.612]


--------------------------------------------------------------------------------
SOURCE: The old-fashioned chairs were very bright, and the walnut-wood table was like a looking-glass.
TARGET: Le antiche sedie eran lucenti e la tavola di noce pareva uno specchio.
PREDICTED: Il vecchio di carrozze erano fra le sedie e le sedie , come un tavolino , era un tavolino su un tavolino .
--------------------------------------------------------------------------------
SOURCE: Adele brought her stool to my feet; ere long she touched my knee. "What is it, Adele?"
TARGET: Adele portò il panchettino davanti a me, e poco dopo mi toccò il ginocchio. — Che cosa volete? — le domandai.
PREDICTED: La signorina Miller le fece venire in camera mia , e la chiusi , perché mi domandò : — Che cosa è qui ?


Processing epoch 10: 100%|██████████| 3638/3638 [15:43<00:00,  3.86it/s, loss=2.998]


--------------------------------------------------------------------------------
SOURCE: 'Perhaps he has gone out into the hall; he was walking about there just now.
TARGET: — È forse uscito nell’ingresso, non faceva che camminare.
PREDICTED: — Può darsi che è andato in anticamera , perché è arrivato fino a ora .
--------------------------------------------------------------------------------
SOURCE: I sought the orchard, driven to its shelter by the wind, which all day had blown strong and full from the south, without, however, bringing a speck of rain.
TARGET: Mi diressi verso il pomario, per trovar riparo contro il vento di mezzogiorno che aveva soffiato fin dalla mattina, senza procurarci il sollievo della pioggia.
PREDICTED: Cercai un posticino alla brezza della vento coperto di vento , che tutto il vento della giornata , e su un vento coperto di miglio all ’ incirca .


Processing epoch 11: 100%|██████████| 3638/3638 [15:42<00:00,  3.86it/s, loss=3.565]


--------------------------------------------------------------------------------
SOURCE: "Do you forgive me, Jane?"
TARGET: — Mi perdonate, Jane?
PREDICTED: — Lo , Jane ?
--------------------------------------------------------------------------------
SOURCE: When she got back to the Cheshire Cat, she was surprised to find quite a large crowd collected round it: there was a dispute going on between the executioner, the King, and the Queen, who were all talking at once, while all the rest were quite silent, and looked very uncomfortable.
TARGET: Ma con sorpresa trovò una gran folla raccolta intorno al Ghignagatto; il Re, la Regina e il carnefice urlavano tutti e tre insieme, e gli altri erano silenziosi e malinconici.
PREDICTED: Quando la Regina fu terminata , ( era ) si vide un nuovo tratto alla folla ) e si sedettero fra il Re ) e il Re inforcò con tutta la Regina , mentre parlava di tutte le altre cose .


Processing epoch 12: 100%|██████████| 3638/3638 [15:42<00:00,  3.86it/s, loss=2.647]


--------------------------------------------------------------------------------
SOURCE: I suppose he had considered that these were all the governess would require for her private perusal; and, indeed, they contented me amply for the present; compared with the scanty pickings I had now and then been able to glean at Lowood, they seemed to offer an abundant harvest of entertainment and information.
TARGET: Aveva supposto che questo dovesse bastare a una istitutrice.
PREDICTED: Credo che si sentisse il fatto che tutti quelli che potessero , ed l ' agitazione del mio linguaggio ; perchè mi con tanta violenza come ne avevo giudicato il più facile ; poi mi la storia delle mie , credo che la a Lowood e il cibo .
--------------------------------------------------------------------------------
SOURCE: 'I don't expect you to understand me and my feelings, as an affectionate man would; but I did expect ordinary delicacy,' she said.
TARGET: — Io non m’aspetto che vi ricordiate di me, dei miei se

Processing epoch 13: 100%|██████████| 3638/3638 [16:46<00:00,  3.61it/s, loss=2.593]


--------------------------------------------------------------------------------
SOURCE: But Dolly, on receiving the telegram, only sighed over the rouble it had cost, and understood that it had been sent toward the end of a dinner.
TARGET: Dar’ja Aleksandrovna, invece, ricevuto il telegramma, sospirò soltanto per il rublo del telegramma e capì che la cosa era avvenuta alla fine d’un pranzo.
PREDICTED: Ma Dolly , nel telegramma , nel telegramma , sospirò soltanto in dieci rubli , dicendo che si doveva avere un pranzo verso la fine del pranzo .
--------------------------------------------------------------------------------
SOURCE: 'You are growing younger every day, Bondarenko!' he remarked to the ruddy-faced, smart-looking sergeant-major, serving for a second term, who stood just in front of him.
TARGET: — Tu diventi sempre più giovane, Bondarenko — disse rivolto a un ben fatto, rubicondo maresciallo che era stato richiamato in servizio per la seconda volta, e che stava diritto davant

Processing epoch 14: 100%|██████████| 3638/3638 [18:28<00:00,  3.28it/s, loss=3.070]


--------------------------------------------------------------------------------
SOURCE: It is very old, and it was very strong and great once.
TARGET: È città antichissima, e una volta era assai forte e grande.
PREDICTED: È molto vecchio , e molto ebbe grande e grande .
--------------------------------------------------------------------------------
SOURCE: "No, I didn't," grunted Harris; "I said twelve.
TARGET: — No, non è vero — grugnì Harris. — Ho detto dodici.
PREDICTED: — No , non ho mai pensato a Landau , — dissi Harris .


Processing epoch 15: 100%|██████████| 3638/3638 [17:41<00:00,  3.43it/s, loss=2.574]


--------------------------------------------------------------------------------
SOURCE: And the leaves of the trees that grew in the wood were very dark and thick, so that no ray of light came through the branches to lighten the gloom and sadness.
TARGET: E le foglie degli alberi che crescevano nella foresta erano oscurissime e folte tanto che non un raggio di luce filtrava a traverso i rami ad attenuare la tenebra e la tristezza.
PREDICTED: E le foglie gli alberi erano l ' uno all ' altro lato , così forte , così forte , che non si era acceso , e il fragore di quell ' aria s ' .
--------------------------------------------------------------------------------
SOURCE: She filled up the hiatus his silence left by a reply of her own.
TARGET: E vedendo che Saint-John non rispondeva, riprese:
PREDICTED: Ella si accostò al silenzio del suo silenzio , e immediatamente per un attimo di rimprovero .


Processing epoch 16: 100%|██████████| 3638/3638 [17:24<00:00,  3.48it/s, loss=2.858]


--------------------------------------------------------------------------------
SOURCE: 'Perhaps you would not if you were not successful,' said Vronsky.
TARGET: — Forse non lo confesseresti, se non avessi successo — disse Vronskij.
PREDICTED: — Può darsi che tu non sareste per te e per la stessa maniera — disse Vronskij .
--------------------------------------------------------------------------------
SOURCE: 'I shall still get angry with Ivan the coachman in the same way, shall dispute in the same way, shall inopportunely express my thoughts; there will still be a wall between my soul's holy of holies and other people; even my wife I shall still blame for my own fears and shall repent of it. My reason will still not understand why I pray, but I shall still pray, and my life, my whole life, independently of anything that may happen to me, is every moment of it no longer meaningless as it was before, but has an unquestionable meaning of goodness with which I have the power to invest i

Processing epoch 17: 100%|██████████| 3638/3638 [16:41<00:00,  3.63it/s, loss=2.040]


--------------------------------------------------------------------------------
SOURCE: 'Till then, and that may be always, you are happy and tranquil.
TARGET: — Fino ad allora, e potrebbe essere sempre, voi sarete felici e tranquilli.
PREDICTED: — Bisogna che non siamo sempre contenti di nuovo e di nuovo siete tranquillo .
--------------------------------------------------------------------------------
SOURCE: Now act as you please: write and contradict my assertion--expose my falsehood as soon as you like.
TARGET: Ora fate quello che volete, scrivete per contraddire la mia asserzione, sbugiardatemi, dite tutto quello che vi piace.
PREDICTED: Vi fa piacere , vi prego , vi prego , vieni a mente ; la mia ragione è sì fuor di me .


Processing epoch 18: 100%|██████████| 3638/3638 [17:40<00:00,  3.43it/s, loss=2.227]


--------------------------------------------------------------------------------
SOURCE: "That is no answer; or rather it is a very irritating, because a very evasive one. Reply clearly."
TARGET: — Non è una risposta, o almeno è irritante, perché evasiva; rispondete chiaro.
PREDICTED: — Non si tratta di una risposta , o almeno piuttosto , perché sono molto contenta di un ' espressione severa .
--------------------------------------------------------------------------------
SOURCE: I believe there were some misunderstandings between them.
TARGET: Credo che vi fossero frequenti dispute fra i due fratelli.
PREDICTED: Credo che ci fossero tutti e tre .


Processing epoch 19: 100%|██████████| 3638/3638 [17:54<00:00,  3.39it/s, loss=2.086]


--------------------------------------------------------------------------------
SOURCE: Annushka was already dozing, her broad hands, with a hole in one of the gloves, holding the red bag on her lap.
TARGET: Annuška già sonnecchiava, tenendo la sacca rossa sulle ginocchia con le mani larghe nei guanti, uno dei quali era bucato.
PREDICTED: Annuška si mangiava la , con le mani in un estremo senso , le braccia le braccia incrociate e morbide sul petto .
--------------------------------------------------------------------------------
SOURCE: "Why did I never hear of this?" I asked.
TARGET: — Perché non ho saputo nulla di ciò? — domandai.
PREDICTED: — Perché non ho mai saputo mai ? — domandai .
