<a href="https://www.kaggle.com/code/aisuko/implement-transformer-from-scratch?scriptVersionId=212771199" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import math
import torch
import warnings
import torch.nn as nn

from typing import Any
from tqdm import tqdm

from datasets import load_dataset

from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

In [2]:
class InputEmbeddings(nn.Module):
    """Look up token embeddings and scale them by sqrt(d_model).

    The scaling matches the embedding layers described in
    "Attention Is All You Need" (section 3.4).
    """

    def __init__(self, d_model:int, vocab_size: int):
        super().__init__()
        # Width of every embedding vector and number of rows in the table.
        self.d_model = d_model
        self.vocab_size = vocab_size
        # Learnable table mapping integer token ids to dense vectors.
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embeddings of the ids in ``x`` times sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to the input, then dropout.

    The table follows ``PE(pos, 2i) = sin(pos / 10000^(2i/d_model))`` and
    ``PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))``. It is computed once
    for ``seq_len`` positions and stored as a non-trainable buffer.

    Args:
        d_model: Size of the last dimension of the tensors to be encoded.
            Both even and odd values are supported.
        seq_len: Number of positions to precompute (maximum sequence length).
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model: int, seq_len:int, dropout: float)-> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Table of shape (seq_len, d_model), filled column by column below.
        pe = torch.zeros(seq_len, d_model)

        # Column vector of positions 0 .. seq_len - 1, shape (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        # 10000^(-2i/d_model), computed in log space; one entry per even index.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Broadcast to (seq_len, ceil(d_model / 2)).
        angles = position * div_term

        # Even columns take the sine of every angle.
        pe[:, 0::2] = torch.sin(angles)
        # Odd columns take the cosine. When d_model is odd there is one fewer
        # odd column than even column, so the last frequency is dropped here
        # (the unsliced assignment raised a shape error for odd d_model).
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch dimension so the table broadcasts over a batch.
        pe = pe.unsqueeze(0)

        # A buffer moves with the module and is saved in its state dict,
        # but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.shape[1]])``.

        ``x`` is indexed as (batch, positions, d_model); only the first
        ``x.shape[1]`` rows of the table are used.
        """
        # Buffers never require gradients, so no explicit detach is needed.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)


# Creating Layer Normalization
class LayerNormalization(nn.Module):
    """Normalize the last dimension to zero mean and unit standard deviation.

    A single learnable scale (``alpha``) and a single learnable shift
    (``bias``), both of shape ``(1,)``, are applied to every feature.
    """

    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()
        # Added to the standard deviation so the division never hits zero.
        self.eps = eps
        # Multiplicative scale, starts as the identity (ones).
        self.alpha = nn.Parameter(torch.ones(1))
        # Additive shift, starts at zero.
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last dim."""
        # keepdim=True keeps the statistics broadcastable against x.
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)

        scaled = self.alpha * (x - mean)
        denominator = std + self.eps
        return scaled / denominator + self.bias

class FeedForwardBlock(nn.Module):
    """Two linear layers with ReLU and dropout in between.

    Maps the last dimension d_model -> d_ff -> d_model.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)   # expansion: W1, b1
        self.dropout = nn.Dropout(dropout)         # applied to the hidden activations
        self.linear_2 = nn.Linear(d_ff, d_model)   # projection back: W2, b2

    def forward(self, x):
        """Return ``linear_2(dropout(relu(linear_1(x))))``."""
        hidden = self.linear_1(x)
        hidden = torch.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)


class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with learned projections.

    ``d_model`` is split evenly across ``h`` heads of width ``d_k``. The
    attention weights of the latest forward pass are kept on
    ``self.attention_scores``.
    """

    def __init__(self, d_model: int, h: int, dropout: float)-> None:
        super().__init__()
        self.d_model = d_model
        self.h = h

        # Every head must receive the same number of features.
        assert d_model % h == 0, 'd_model is not divisible by h'

        # Width of the query/key/value vectors inside one head.
        self.d_k = d_model // h

        # Projections for queries, keys, values and the merged output.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Positions where ``mask == 0`` are filled with -1e9 before the softmax
        so they get (near) zero weight. ``mask`` and ``dropout`` may be None.

        Returns:
            A tuple ``(weighted_values, attention_weights)``.
        """
        d_k = query.shape[-1]

        scores = query @ key.transpose(-2, -1)
        scores = scores / math.sqrt(d_k)

        if mask is not None:
            # In-place fill on the freshly computed score tensor.
            scores.masked_fill_(mask == 0, -1e9)

        weights = scores.softmax(dim=-1)
        if dropout is not None:
            weights = dropout(weights)

        return weights @ value, weights

    def _split_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, h, seq, d_k)."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, q,k,v,mask):
        """Project q/k/v, attend per head, merge the heads and apply ``w_o``."""
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        x, self.attention_scores = MultiHeadAttentionBlock.attention(
            query, key, value, mask, self.dropout
        )

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k); contiguous() is
        # required before view() because transpose() only changes strides.
        batch = x.shape[0]
        merged = x.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)

        return self.w_o(merged)

class ResidualConnection(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``."""

    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalized input and add the result to ``x``.

        ``sublayer`` is any callable taking one tensor and returning a tensor
        that can be added to ``x``.
        """
        normalized = self.norm(x)
        update = self.dropout(sublayer(normalized))
        return x + update


# Encoder

In [3]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper."""

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # One residual wrapper per sub-layer (attention, feed-forward).
        self.residual_connections = nn.ModuleList(
            [ResidualConnection(dropout), ResidualConnection(dropout)]
        )

    def forward(self, x, src_mask):
        """Run ``x`` through self-attention (using ``src_mask``) and the feed-forward block."""
        attention_residual, feed_forward_residual = self.residual_connections

        def self_attend(normed):
            # The same tensor serves as query, key and value.
            return self.self_attention_block(normed, normed, normed, src_mask)

        x = attention_residual(x, self_attend)
        return feed_forward_residual(x, self.feed_forward_block)

class Encoder(nn.Module):
    """Stack of encoder layers followed by a final layer normalization."""

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Pass ``x`` through every layer in order (each gets ``mask``), then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

# Decoder

The main difference is that each Decoder block has an additional sub-layer: a multi-head **cross-attention** component that uses the output of the Encoder as its keys and values, while its queries come from the decoder's own input.

In [4]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is wrapped in its own residual connection.
    """

    def __init__(
        self, 
        self_attention_block: MultiHeadAttentionBlock, 
        cross_attention_block: MultiHeadAttentionBlock,
        feed_forward_block: FeedForwardBlock,
        dropout: float
    ) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block    # attends over the decoder input
        self.cross_attention_block = cross_attention_block  # attends over the encoder output
        self.feed_forward_block = feed_forward_block
        # One residual wrapper per sub-layer.
        self.residual_connections = nn.ModuleList(
            [
                ResidualConnection(dropout),
                ResidualConnection(dropout),
                ResidualConnection(dropout),
            ]
        )

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply the three sub-layers in order and return the result."""
        self_residual, cross_residual, feed_forward_residual = self.residual_connections

        def masked_self_attention(normed):
            # Query, key and value all come from the decoder side; tgt_mask applies.
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attention(normed):
            # Queries from the decoder, keys/values from the encoder; src_mask applies.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        x = self_residual(x, masked_self_attention)
        x = cross_residual(x, cross_attention)
        return feed_forward_residual(x, self.feed_forward_block)

class Decoder(nn.Module):
    """Stack of decoder layers followed by a final layer normalization."""

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every layer with the encoder output and both masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)


class ProjectionLayer(nn.Module):
    """Linear map from ``d_model`` features to ``vocab_size`` log-probabilities."""

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return the log-softmax over the last dimension of ``proj(x)``."""
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)

In [5]:
class Transformer(nn.Module):
    """Container wiring embeddings, positional encodings, encoder, decoder and projection.

    It exposes three separate steps, ``encode``, ``decode`` and ``project``,
    rather than a single ``forward``.
    """

    def __init__(
        self,
        encoder: Encoder,
        decoder: Decoder,
        src_embed: InputEmbeddings,
        tgt_embed: InputEmbeddings,
        src_pos: PositionalEncoding,
        tgt_pos: PositionalEncoding,
        projection_layer: ProjectionLayer
    ) -> None:
        super().__init__()
        # Registration order is kept: it fixes the order of ``parameters()``.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed ``src``, add positional encoding and run the encoder."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed ``tgt``, add positional encoding and run the decoder."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to ``x``."""
        return self.projection_layer(x)


def build_transformer(
    src_vocab_size: int,
    tgt_vocab_size: int,
    src_seq_len: int,
    tgt_seq_len: int,
    d_model: int =512,
    N:int=6,
    h:int=8,
    dropout: float=0.1,
    d_ff:int=2048
) -> Transformer:
    """Assemble a full encoder-decoder Transformer.

    Args:
        src_vocab_size: Number of entries in the source embedding table.
        tgt_vocab_size: Number of entries in the target embedding table and
            width of the output projection.
        src_seq_len: Positions precomputed for the source positional encoding.
        tgt_seq_len: Positions precomputed for the target positional encoding.
        d_model: Feature width used throughout the model.
        N: Number of encoder blocks and number of decoder blocks.
        h: Number of attention heads per attention block.
        dropout: Dropout probability passed to every sub-module.
        d_ff: Hidden width of the feed-forward blocks.

    Returns:
        A ``Transformer`` whose parameters with more than one dimension
        have been re-initialized with Xavier uniform.
    """
    # Embeddings first, then the (parameter-free) positional encodings.
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # Each encoder block owns one self-attention and one feed-forward block.
    # Call arguments are evaluated left to right, so sub-modules are created
    # in the order: attention, then feed-forward.
    encoder_blocks = [
        EncoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # Each decoder block owns self-attention, cross-attention and feed-forward.
    decoder_blocks = [
        DecoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    # Maps decoder features onto the target vocabulary.
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(
        encoder,
        decoder,
        src_embed,
        tgt_embed,
        src_pos,
        tgt_pos,
        projection_layer,
    )

    # Xavier-uniform init for every weight matrix; 1-D parameters are left as created.
    for weight in (p for p in transformer.parameters() if p.dim() > 1):
        nn.init.xavier_uniform_(weight)

    return transformer

# Tokenization

We will use word-level tokenization, which turns each word in a sentence into a token.

In [6]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

def build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for ``lang``, training and saving it if missing.

    The file path is ``config['tokenizer_file']`` formatted with ``lang``.
    When that file exists it is loaded as-is. Otherwise a new tokenizer is
    trained on the ``lang`` sentences of ``ds`` and written to that path.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously trained tokenizer when one is on disk.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Words missing from the vocabulary map to '[UNK]'.
    tokenizer = Tokenizer(WordLevel(unk_token = '[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()

    # Special tokens are reserved; min_frequency=2 is the trainer's word-frequency threshold.
    special_tokens = ["[UNK]","[PAD]","[SOS]","[EOS]"]
    trainer = WordLevelTrainer(special_tokens=special_tokens, min_frequency=2)

    sentences = get_all_sentences(ds, lang)
    tokenizer.train_from_iterator(sentences, trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

# Load the dataset

Here we will use a dataset for a machine translation task.

In [7]:

class BilingualDataset(Dataset):
    """Map-style dataset turning translation pairs into fixed-length tensors.

    Each item of ``ds`` is read as ``item['translation'][lang]``. Every
    returned tensor is padded to ``seq_len``.

    Args:
        ds: Indexable collection of translation pairs supporting ``len()``.
        tokenizer_src: Tokenizer whose ``encode(text).ids`` gives source ids.
        tokenizer_tgt: Tokenizer for the target language. The ids of
            ``[SOS]``, ``[EOS]`` and ``[PAD]`` are taken from it.
        src_lang: Key of the source language inside ``'translation'``.
        tgt_lang: Key of the target language inside ``'translation'``.
        seq_len: Length of every encoder/decoder/label tensor.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        super().__init__()
        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # NOTE(review): these ids are also used for the encoder input, which
        # assumes both tokenizers assign the same ids to the special tokens
        # - confirm against how the tokenizers are built.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Return the number of translation pairs."""
        return len(self.ds)

    def __getitem__(self, index: Any) -> Any:
        """Build the tensors for the pair at ``index``.

        Returns:
            A dict with ``encoder_input`` ([SOS] + source + [EOS] + padding),
            ``decoder_input`` ([SOS] + target + padding), ``label``
            (target + [EOS] + padding), ``encoder_mask`` (1, 1, seq_len),
            ``decoder_mask`` (1, seq_len, seq_len), and the raw ``src_text``
            and ``tgt_text``.

        Raises:
            ValueError: If either sentence does not fit in ``seq_len``
                together with its special tokens.
        """
        src_target_pair = self.ds[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Encoder side reserves two slots ([SOS] and [EOS]); decoder input and
        # label reserve one each ([SOS] and [EOS] respectively).
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')

        enc_tokens = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_tokens = torch.tensor(dec_input_tokens, dtype=torch.int64)

        # Repeat the 1-element pad tensor directly instead of building a
        # Python list of tensors and converting it element by element.
        enc_padding = self.pad_token.repeat(enc_num_padding_tokens)
        dec_padding = self.pad_token.repeat(dec_num_padding_tokens)

        encoder_input = torch.cat([self.sos_token, enc_tokens, self.eos_token, enc_padding])
        decoder_input = torch.cat([self.sos_token, dec_tokens, dec_padding])
        # The label is the decoder input shifted left by one, ending in [EOS].
        label = torch.cat([dec_tokens, self.eos_token, dec_padding])

        # Internal sanity checks: all three tensors must be exactly seq_len long.
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        # Padding positions are masked out; the decoder additionally hides
        # every position after the current one.
        encoder_mask = (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        decoder_padding_mask = (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        decoder_mask = decoder_padding_mask & casual_mask(decoder_input.size(0))

        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'encoder_mask': encoder_mask,
            'decoder_mask': decoder_mask,
            'label': label,
            'src_text': src_text,
            'tgt_text': tgt_text,
        }

def casual_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Position ``[0, i, j]`` is True exactly when ``j <= i``.
    """
    # Ones strictly above the diagonal mark the positions to hide.
    hidden = torch.ones(1, size, size).triu(diagonal=1).int()
    return hidden == 0

# Iterating through the dataset and yielding each sentence in the requested language
def get_all_sentences(ds, lang):
    """Lazily yield ``item['translation'][lang]`` for every item of ``ds``."""
    yield from (record['translation'][lang] for record in ds)

def get_ds(config):
    """Load the translation dataset and build tokenizers and data loaders.

    Reads ``lang_src``, ``lang_tgt``, ``seq_len`` and ``batch_size`` from
    ``config`` (plus whatever ``build_tokenizer`` needs). The raw data is
    split 90% / 10% into training and validation sets.

    Returns:
        ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
        The validation loader uses a batch size of 1.
    """
    lang_src = config['lang_src']
    lang_tgt = config['lang_tgt']

    ds_raw = load_dataset('opus_books', f'{lang_src}-{lang_tgt}', split = 'train')

    tokenizer_src = build_tokenizer(config, ds_raw, lang_src)
    tokenizer_tgt = build_tokenizer(config, ds_raw, lang_tgt)

    # 90% of the pairs for training, the remainder for validation.
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, config['seq_len'])

    # Report the longest tokenized sentence per language so 'seq_len' can be
    # checked against the data.
    max_len_src = 0
    max_len_tgt = 0
    for pair in ds_raw:
        src_ids = tokenizer_src.encode(pair['translation'][lang_src]).ids
        # Target sentences must be measured with the target tokenizer
        # (the source tokenizer was used here before).
        tgt_ids = tokenizer_tgt.encode(pair['translation'][lang_tgt]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))
    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Batch size 1: validation decodes one sentence at a time.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

# Training

In [8]:
import warnings

# Define function to obtain the most probable next token
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate target ids one at a time, always taking the highest-scoring token.

    Decoding starts from ``[SOS]`` and stops once ``[EOS]`` is produced or
    the sequence holds ``max_len`` ids. ``tokenizer_src`` is accepted but
    not used.

    Returns:
        1-D tensor of generated ids, starting with the ``[SOS]`` id.
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # The source is encoded once and reused at every decoding step.
    encoder_output = model.encode(source, source_mask)

    # Sequence so far: shape (1, 1), same dtype as ``source``, on ``device``.
    decoder_input = torch.full((1, 1), sos_idx, dtype=source.dtype, device=device)

    while decoder_input.size(1) != max_len:
        # Causal mask sized to the current sequence length.
        decoder_mask = casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # Only the last position is projected to get scores for the next token.
        prob = model.project(out[:, -1])
        next_id = torch.max(prob, dim=1).indices.item()

        next_token = torch.full((1, 1), next_id, dtype=source.dtype, device=device)
        decoder_input = torch.cat([decoder_input, next_token], dim=1)

        if next_id == eos_idx:
            break

    return decoder_input.squeeze(0)


# Defining function to evaluate the model on the validation dataset
# num_examples=2, two examples per run
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):
    """Greedy-decode a few validation batches and print source, target and prediction.

    The model is switched to eval mode and left in it. ``global_state`` and
    ``writer`` are accepted but not used. Each batch must have batch size 1.
    Output goes through ``print_msg``, and the loop stops after
    ``num_examples`` batches.
    """
    model.eval()
    console_width = 80  # width of the separator line

    with torch.no_grad():
        # Batches are counted from 1 so the count can be compared to num_examples.
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)

            assert encoder_input.size(0)==1, 'Batch size must be 1 for validation.'

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch['src_text'][0]
            target_text = batch['tgt_text'][0]  # reference translation
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            report = (
                '-'*console_width,
                f'SOURCE: {source_text}',
                f'TARGET: {target_text}',
                f'PREDICTED: {model_out_text}',
            )
            for line in report:
                print_msg(line)

            if count == num_examples:
                break

# We pass as parameters the config dictionary and the vocabulary sizes of the source and target languages
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Build a Transformer for the given vocabulary sizes.

    ``config['seq_len']`` is used for both source and target sequence
    lengths; ``config['d_model']`` sets the model width. All other
    hyper-parameters keep the defaults of ``build_transformer``.
    """
    return build_transformer(
        src_vocab_size=vocab_src_len,
        tgt_vocab_size=vocab_tgt_len,
        src_seq_len=config['seq_len'],
        tgt_seq_len=config['seq_len'],
        d_model=config['d_model'],
    )


def get_config():
    """Return a fresh dict holding the default training configuration."""
    return dict(
        batch_size=8,
        num_epochs=10,
        lr=10**-4,
        seq_len=350,
        d_model=512,  # width of the embeddings in the transformer
        lang_src='en',
        lang_tgt='it',
        model_folder='weights',
        model_basename='transformer',
        preload=None,  # epoch suffix of a checkpoint to resume from, if any
        tokenizer_file='tokenizer_{0}.json',  # '{0}' is filled with the language code
        experiment_name='runs/tmodel',
    )


def get_weights_file_path(config, epoch: str):
    """Return the checkpoint path ``<model_folder>/<model_basename><epoch>.pt`` as a string.

    The path is relative to the current directory.
    """
    folder = config['model_folder']
    filename = f"{config['model_basename']}{epoch}.pt"
    checkpoint = Path('.') / folder / filename
    return str(checkpoint)



def train_model(config):
    """Train the Transformer described by ``config`` and checkpoint every epoch.

    Steps: pick the device, create the weights folder, build data loaders
    and tokenizers, build the model, optionally resume from the checkpoint
    named by ``config['preload']``, then train for the remaining epochs.
    After each epoch a few validation examples are printed and a checkpoint
    with the model, optimizer, epoch and global step is saved.

    Args:
        config: Dict as returned by ``get_config``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device {device}")

    # Folder that receives one checkpoint per epoch.
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

    # TensorBoard writer for the training loss.
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    initial_epoch = 0
    global_step = 0

    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f'Preloading model {model_filename}')
        # map_location lets a checkpoint saved on another device be loaded here.
        state = torch.load(model_filename, map_location=device)

        # Restore the weights too; without this the optimizer state and
        # counters would resume on a freshly initialized model.
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])
        # Continue with the epoch after the one stored in the checkpoint.
        initial_epoch = state['epoch'] + 1
        global_step = state['global_step']

    # Labels are target-language ids, so the ignored padding id must come
    # from the target tokenizer. Label smoothing is applied as regularization.
    # The projection layer already outputs log-probabilities; the log-softmax
    # inside CrossEntropyLoss leaves them unchanged.
    loss_fn = nn.CrossEntropyLoss(
        ignore_index=tokenizer_tgt.token_to_id('[PAD]'),
        label_smoothing=0.1,
    ).to(device)

    tgt_vocab_size = tokenizer_tgt.get_vocab_size()

    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            # run_validation leaves the model in eval mode, so switch back
            # to training mode at the start of every epoch.
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch:02d}')

            for batch in batch_iterator:
                encoder_input = batch['encoder_input'].to(device)
                decoder_input = batch['decoder_input'].to(device)
                encoder_mask = batch['encoder_mask'].to(device)
                decoder_mask = batch['decoder_mask'].to(device)
                label = batch['label'].to(device)

                # Forward pass: encode, decode, project onto the vocabulary.
                encoder_output = model.encode(encoder_input, encoder_mask)
                decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
                proj_output = model.project(decoder_output)

                # Flatten all positions so each one is a single prediction.
                loss = loss_fn(proj_output.view(-1, tgt_vocab_size), label.view(-1))
                batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()  # clear gradients for the next batch
                global_step += 1

            # Print a few validation examples at the end of every epoch.
            run_validation(
                model,
                val_dataloader,
                tokenizer_src,
                tokenizer_tgt,
                config['seq_len'],
                device,
                lambda msg: batch_iterator.write(msg),
                global_step,
                writer
            )

            # Save everything needed to resume after this epoch.
            model_filename = get_weights_file_path(config, f'{epoch:02d}')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)
    finally:
        # Close the TensorBoard writer even if training is interrupted.
        writer.close()
        

# Notebook entry point: silence all Python warnings, build the default
# configuration and start training with it.
warnings.filterwarnings('ignore')
config=get_config()
train_model(config)

Using device cuda


README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

Max length of source sentence: 309
Max length of target sentence: 274


Processing epoch 00: 100%|██████████| 3638/3638 [15:23<00:00,  3.94it/s, loss=5.462]


--------------------------------------------------------------------------------
SOURCE: 'And has he not yet gone to the country to sell the forest?'
TARGET: — E che, non è ancora partito per la campagna a vendere il legname?
PREDICTED: — Ma non è stata stata un ’ altro che non è stata stata stata stata stata .
--------------------------------------------------------------------------------
SOURCE: He might have been up stuffing himself with eggs and bacon, irritating the dog, or flirting with the slavey, instead of sprawling there, sunk in soul-clogging oblivion.
TARGET: Egli avrebbe potuto ingozzarsi di uova e prosciutto, stuzzicare il cane, o corteggiare la fantesca, invece di starsene lì, immerso in una mortale oblivione.
PREDICTED: Non si , e la sua , e la sua , e , e , e , e , e .


Processing epoch 01: 100%|██████████| 3638/3638 [15:23<00:00,  3.94it/s, loss=5.516]


--------------------------------------------------------------------------------
SOURCE: Delightful!
TARGET: Un incanto!
PREDICTED: È una donna !
--------------------------------------------------------------------------------
SOURCE: I pointed out to him that we were miles away from a pub.; and then he went on about the river, and what was the good of the river, and was everyone who came on the river to die of thirst?
TARGET: Gli feci osservare che eravamo delle miglia distanti da qualunque spaccio di bevande; e allora egli si mise a mormorare del fiume: e a che serviva il fiume, e dovevano tutti venir sul fiume a morir di sete?
PREDICTED: a , che mi , e , e , e , e , e , e , e , e , e , e , e , e che non si a ?


Processing epoch 02: 100%|██████████| 3638/3638 [15:23<00:00,  3.94it/s, loss=5.606]


--------------------------------------------------------------------------------
SOURCE: She did not walk but was borne toward him by some invisible force.
TARGET: Ella non aveva camminato, ma era stata portata verso di lui come da una forza invisibile.
PREDICTED: Ella non si sentiva più di nuovo , ma si era stata .
--------------------------------------------------------------------------------
SOURCE: She had a small travelling looking-glass in her bag, and felt inclined to take it out; but glancing at the backs of the coachman and the clerk who sat swaying beside him, she knew she would feel ashamed if one of them chanced to look round, and she did not take it out.
TARGET: Aveva uno specchietto da viaggio nella borsa e avrebbe voluto tirarlo fuori; ma guardando la schiena del cocchiere e quella dello scrivano che si dondolava, sentì che si sarebbe vergognata se uno di loro si fosse voltato, e non tirò fuori lo specchio.
PREDICTED: Ella era un po ’ di seta , e , come un tratto , ma ,

Processing epoch 03: 100%|██████████| 3638/3638 [15:23<00:00,  3.94it/s, loss=4.879]


--------------------------------------------------------------------------------
SOURCE: Besides, there were fewer to feed; the sick could eat little; our breakfast-basins were better filled; when there was no time to prepare a regular dinner, which often happened, she would give us a large piece of cold pie, or a thick slice of bread and cheese, and this we carried away with us to the wood, where we each chose the spot we liked best, and dined sumptuously.
TARGET: Quando non c'era tempo di cucinare, cosa che accadeva spesso, ci davano un bel pezzo di pasticcio freddo, pane e formaggio, e andavamo a desinare sull'erba.
PREDICTED: Ciò non ostante noi ci sarebbe stato più facile , e il nostro conduttore non ci sarebbe stato più facile , quando il tè non ci sarebbe stato più , che il tè ci sarebbe stato stato fatto che il tè , che ci sarebbe stato , e noi , un pezzo di pane , un , , un po ’ di pane , , un pezzo di pane , e , , , , , , , , la , e , , , , , , la , , , , , , , , , , , la , ,

Processing epoch 04: 100%|██████████| 3638/3638 [15:23<00:00,  3.94it/s, loss=4.161]


--------------------------------------------------------------------------------
SOURCE: When we told them how we placed ourselves and the horses in the middle, they blamed us exceedingly, and told us it was fifty to one but we had been all destroyed, for it was the sight of the horses which made the wolves so furious, seeing their prey, and that at other times they are really afraid of a gun; but being excessively hungry, and raging on that account, the eagerness to come at the horses had made them senseless of danger, and that if we had not by the continual fire, and at last by the stratagem of the train of powder, mastered them, it had been great odds but that we had been torn to pieces; whereas, had we been content to have sat still on horseback, and fired as horsemen, they would not have taken the horses so much for their own, when men were on their backs, as otherwise; and withal, they told us that at last, if we had stood altogether, and left our horses, they would have been so 

Processing epoch 05: 100%|██████████| 3638/3638 [15:23<00:00,  3.94it/s, loss=3.441]


--------------------------------------------------------------------------------
SOURCE: Concerning these two methods of rising to be a prince by ability or fortune, I wish to adduce two examples within our own recollection, and these are Francesco Sforza and Cesare Borgia.
TARGET: Io voglio all'uno et all'altro di questi modi detti, circa el diventare principe per virtù o per fortuna, addurre dua esempli stati ne' dí della memoria nostra: e questi sono Francesco Sforza e Cesare Borgia.
PREDICTED: due o due , , uno principe , o per essere dua , sono stati stati stati per e ' nostri cittadini sono stati di quelli che sono .
--------------------------------------------------------------------------------
SOURCE: 'They're on your table,' answered Matthew with a questioning and sympathizing glance at his master – adding after a pause with a sly smile: 'Some one has called from the jobmaster's.'
TARGET: — Sulla tavola — rispose Matvej. Guardò interrogativamente, con interesse, il padrone, e

Processing epoch 06: 100%|██████████| 3638/3638 [15:21<00:00,  3.95it/s, loss=3.584]


--------------------------------------------------------------------------------
SOURCE: No one could determine better than he the limits of freedom, simplicity, and formality, necessary for the pleasant transaction of business.
TARGET: Nessuno più di Stepan Arkad’ic sapeva con maggiore precisione il limite tra la cordialità confidenziale e il tono ufficiale, così necessario al piacevole disbrigo degli affari.
PREDICTED: Nessuno poteva più credere che egli fosse stato di loro di libertà , e per tutta la ragione di essere indipendente da ogni modo .
--------------------------------------------------------------------------------
SOURCE: 'I think so.
TARGET: — Credo che sia possibile.
PREDICTED: — Io penso .


Processing epoch 07: 100%|██████████| 3638/3638 [15:22<00:00,  3.95it/s, loss=3.638]


--------------------------------------------------------------------------------
SOURCE: 'Quiet, quiet, Krak!' he said affectionately to the dog, which was throwing its paws up against his stomach and chest and getting them entangled in his game-bag.
TARGET: “Tout beau, tout beau Krak!” egli gridava, carezzando il cane che gli poneva le zampe sul ventre e sul petto, impigliandosi con esse nel carniere.
PREDICTED: — , Krak ! — disse , avvicinandosi al cane , sentendo che il cane , i bottoni che e nel petto .
--------------------------------------------------------------------------------
SOURCE: "Except me: I am substantial enough--touch me."
TARGET: — Eccettuato me; sono di carne e d'ossa; toccatemi.
PREDICTED: — , sono contenta di essermi .


Processing epoch 08: 100%|██████████| 3638/3638 [15:22<00:00,  3.94it/s, loss=3.474]


--------------------------------------------------------------------------------
SOURCE: Nevertheless Milan was taken from France both the first and the second time.
TARGET: Non di manco, e la prima e la seconda volta, li fu tolto.
PREDICTED: Non vi fu da Francia e l ’ altro e l ’ altro .
--------------------------------------------------------------------------------
SOURCE: I shall return to Brocklehurst Hall in the course of a week or two: my good friend, the Archdeacon, will not permit me to leave him sooner. I shall send Miss Temple notice that she is to expect a new girl, so that there will be no difficulty about receiving her.
TARGET: Sono costretto a dirvi addio. Non ritornerò alla mia villa altro che fra un paio di settimane, perché il mio buon amico, l'arcidiacono, non vuole che la lasci prima; ma farò dire alla signora Temple di attendere una nuova alunna.
PREDICTED: a quella settimana o due .


Processing epoch 09: 100%|██████████| 3638/3638 [15:22<00:00,  3.94it/s, loss=3.372]


--------------------------------------------------------------------------------
SOURCE: 'I received it, but really do not understand what you are worrying about,' replied Alexis.
TARGET: — L'ho ricevuto e, davvero, non capisco di che mai tu voglia darti pensiero — disse Aleksej.
PREDICTED: — Io ho cambiato , ma non capisco perché quello che avete fatto caldo — rispose Aleksej Aleksandrovic .
--------------------------------------------------------------------------------
SOURCE: Now, take me.
TARGET: Ecco, vi dirò di me.
PREDICTED: Adesso , vieni .


# Upload the model to the Hugging Face Hub

In [9]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Log this session in to the Hugging Face Hub. The access token is fetched from
# the Kaggle secret named "HUGGINGFACE_TOKEN" so it never appears in the notebook.
user_secrets = UserSecretsClient()
login(
    token=user_secrets.get_secret("HUGGINGFACE_TOKEN"),
)

In [10]:
from huggingface_hub import HfApi

# Create the "transformer" repository on the Hub; exist_ok=True makes re-running
# this cell a no-op instead of an error when the repository already exists.
api = HfApi()
repo_url = api.create_repo(
    repo_id="transformer",
    exist_ok=True,
)

In [11]:
from huggingface_hub import upload_folder

# Push the working directory to the Hub repository created in the previous cell.
#
# The target is taken from the RepoUrl returned by create_repo() rather than a
# hard-coded "aisuko/transformer", so the upload always goes to the repository
# that was just created, whichever account the login token belongs to.
#
# NOTE: folder_path="." uploads everything in the directory, not only the
# checkpoints -- the logged run also pushed the notebook file and the
# TensorBoard event file alongside the transformerNN.pt checkpoints.
upload_folder(
    folder_path=".",
    repo_id=repo_url.repo_id,
    commit_message="Add model checkpoints",
)

__notebook__.ipynb:   0%|          | 0.00/14.9M [00:00<?, ?B/s]

transformer01.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

transformer00.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

Upload 12 LFS files:   0%|          | 0/12 [00:00<?, ?it/s]

events.out.tfevents.1734043303.a4b1979a92b3.23.0:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

transformer02.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

transformer03.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

transformer04.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

transformer05.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

transformer06.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

transformer07.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

transformer08.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

transformer09.pt:   0%|          | 0.00/904M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aisuko/transformer/commit/3886c0ded15dc355330d5dff3bd6b0f21764b83a', commit_message='Add model checkpoints', commit_description='', oid='3886c0ded15dc355330d5dff3bd6b0f21764b83a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aisuko/transformer', endpoint='https://huggingface.co', repo_type='model', repo_id='aisuko/transformer'), pr_revision=None, pr_num=None)