In [23]:
!ls

runs  sample_data  tokenizer_en.json  tokenizer_fr.json  weights


In [24]:
import torch
import torch.nn as nn
import math


class InputEmbeddings(nn.Module):
    """Learned lookup table from token ids to vectors, scaled by sqrt(d_model).

    Args:
        d_model: size of the vector produced for every token id.
        vocab_size: number of distinct token ids in the vocabulary.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        # nn.Embedding behaves like a dictionary: the same id always maps to
        # the same row, and the rows themselves are learned during training.
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embedding of each id in `x`, multiplied by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale





class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to token embeddings.

    A table of shape (1, seq_len, d_model) is precomputed once: even feature
    columns hold sin(pos / 10000^(2i/d_model)) and odd columns hold the
    matching cosine. The table is stored as a buffer, so it is saved with the
    module state but is never trained.

    Args:
        d_model: size of each embedding vector.
        seq_len: largest sequence length the table must cover.
        dropout: dropout probability applied after adding the encoding.
    """

    def __init__(self,d_model,seq_len,dropout):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pos_enc = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i/d_model), computed in log space. The base is 10000
        # (the previous code used 1000, which compressed the wavelengths).
        inv_freq = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * inv_freq  # (seq_len, ceil(d_model / 2))

        # Even feature indices take the sine ...
        pos_enc[:, 0::2] = torch.sin(angles)
        # ... odd indices take the cosine. With an odd d_model there is one
        # fewer odd column than even columns, hence the slice.
        pos_enc[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch dimension so the table broadcasts over a batch.
        pos_enc = pos_enc.unsqueeze(0)

        # A buffer is saved in the state dict but is not a trainable parameter.
        self.register_buffer('pos_enc', pos_enc)

    def forward(self,x):
        """Add the encoding for the first x.shape[1] positions, then apply dropout."""
        # The buffer never requires grad, so gradients flow only through x.
        x = x + self.pos_enc[:, :x.shape[1], :]
        return self.dropout(x)


# Layer normalization:
# given several items (tokens), each item has its own mean and std,
# and each item is normalized with its own mean and std.
# The normalization introduces two learnable params, one additive and one multiplicative,
# because plain normalization can hurt us when the fluctuation is actually necessary.
# Where it happens:
# Not across batch: Unlike BatchNorm, LayerNorm ignores other samples.
# Not across tokens: It does not normalize across the sequence (tokens).
# Only across embedding features (d_model) for each token.
class LayerNorm(nn.Module):
    """Normalises each vector along the last dimension, then rescales it.

    Computes alpha * (x - mean) / (std + eps) + bias, where mean and std are
    taken over the last dimension of x. `alpha` and `bias` are single learnable
    scalars shared by every feature.

    Args:
        eps: small constant added to the standard deviation so the division
            stays finite when a vector has (near) zero spread.
    """

    def __init__(self,eps = 1e-6):
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))  # multiplicative param
        # Additive param starts at zero so a fresh layer is a pure
        # normalisation (it was previously initialised to one).
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self,x):
        """Return x normalised over its last dimension."""
        # keepdim=True keeps a trailing size-1 axis so the statistics
        # broadcast against x in the subtraction and division below.
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        # eps belongs inside the denominator; `(x - mean) / std + eps` would
        # divide by a possibly-zero std and only then add eps.
        normalized = (x - mean) / (std + self.eps)
        return self.alpha * normalized + self.bias




class FeedForward(nn.Module):
    """Two-layer position-wise MLP: d_model -> dff -> d_model.

    Args:
        d_model: input and output feature size.
        dff: width of the hidden layer.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self,d_model,dff,dropout):
        super().__init__()
        self.linear1 = nn.Linear(d_model, dff)   # expansion: W1, b1
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dff, d_model)   # contraction: W2, b2

    def forward(self,x):
        """Apply linear1 -> ReLU -> dropout -> linear2 along the last dimension."""
        hidden = self.dropout(torch.relu(self.linear1(x)))
        return self.linear2(hidden)




class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections.

    Args:
        d_model: feature size of the inputs and of the output.
        h: number of heads; must divide d_model.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self,d_model , h , dropout):
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not div by h"

        # Each head works on a d_k-wide slice of the projected features.
        self.d_k = d_model // h

        # Query / key / value / output projections, all without bias.
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model, bias=False)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(q,k,v,mask,dropout):
        """Scaled dot-product attention.

        Returns a tuple (output, weights): `weights` is the softmax over the
        key axis (after optional dropout) and `output` is `weights @ v`.
        Positions where `mask == 0` receive a score of -1e9 before the softmax.
        """
        head_dim = q.shape[-1]

        # (..., seq_q, d_k) @ (..., d_k, seq_k) -> (..., seq_q, seq_k)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(head_dim)

        if mask is not None:
            # In-place fill: masked pairs get (numerically) zero weight.
            scores.masked_fill_(mask == 0, -1e9)

        # Softmax is taken per query row, across all keys.
        weights = torch.softmax(scores, dim=-1)

        if dropout is not None:
            weights = dropout(weights)

        return torch.matmul(weights, v), weights

    def _split_heads(self, projected):
        """(batch, seq_len, d_model) -> (batch, h, seq_len, d_k)."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self,q,k,v,mask):
        """Project q/k/v, attend per head, merge the heads and project out.

        The mask is applied to the per-head score matrix, whose last two
        dimensions are (query length, key length).
        """
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        # The weights are kept on the module so they can be inspected later.
        context, self.attention_scores = MultiHeadAttentionBlock.attention(
            query, key, value, mask, self.dropout
        )

        # Undo the head split: (batch, h, seq_len, d_k) -> (batch, seq_len, d_model).
        # transpose() yields a non-contiguous tensor, so contiguous() is
        # needed before view().
        batch = context.shape[0]
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)

        return self.w_o(merged)
        # con.




# This is the equivalent of the add&norm
class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: x + dropout(sublayer(norm(x))).

    Args:
        dropout: dropout probability applied to the sublayer output.
    """

    def __init__(self,dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNorm()

    def forward(self,x,sublayer):
        """Normalise x, run `sublayer` on it, apply dropout and add x back."""
        # The norm is applied BEFORE the sublayer here.
        # NOTE(review): the paper's formula is LayerNorm(x + Sublayer(x));
        # confirm the pre-norm ordering is intended.
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update




class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper.

    It is *self*-attention because the same input plays all three roles:
    query, key and value.

    Args:
        self_attention_block: callable taking (q, k, v, mask).
        feed_forward_block: callable taking x.
        dropout: dropout probability handed to both residual connections.
    """

    def __init__(self,self_attention_block,feed_forward_block,dropout):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.dropout = dropout
        self.residual_connections = nn.ModuleList(
            [ResidualConnection(self.dropout) for _ in range(2)]
        )

    def forward(self , x,src_mask) :
        """Run both sublayers; `src_mask` keeps padding tokens from interacting with real tokens."""
        attention_residual, feed_forward_residual = self.residual_connections

        def attend(normed):
            # Self-attention: query, key and value are all the same tensor.
            return self.self_attention_block(normed, normed, normed, src_mask)

        x = attention_residual(x, attend)
        x = feed_forward_residual(x, self.feed_forward_block)
        return x




class Encoder(nn.Module):
    """Stack of encoder layers followed by a final LayerNorm.

    Args:
        layers: the encoder blocks, applied in order; each is called as
            layer(x, mask).
    """

    def __init__(self,layers:nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNorm()

    def forward(self,x,mask):
        """Feed x through every layer with the same mask, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)




class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each of the three sublayers is wrapped in its own residual connection.

    Args:
        self_attention_block: callable taking (q, k, v, mask).
        cross_attention_block: callable taking (q, k, v, mask).
        feed_forward_block: callable taking x.
        dropout: dropout probability handed to the residual connections.
    """

    def __init__(self,self_attention_block , cross_attention_block,feed_forward_block ,dropout):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.dropout= dropout
        self.residual_connections = nn.ModuleList(
            [ResidualConnection(dropout) for _ in range(3)]
        )

    def forward(self,x , encoder_output,src_mask,tgt_mask):
        """Apply the three sublayers in order and return the updated x.

        `tgt_mask` is used when the decoder attends to its own input, so a
        position cannot look at later positions; `src_mask` is used when the
        decoder attends to the encoder output, so source padding is ignored.
        """
        self_residual, cross_residual, feed_forward_residual = self.residual_connections

        def masked_self_attention(normed):
            # Query, key and value are all the decoder stream.
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attention(normed):
            # Queries come from the decoder; keys and values come from the
            # encoder. Only src_mask is needed here: the query was already
            # built under the target mask in the previous sublayer.
            return self.cross_attention_block(
                normed, encoder_output, encoder_output, src_mask
            )

        x = self_residual(x, masked_self_attention)
        x = cross_residual(x, cross_attention)
        x = feed_forward_residual(x, self.feed_forward_block)

        # Author's notes (kept from the original):
        # - Padded and future positions still occupy space in every
        #   (batch, seq_len, d_model) tensor; the masks stop their content
        #   from being used, they do not remove the positions.
        # - tgt_mask is re-applied in every decoder self-attention so that
        #   each position is predicted only from the tokens before it, which
        #   is the situation the model faces at generation time.
        # - In the decoder, future tokens therefore act like padding; this
        #   wasted space is the price paid for training on whole batches.
        return x





class Decoder(nn.Module):
    """Stack of decoder layers followed by a final LayerNorm.

    Args:
        layers: the decoder blocks, applied in order; each is called as
            layer(x, encoder_output, src_mask, tgt_mask).
    """

    def __init__(self,layers):
        super().__init__()
        self.layers = layers
        self.norm = LayerNorm()

    def forward(self,x,encoder_output,src_mask,tgt_mask):
        """Feed x through every layer with the same encoder output and masks, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)




# Now at the end we get (batch, seq_len, d_model),
# but we are not really interested in the embeddings,
# so we want to somehow project the embeddings back to words.


# So basically the ids of the vocabulary are the same for the input and output.
# Remember that in the input embeddings we go: e.g. 'cat' -> vocab_indices (e.g. 25) -> embedding;
# in the projection layer (transformer output) we learn the
# mapping from output_embeddings -> vocab_indices (e.g. 29) -> e.g. 'eats'.
class ProjectionLayer(nn.Module):
    """Maps each d_model vector to log-probabilities over the vocabulary.

    Args:
        d_model: size of the incoming feature vectors.
        vocab_size: number of vocabulary entries to score.
    """

    def __init__(self,d_model,vocab_size):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self,x):
        """Return log_softmax(proj(x)) over the last dimension.

        Only the last dimension changes (d_model -> vocab_size); every other
        dimension is preserved, so each position gets its own distribution.
        Log-probabilities are used instead of probabilities to avoid numerical
        underflow; since log is monotonic, the largest log-probability is still
        the most likely token.
        """
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)



# CHECKPOINT


class Transformer(nn.Module):
    """Container wiring embeddings, positional encodings, encoder, decoder and projection.

    The three stages (`encode`, `decode`, `project`) are exposed separately so
    a caller can run the encoder once and reuse its output.
    """

    def __init__(self,encoder , decoder,src_embedding,tgt_embedding,src_pos,tgt_pos,projection_layer):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embedding = src_embedding
        self.tgt_embedding = tgt_embedding
        # Source and target each get their own positional-encoding module.
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self,src,src_mask):
        """Embed `src`, add positions, and run the encoder stack."""
        embedded = self.src_pos(self.src_embedding(src))
        return self.encoder(embedded, src_mask)

    def decode(self,encoder_output,src_mask,tgt,tgt_mask):
        """Embed `tgt`, add positions, and run the whole decoder stack.

        Both masks are passed because the decoder stack contains
        self-attention (tgt_mask) as well as cross-attention (src_mask).
        """
        embedded = self.tgt_pos(self.tgt_embedding(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self,x):
        """Apply the projection layer to `x`."""
        return self.projection_layer(x)





def build_transformer(src_vocab_size,tgt_vocab_size,src_seq_len,tgt_seq_len,d_model=512,N=6,h=8,dropout=0.1,d_ff=2048):
    """Assemble a full encoder-decoder Transformer and Xavier-initialise it.

    Args:
        src_vocab_size: number of source-side token ids.
        tgt_vocab_size: number of target-side token ids (also the output size).
        src_seq_len: longest source sequence the positional table must cover.
        tgt_seq_len: longest target sequence the positional table must cover.
        d_model: embedding / model width.
        N: number of encoder blocks and of decoder blocks.
        h: number of attention heads.
        dropout: dropout probability used throughout.
        d_ff: hidden width of the feed-forward sublayers.

    Returns:
        The constructed `Transformer`.
    """
    src_embedding = InputEmbeddings(d_model, src_vocab_size)
    tgt_embedding = InputEmbeddings(d_model, tgt_vocab_size)

    # PositionalEncoding's signature is (d_model, seq_len, dropout). The
    # arguments used to be passed as (seq_len, d_model, dropout), which only
    # worked when both values happened to be equal; keywords make it explicit.
    src_pos = PositionalEncoding(d_model=d_model, seq_len=src_seq_len, dropout=dropout)
    tgt_pos = PositionalEncoding(d_model=d_model, seq_len=tgt_seq_len, dropout=dropout)

    # N encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention = MultiHeadAttentionBlock(d_model, h, dropout)
        # d_ff is the size of the feed-forward upward projection.
        feed_forward = FeedForward(d_model, d_ff, dropout)
        encoder_blocks.append(EncoderBlock(encoder_self_attention, feed_forward, dropout))

    # N decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward = FeedForward(d_model, d_ff, dropout)
        decoder_blocks.append(
            DecoderBlock(decoder_self_attention, decoder_cross_attention, feed_forward, dropout)
        )

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))
    # The output distribution is over the target vocabulary.
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embedding, tgt_embedding, src_pos, tgt_pos, projection_layer)

    # Xavier-uniform init for every weight matrix (parameters with more than
    # one dimension) to make training start faster.
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return transformer






In [25]:
# The tokenizer is what comes before the embedding: its goal is to split a sentence into single words (tokens)
# and map each token to a number.
import torch
import torch.nn as nn
from torch.utils.data import Dataset , DataLoader , random_split
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer # to be researched
from tokenizers.pre_tokenizers import Whitespace

from pathlib import Path


def get_or_build_tokenizer(config,ds,lang,force_rebuild=False):
    """Load the word-level tokenizer for `lang`, training and saving it if needed.

    Args:
        config: dict whose 'tokenizer_file' entry is a format string taking
            the language code, e.g. 'tokenizer_{0}.json'.
        ds: dataset handed to `get_all_sentences` to obtain training text.
        lang: language code selecting which side of each pair to use.
        force_rebuild: when True, retrain and overwrite even if a saved
            tokenizer file already exists.

    Returns:
        The loaded or freshly trained `Tokenizer`.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse the saved tokenizer when there is one; retraining on every call
    # re-reads the whole dataset for no benefit.
    if tokenizer_path.exists() and not force_rebuild:
        return Tokenizer.from_file(str(tokenizer_path))

    tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))  # '[UNK]' stands in for unknown words
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        min_frequency=2,
    )

    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)

    # Make sure the destination directory exists before saving.
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_path))

    return tokenizer


def get_all_sentences(ds,lang):
    """Lazily yield the `lang` text of every item in ds['train'].

    Each item is expected to look like {'translation': {lang: text, ...}}.
    """
    yield from (record['translation'][lang] for record in ds['train'])






In [26]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BiLingualDataset(Dataset):
    """Turns raw translation pairs into fixed-length tensors and masks.

    Every item is padded to exactly `seq_len` tokens:
      * encoder_input = [SOS] + source ids + [EOS] + padding
      * decoder_input = [SOS] + target ids + padding
      * label         = target ids + [EOS] + padding

    Args:
        ds: indexable collection of {'translation': {lang: text}} items.
        tokenizer_src: tokenizer for the source language.
        tokenizer_tgt: tokenizer for the target language.
        src_lang: key of the source text inside 'translation'.
        tgt_lang: key of the target text inside 'translation'.
        seq_len: fixed length of every returned sequence.
    """

    def __init__(self,ds,tokenizer_src,tokenizer_tgt,src_lang,tgt_lang,seq_len):
        super().__init__()
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len

        # Special-token ids are read from the source tokenizer and reused on
        # the target side as well.
        # NOTE(review): this assumes both tokenizers assign the same ids to
        # [SOS]/[EOS]/[PAD] - confirm.
        self.sos_token = torch.tensor([tokenizer_src.token_to_id('[SOS]')],dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_src.token_to_id('[EOS]')],dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_src.token_to_id('[PAD]')],dtype=torch.int64)

    def __len__(self):
        return len(self.ds)

    def _padding(self, count):
        """Return `count` pad ids as a 1-D int64 tensor (empty when count is 0)."""
        # torch.full builds the run directly, instead of converting a Python
        # list of one-element tensors one item at a time.
        return torch.full((count,), self.pad_token.item(), dtype=torch.int64)

    def __getitem__(self,index):
        """Build the tensors for item `index`.

        Raises:
            ValueError: if either sentence does not fit in `seq_len` once the
                special tokens are added.
        """
        pair = self.ds[index]['translation']
        src_text = pair[self.src_lang]
        tgt_text = pair[self.tgt_lang]

        enc_ids = torch.tensor(self.tokenizer_src.encode(src_text).ids, dtype=torch.int64)
        dec_ids = torch.tensor(self.tokenizer_tgt.encode(tgt_text).ids, dtype=torch.int64)

        # The encoder side adds [SOS] and [EOS]; the decoder input adds only
        # [SOS] and the label adds only [EOS].
        enc_num_pad_tokens = self.seq_len - enc_ids.numel() - 2
        dec_num_pad_tokens = self.seq_len - dec_ids.numel() - 1

        if enc_num_pad_tokens < 0 or dec_num_pad_tokens < 0:
            raise ValueError('Sentence is too long')

        encoder_input = torch.cat(
            [self.sos_token, enc_ids, self.eos_token, self._padding(enc_num_pad_tokens)]
        )
        decoder_input = torch.cat(
            [self.sos_token, dec_ids, self._padding(dec_num_pad_tokens)]
        )
        # The label is the decoder input shifted left by one: no [SOS] at the
        # start, [EOS] after the last real token.
        label = torch.cat(
            [dec_ids, self.eos_token, self._padding(dec_num_pad_tokens)]
        )

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        # (1, 1, seq_len): 1 where the token is real, 0 where it is padding.
        encoder_mask = (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        # (1, 1, seq_len) & (1, seq_len, seq_len) -> (1, seq_len, seq_len):
        # a position may attend only to non-padding positions at or before it.
        decoder_mask = (
            (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
            & self.causal_mask(decoder_input.size(0))
        )

        return {
            "encoder_input":encoder_input,
            "decoder_input":decoder_input,
            "encoder_mask":encoder_mask,
            "decoder_mask":decoder_mask,
            "label":label,
            "src_text":src_text,
            "tgt_text":tgt_text
            }

    def causal_mask(self,seq_len):
        """Return a (1, seq_len, seq_len) bool mask, True on and below the diagonal."""
        return torch.tril(torch.ones(1, seq_len, seq_len)).bool()







In [27]:

def get_ds(config):
    """Load the opus_books pair, build tokenizers, and return data loaders.

    Reads 'lang_src', 'lang_tgt', 'seq_len' and 'batch_size' from `config`.
    The 'train' split is divided 90% / 10% into training and validation sets.

    Returns:
        (train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)
    """
    lang_src = config['lang_src']
    lang_tgt = config['lang_tgt']

    ds_raw = load_dataset('Helsinki-NLP/opus_books', f'{lang_src}-{lang_tgt}')
    all_pairs = ds_raw['train']

    tokenizer_src = get_or_build_tokenizer(config, ds_raw, lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, lang_tgt)

    # 90% for training; validation takes the remainder so the sizes always
    # add up to the full dataset.
    total = len(all_pairs)
    train_ds_size = int(0.9 * total)
    val_ds_size = total - train_ds_size
    train_ds_raw, val_ds_raw = random_split(all_pairs, [train_ds_size, val_ds_size])

    # Wrap the raw splits so that items come back as padded tensors.
    seq_len = config['seq_len']
    train_ds = BiLingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, seq_len)
    val_ds = BiLingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, seq_len)

    print(f"Training dataset size: {len(train_ds)}")
    print(f"Validation dataset size: {len(val_ds)}")

    # Report the longest tokenised sentence on each side (informational only).
    max_len_src = 0
    max_len_tgt = 0
    for record in all_pairs:
        pair = record['translation']
        max_len_src = max(max_len_src, len(tokenizer_src.encode(pair[lang_src]).ids))
        max_len_tgt = max(max_len_tgt, len(tokenizer_tgt.encode(pair[lang_tgt]).ids))

    print(f'max_len_src : {max_len_src}')
    print(f'max_len_tgt : {max_len_tgt}')

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    # Validation yields one sentence at a time.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt




def get_model(config,vocab_src_len,vocab_tgt_len):
    """Build the transformer from `config`.

    The same config['seq_len'] is used for both the source and the target
    side, and config['d_model'] sets the model width.
    """
    seq_len = config['seq_len']
    return build_transformer(
        vocab_src_len,
        vocab_tgt_len,
        seq_len,
        seq_len,
        d_model=config['d_model'],
    )











In [28]:
from pathlib import Path
from torch.utils.tensorboard import SummaryWriter

def get_config():
    """Return the training configuration as a plain dict."""
    return {
        "batch_size":8,
        "num_epochs":23,
        "lr":1e-4,
        "seq_len":512,
        "d_model":512,
        "lang_src":"en",
        "lang_tgt":"fr",
        "model_folder":"weights",
        # Prefix of every checkpoint file; read by get_weights_file_path.
        "model_basename":"smodel",
        # Legacy misspelled key, kept so existing readers of it keep working.
        "model_filenme":"smodel",
        "preload":None,
        "tokenizer_file":"tokenizer_{0}.json",
        "experiment_name":"runs/smodel",
    }



def get_weights_file_path(config,epoch):
    """Return the checkpoint path '<model_folder>/<basename><epoch>.pt' as a string.

    The basename is read from config['model_basename']; the two misspelled
    keys used by earlier versions of this notebook are accepted as fallbacks.
    Previously the config wrote 'model_filenme' while this function read
    'model_basenme', so every checkpoint save raised KeyError.

    Raises:
        KeyError: if 'model_folder' or every basename key is missing.
    """
    model_folder = config['model_folder']
    for key in ('model_basename', 'model_basenme', 'model_filenme'):
        if key in config:
            model_basename = config[key]
            break
    else:
        raise KeyError('model_basename')
    model_filename = f"{model_basename}{epoch}.pt"
    return str(Path('.')/model_folder/model_filename)











In [29]:
from tqdm import tqdm

def train_model(config):
    """Train the translation model and save a checkpoint after every epoch.

    Reads 'model_folder', 'experiment_name', 'lr', 'preload' and 'num_epochs'
    from `config` (plus whatever `get_ds` / `get_model` read). When
    config['preload'] is set, model weights, optimizer state, epoch and global
    step are restored from that checkpoint before training continues.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    print(f'Device : {device}')

    Path(config['model_folder']).mkdir(parents=True,exist_ok=True)
    train_dataloader , val_dataloader , tokenizer_src, tokenizer_tgt = get_ds(config)

    model = get_model(config,tokenizer_src.get_vocab_size(),tokenizer_tgt.get_vocab_size()).to(device)

    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(),lr=config['lr'],eps=1e-9)

    initial_epoch = 0
    global_step = 0

    if config['preload']:
        model_filename = get_weights_file_path(config,config['preload'])
        print(f'PreLOADING')
        # map_location lets a checkpoint saved on GPU be resumed on CPU.
        state = torch.load(model_filename, map_location=device)
        # Restore the weights too; previously only the optimizer state was
        # loaded, so a "resumed" run restarted from random weights.
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch']+1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']

    # Padding positions in the label are ignored by the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'),label_smoothing=0.1).to(device)

    try:
        for epoch in range(initial_epoch, config['num_epochs']):
            model.train()
            batch_iterator = tqdm(train_dataloader,desc=f'Epoch:{epoch}')

            for batch in batch_iterator :
                encoder_input = batch['encoder_input'].to(device)
                decoder_input = batch['decoder_input'].to(device)
                encoder_mask = batch['encoder_mask'].to(device) # (B,1,1,seq_len)
                decoder_mask = batch['decoder_mask'].to(device) # (B,1,seq_len,seq_len)

                encoder_output = model.encode(encoder_input,encoder_mask)
                decoder_output = model.decode(encoder_output , encoder_mask ,decoder_input,decoder_mask)

                projection_output = model.project(decoder_output)

                label = batch['label'].to(device) # (B,seq_len)
                # (B,seq_len,tgt_v_size) --> (B * seq_len , tgt_v_size)
                loss = loss_fn(projection_output.view(-1,tokenizer_tgt.get_vocab_size()),label.view(-1))

                batch_iterator.set_postfix({f"loss ":f"{loss.item():6.3f}"})
                writer.add_scalar('train loss',loss.item(),global_step)
                writer.flush()

                loss.backward()

                optimizer.step()
                optimizer.zero_grad()

                global_step +=1

            # One checkpoint per epoch, holding everything needed to resume.
            model_filename = get_weights_file_path(config,f'{epoch}')

            torch.save({
                'epoch':epoch,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict':optimizer.state_dict(),
                'global_step':global_step
                },model_filename)
    finally:
        # Close the TensorBoard writer even if training is interrupted.
        writer.close()




In [30]:
!pip install datasets tokenizers transformers



In [31]:
# !huggingface-cli login


In [32]:
# !pip install -U datasets fsspec huggingface_hub


In [None]:
# Entry-point cell: build the configuration and launch training.
import warnings


# Suppresses every Python warning from here on.
warnings.filterwarnings('ignore')
config = get_config()
train_model(config)


Device : cuda
Training dataset size: 114376
Validation dataset size: 12709
max_len_src : 471
max_len_tgt : 482


Epoch:0:   1%|▏         | 187/14297 [02:02<2:32:52,  1.54it/s, loss =6.997]

In [None]:
# Ask PyTorch to release the GPU memory it is caching but not currently using.
torch.cuda.empty_cache()

In [None]:
# from datasets import load_dataset

# ds = load_dataset("Helsinki-NLP/opus_books", "ca-de")

In [None]:
# from datasets import get_dataset_config_names

# print(get_dataset_config_names("Helsinki-NLP/opus_books"))

In [None]:
# from huggingface_hub import HfApi

# api = HfApi()
# dataset_info = api.dataset_info("Helsinki-NLP/opus_books")
# print(dataset_info)
# # configs = [config.id for config in dataset_info.siblings if config.id.endswith(".json")]

# # print("Available configs (language pairs):")
# # for cfg in configs:
# #     print(cfg.replace(".json", ""))