In [None]:
#Initial imports

In [5]:
#import libraries

# !pip install datasets
# !pip install tokenizers

In [6]:
# import sys
# sys.path.append(r"/home/vignesh/anaconda3/lib/python3.9/site-packages/datasets")
# import datasets

In [12]:
import os
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm

In [54]:
# Assign device value as "cuda" to train on GPU if GPU is available. Otherwise it will fall back to default as "cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
print(device)

cuda


In [None]:
#STEP 1: Load the dataset

In [20]:
# Loading train, validation, test dataset from huggingface path below.
raw_train_dataset = load_dataset("Helsinki-NLP/opus-100", "en-ta", split='train')
raw_validation_dataset = load_dataset("Helsinki-NLP/opus-100", "en-ta", split='validation')
raw_test_dataset = load_dataset("Helsinki-NLP/opus-100", "en-ta", split='test')

In [13]:
# Directory to store dataset files.
os.mkdir("./dataset-en")
os.mkdir("./dataset-ta")

In [14]:
# Directory to save model during model training after each EPOCHS (in step 10).
os.mkdir("./tamilgpt")

In [15]:
# Director to store source and target tokenizer.
os.mkdir("./tokenizer_en")
os.mkdir("./tokenizer_ta")

In [16]:
dataset_en = [] 
dataset_ta = []
file_count = 1 

In [20]:
# In order to train the tokenizer (in step 2), we'll separate the training dataset into english and tamil. 
# Create multiple small file of size 20k data each and store into dataset-en and dataset-ta directory.
for data in tqdm(raw_train_dataset["translation"]):
 dataset_en.append(data["en"].replace('\n', " "))
 dataset_ta.append(data["ta"].replace('\n', " "))
 if len(dataset_en) == 20000:
     with open(f'./dataset-en/file{file_count}.txt', 'w', encoding='utf-8') as fp:
         fp.write('\n'.join(dataset_en))
         dataset_en = []
    
     with open(f'./dataset-ta/file{file_count}.txt', 'w', encoding='utf-8') as fp:
         fp.write('\n'.join(dataset_ta))
         dataset_ta = []
         file_count += 1

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 227014/227014 [00:00<00:00, 769675.64it/s]


In [21]:
#STEP 2: Create Tokenizer

In [1]:
# import tokenzier library classes and modules.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [4]:
from pathlib import Path
# path to the training dataset files which will be used to train tokenizer.
path_en = [str(file) for file in Path('./dataset-en').glob("**/*.txt")]
path_ta = [str(file) for file in Path('./dataset-ta').glob("**/*.txt")]

In [5]:
# [ Creating Source Language Tokenizer - English ].
# Additional special tokens are created such as [UNK] - to represent Unknown words, [PAD] - Padding token to maintain same sequence length across the model.
# [CLS] - token to denote start of sentence, [SEP] - token to denote end of sentence.
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
trainer_en = BpeTrainer(min_frequency=2, special_tokens=["[PAD]","[UNK]","[CLS]", "[SEP]", "[MASK]"])

In [6]:
# splitting tokens based on whitespace.
tokenizer_en.pre_tokenizer = Whitespace()

# Tokenizer trains the dataset files created in step 1
tokenizer_en.train(files=path_en, trainer=trainer_en)

# Save tokenizer for future use.
tokenizer_en.save("./tokenizer_en/tokenizer_en.json")






In [7]:
# [ Creating Target Language Tokenizer - Tamil ].
tokenizer_ta = Tokenizer(BPE(unk_token="[UNK]"))
trainer_ta = BpeTrainer(min_frequency=2, special_tokens=["[PAD]","[UNK]","[CLS]", "[SEP]", "[MASK]"])


In [8]:
tokenizer_ta.pre_tokenizer = Whitespace()
tokenizer_ta.train(files=path_ta, trainer=trainer_ta)
tokenizer_ta.save("./tokenizer_ta/tokenizer_ta.json")

tokenizer_en = Tokenizer.from_file("./tokenizer_en/tokenizer_en.json")
tokenizer_ta = Tokenizer.from_file("./tokenizer_ta/tokenizer_ta.json")






In [9]:
# Getting size of both tokenizer.
source_vocab_size = tokenizer_en.get_vocab_size()
target_vocab_size = tokenizer_ta.get_vocab_size()

In [10]:
print(source_vocab_size)
print(target_vocab_size)

30000
30000


In [15]:
# Define token-ids variables, we need this for training model.
CLS_ID = torch.tensor([tokenizer_ta.token_to_id("[CLS]")], dtype=torch.int64).to(device)
SEP_ID = torch.tensor([tokenizer_ta.token_to_id("[SEP]")], dtype=torch.int64).to(device)
PAD_ID = torch.tensor([tokenizer_ta.token_to_id("[PAD]")], dtype=torch.int64).to(device)

In [16]:
#Step 3: Prepare Dataset and DataLoader

In [24]:
# This class takes raw dataset and max_seq_len (maximum length of a sequence in the entire dataset).
class EncodeDataset(Dataset):
 def __init__(self, raw_dataset, max_seq_len):
     super().__init__()
     self.raw_dataset = raw_dataset
     self.max_seq_len = max_seq_len
 
 def __len__(self):
     return len(self.raw_dataset)

 def __getitem__(self, index):
 
     # Fetching raw text for the given index that consists of source and target pair.
     raw_text = self.raw_dataset[index]
     
     # Separating text to source and target text and will be later used for encoding.
     source_text = raw_text["en"]
     target_text = raw_text["ta"]
    
     # Encoding source text with source tokenizer(tokenizer_en) and target text with target tokenizer(tokenizer_ta).
     source_text_encoded = torch.tensor(tokenizer_en.encode(source_text).ids, dtype = torch.int64).to(device) 
     target_text_encoded = torch.tensor(tokenizer_ta.encode(target_text).ids, dtype = torch.int64).to(device)
    
     # To train the model, the sequence lenth of each input sequence should be equal max seq length. 
     # Hence additional number of padding will be added to the input sequence if the length is less than the max_seq_len.
     num_source_padding = self.max_seq_len - len(source_text_encoded) - 2 
     num_target_padding = self.max_seq_len - len(target_text_encoded) - 1 
    
     encoder_padding = torch.tensor([PAD_ID] * num_source_padding, dtype = torch.int64).to(device)
     decoder_padding = torch.tensor([PAD_ID] * num_target_padding, dtype = torch.int64).to(device)
     
     # encoder_input has the first token as start of sentence - CLS_ID, followed by source encoding which is then followed by the end of sentence token - SEP.
     # To reach the required max_seq_len, addition PAD token will be added at the end. 
     encoder_input = torch.cat([CLS_ID, source_text_encoded, SEP_ID, encoder_padding]).to(device) 
    
     # decoder_input has the first token as start of sentence - CLS_ID, followed by target encoding.
     # To reach the required max_seq_len, addition PAD token will be added at the end. There is no end of sentence token - SEP in decoder_input.
     decoder_input = torch.cat([CLS_ID, target_text_encoded, decoder_padding ]).to(device) 
     
     # target_label has the first token as target encoding followed by end of sentence token - SEP. There is no start of sentence token - CLS in target label.
     # To reach the required max_seq_len, addition PAD token will be added at the end. 
     target_label = torch.cat([target_text_encoded,SEP_ID,decoder_padding]).to(device) 
     
     # As we've added extra padding token with input encoding, during training, we don't want this token to be trained by model as there is nothing to learn in this token.
     # So, we'll use encoder mask to nullify the padding token value prior to calculating output of self attention in encoder block.
     encoder_mask = (encoder_input != PAD_ID).unsqueeze(0).unsqueeze(0).int().to(device) 
     
     # We also don't want any token to get influenced by the future token during the decoding stage. Hence, Causal mask is being implemented during masked multihead attention to handle this. 
     decoder_mask = (decoder_input != PAD_ID).unsqueeze(0).unsqueeze(0).int() & causal_mask(decoder_input.size(0)).to(device) 

     return {
     'encoder_input': encoder_input,
     'decoder_input': decoder_input,
     'target_label': target_label,
     'encoder_mask': encoder_mask,
     'decoder_mask': decoder_mask,
     'source_text': source_text,
     'target_text': target_text
     }

# Causal mask will make sure any token that comes after the current token will be masked, meaning the value will be replaced by -ve infinity which will be converted to zero or close to zero after softmax function. 
# Hence the model will just ignore these value or willn't be able to learn anything from these values.
def causal_mask(size):
 # dimension of causal mask (batch_size, seq_len, seq_len)
    mask = torch.triu(torch.ones(1, size, size), diagonal = 1).type(torch.int)
    return mask == 0
# To calculate the max sequence lenth in the entire training dataset for the source and target dataset.
max_seq_len_source = 0
max_seq_len_target = 0

for data in raw_train_dataset["translation"]:
 enc_ids = tokenizer_en.encode(data["en"]).ids
 dec_ids = tokenizer_ta.encode(data["ta"]).ids
 max_seq_len_source = max(max_seq_len_source, len(enc_ids))
 max_seq_len_target = max(max_seq_len_target, len(dec_ids))

print(f'max_seqlen_source: {max_seq_len_source}') 
print(f'max_seqlen_target: {max_seq_len_target}')

# To simplify the training process, we'll just take single max_seq_len and add 20 to cover the additional length of tokens such as PAD, CLS, SEP in the sequence.
max_seq_len = 400

# Instantiate the EncodeRawDataset class and create the encoded train and validation-dataset.
train_dataset = EncodeDataset(raw_train_dataset["translation"], max_seq_len)
val_dataset = EncodeDataset(raw_validation_dataset["translation"], max_seq_len)

# Creating DataLoader wrapper for both training and validation dataset. This dataloader will be used later stage during training and validation of our LLM model.
train_dataloader = DataLoader(train_dataset, batch_size = 10, shuffle = True, generator=torch.Generator(device='cuda'))
val_dataloader = DataLoader(val_dataset, batch_size = 1, shuffle = True, generator=torch.Generator(device='cuda'))

max_seqlen_source: 665
max_seqlen_target: 978


In [25]:
#STEP 4: Input embedding and positional encoding

In [27]:
# Input embedding and positional encoding
class EmbeddingLayer(nn.Module):
 def __init__(self, vocab_size: int, d_model: int):
     super().__init__()
     self.d_model = d_model
     
     # Using pytorch embedding layer module to map token id to vocabulary and then convert into embeeding vector. 
     # The vocab_size is the vocabulary size of the training dataset created by tokenizer during training of corpus dataset in step 2.
     self.embedding = nn.Embedding(vocab_size, d_model)
     
 def forward(self, input):
     # In addition of feeding input sequence to the embedding layer, the extra multiplication by square root of d_model is done to normalize the embedding layer output
     embedding_output = self.embedding(input) * math.sqrt(self.d_model)
     return embedding_output


class PositionalEncoding(nn.Module):
 def __init__(self, max_seq_len: int, d_model: int, dropout_rate: float):
     super().__init__()
     self.dropout = nn.Dropout(dropout_rate)
     
     # We're creating a matrix of the same shape as embedding vector.
     pe = torch.zeros(max_seq_len, d_model)
     
     # Calculate the position part of PE functions.
     pos = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
    
     # Calculate the division part of PE functions. Take note that the div part expression is slightly different that papers expression as this exponential functions seems to works better.
     div_term = torch.exp(torch.arange(0, d_model, 2).float()) * (-math.log(10000)/d_model)
     
     # Fill in the odd and even matrix value with the sin and cosine mathematical function results.
     pe[:, 0::2] = torch.sin(pos * div_term)
     pe[:, 1::2] = torch.cos(pos * div_term)
     
     # Since we're expecting the input sequences in batches so the extra batch_size dimension is added in 0 postion.
     pe = pe.unsqueeze(0) 
 
 def forward(self, input_embdding):
     # Add positional encoding together with the input embedding vector.
     input_embdding = input_embdding + (self.pe[:, :input_embdding.shape[1], :]).requires_grad_(False) 
     
     # Perform dropout to prevent overfitting.
     return self.dropout(input_embdding)

In [28]:
#Step 5: Multi-Head Attention Block

In [30]:
class MultiHeadAttention(nn.Module):
 def __init__(self, d_model: int, num_heads: int, dropout_rate: float):
     super().__init__()
     # Define dropout to prevent overfitting.
     self.dropout = nn.Dropout(dropout_rate)
     
     # Weight matrix are introduced and are all learnable parameters.
     self.W_q = nn.Linear(d_model, d_model)
     self.W_k = nn.Linear(d_model, d_model)
     self.W_v = nn.Linear(d_model, d_model)
     self.W_o = nn.Linear(d_model, d_model)
    
     self.num_heads = num_heads
     assert d_model % num_heads == 0, "d_model must be divisible by number of heads"
     
     # d_k is the new dimension of each splitted self attention heads
     self.d_k = d_model // num_heads

 def forward(self, q, k, v, encoder_mask=None):
 
     # We'll be training our model with multiple batches of sequence at once in parallel, hence we'll need to include batch_size in the shape as well.
     # query, key and value are calculated by matrix multiplication of corresponding weights with the input embeddings. 
     # Change of shape: q(batch_size, seq_len, d_model) @ W_q(d_model, d_model) => query(batch_size, seq_len, d_model) [same goes to key and value]. 
     query = self.W_q(q) 
     key = self.W_k(k)
     value = self.W_v(v)
    
     # Splitting query, key and value into number of heads. d_model is splitted in d_k across 8 heads.
     # Change of shape: query(batch_size, seq_len, d_model) => query(batch_size, seq_len, num_heads, d_k) -> query(batch_size,num_heads, seq_len,d_k) [same goes to key and value].
     query = query.view(query.shape[0], query.shape[1], self.num_heads ,self.d_k).transpose(1,2)
     key = key.view(key.shape[0], key.shape[1], self.num_heads ,self.d_k).transpose(1,2)
     value = value.view(value.shape[0], value.shape[1], self.num_heads ,self.d_k).transpose(1,2)
    
     # :: SELF ATTENTION BLOCK STARTS ::
    
     # Attention score is calculated to find the similarity or relation between query with key of itself and all other embedding in the sequence.
     # Change of shape: query(batch_size,num_heads, seq_len,d_k) @ key(batch_size,num_heads, seq_len,d_k) => attention_score(batch_size,num_heads, seq_len,seq_len).
     attention_score = (query @ key.transpose(-2,-1))/math.sqrt(self.d_k)
    
     # If mask is provided, the attention score needs to modify as per the mask value. Refer to the details in point no 4.
     if encoder_mask is not None:
         attention_score = attention_score.masked_fill(encoder_mask==0, -1e9)
         
         # Softmax function calculates the probability distribution among all the attention scores. It assign higher probabiliy value to higher attention score. Meaning more similar tokens get higher probability value.
         # Change of shape: same as attention_score
         attention_weight = torch.softmax(attention_score, dim=-1)
    
     if self.dropout is not None:
         attention_weight = self.dropout(attention_weight)
        
         # Final step in Self attention block is, matrix multiplication of attention_weight with Value embedding vector.
         # Change of shape: attention_score(batch_size,num_heads, seq_len,seq_len) @ value(batch_size,num_heads, seq_len,d_k) => attention_output(batch_size,num_heads, seq_len,d_k)
         attention_output = attention_score @ value
         
         # :: SELF ATTENTION BLOCK ENDS ::
        
         # Now, all the heads will be combined back to a single head
         # Change of shape:attention_output(batch_size,num_heads, seq_len,d_k) => attention_output(batch_size,seq_len,num_heads,d_k) => attention_output(batch_size,seq_len,d_model) 
         attention_output = attention_output.transpose(1,2).contiguous().view(attention_output.shape[0], -1, self.num_heads * self.d_k)
        
         # Finally attention_output is matrix multiplied with output weight matrix to give the final Multi-Head attention output. 
         # The shape of the multihead_output is same as the embedding input
         # Change of shape: attention_output(batch_size,seq_len,d_model) @ W_o(d_model, d_model) => multihead_output(batch_size, seq_len, d_model)
         multihead_output = self.W_o(attention_output)
         
         return multihead_output

In [31]:
#Step 6: Feedforward Network, Layer Normalization and AddAndNorm

In [32]:
# Feedfoward Network, Layer Normalization and AddAndNorm Block
class FeedForward(nn.Module):
 def __init__(self, d_model: int, d_ff: int, dropout_rate: float):
     super().__init__()
    
     self.layer_1 = nn.Linear(d_model, d_ff)
     self.activation_1 = nn.ReLU()
     self.dropout = nn.Dropout(dropout_rate)
     self.layer_2 = nn.Linear(d_ff, d_model)
     
 def forward(self, input):
     return self.layer_2(self.dropout(self.activation_1(self.layer_1(input))))

class LayerNorm(nn.Module):
     def __init__(self, eps: float = 1e-5):
         super().__init__()
         #Epsilon is a very small value and it plays an important role to prevent potentially division by zero problem.
         self.eps = eps
        
         #Extra learning parameters gamma and beta are introduced to scale and shift the embedding value as the network needed.
         self.gamma = nn.Parameter(torch.ones(1))
         self.beta = nn.Parameter(torch.zeros(1))
         
     def forward(self, input):
         mean = input.mean(dim=-1, keepdim=True) 
         std = input.std(dim=-1, keepdim=True) 
        
         return self.gamma * ((input - mean)/(std + self.eps)) + self.beta
         
 
class AddAndNorm(nn.Module):
     def __init__(self, dropout_rate: float):
         super().__init__()
         self.dropout = nn.Dropout(dropout_rate)
         self.layer_norm = LayerNorm()
    
     def forward(self, input, sub_layer):
         return input + self.dropout(sub_layer(self.layer_norm(input)))

In [33]:
#Step 7: Encoder block and Encoder

In [35]:
class EncoderBlock(nn.Module):
 def __init__(self, multihead_attention: MultiHeadAttention, feed_forward: FeedForward, dropout_rate: float):
     super().__init__()
     self.multihead_attention = multihead_attention
     self.feed_forward = feed_forward
     self.add_and_norm_list = nn.ModuleList([AddAndNorm(dropout_rate) for _ in range(2)])

 def forward(self, encoder_input, encoder_mask):
     # First AddAndNorm unit taking encoder input from skip connection and adding it with the output of MultiHead attention block.
     encoder_input = self.add_and_norm_list[0](encoder_input, lambda encoder_input: self.multihead_attention(encoder_input, encoder_input, encoder_input, encoder_mask))
     
     # Second AddAndNorm unit taking output of MultiHead attention block from skip connection and adding it with the output of Feedforward layer.
     encoder_input = self.add_and_norm_list[1](encoder_input, self.feed_forward)
    
     return encoder_input

class Encoder(nn.Module):
 def __init__(self, encoderblocklist: nn.ModuleList):
     super().__init__()
    
     # Encoder class is initialized by taking encoderblock list.
     self.encoderblocklist = encoderblocklist
     self.layer_norm = LayerNorm()

 def forward(self, encoder_input, encoder_mask):
     # Looping through all the encoder block - 6 times.
     for encoderblock in self.encoderblocklist:
         encoder_input = encoderblock(encoder_input, encoder_mask)
        
         # Normalize the final encoder block output and return. This encoder output will be used later on as key and value for the cross attention in decoder block.
         encoder_output = self.layer_norm(encoder_input)
         return encoder_output

In [36]:
#Step 8: Decoder block, Decoder and Projection Layer

In [37]:
class DecoderBlock(nn.Module):
 def __init__(self, masked_multihead_attention: MultiHeadAttention,multihead_attention: MultiHeadAttention, feed_forward: FeedForward, dropout_rate: float):
     super().__init__()
     self.masked_multihead_attention = masked_multihead_attention
     self.multihead_attention = multihead_attention
     self.feed_forward = feed_forward
     self.add_and_norm_list = nn.ModuleList([AddAndNorm(dropout_rate) for _ in range(3)])

 def forward(self, decoder_input, decoder_mask, encoder_output, encoder_mask):
     # First AddAndNorm unit taking decoder input from skip connection and adding it with the output of Masked Multi-Head attention block.
     decoder_input = self.add_and_norm_list[0](decoder_input, lambda decoder_input: self.masked_multihead_attention(decoder_input,decoder_input, decoder_input, decoder_mask))
     # Second AddAndNorm unit taking output of Masked Multi-Head attention block from skip connection and adding it with the output of MultiHead attention block.
     decoder_input = self.add_and_norm_list[1](decoder_input, lambda decoder_input: self.multihead_attention(decoder_input,encoder_output, encoder_output, encoder_mask)) # cross attention
     # Third AddAndNorm unit taking output of MultiHead attention block from skip connection and adding it with the output of Feedforward layer.
     decoder_input = self.add_and_norm_list[2](decoder_input, self.feed_forward)
     return decoder_input

class Decoder(nn.Module):
 def __init__(self,decoderblocklist: nn.ModuleList):
     super().__init__()
     self.decoderblocklist = decoderblocklist
     self.layer_norm = LayerNorm()

 def forward(self, decoder_input, decoder_mask, encoder_output, encoder_mask):
     for decoderblock in self.decoderblocklist:
         decoder_input = decoderblock(decoder_input, decoder_mask, encoder_output, encoder_mask)
        
         decoder_output = self.layer_norm(decoder_input)
         return decoder_output

class ProjectionLayer(nn.Module):
 def __init__(self, vocab_size: int, d_model: int):
     super().__init__()
     self.projection_layer = nn.Linear(d_model, vocab_size)

 def forward(self, decoder_output):
     # Projection layer first take in decoder output and passed into the linear layer of shape (d_model, vocab_size) 
     # Change in shape: decoder_output(batch_size, seq_len, d_model) @ linear_layer(d_model, vocab_size) => output(batch_size, seq_len, vocab_size)
     output = self.projection_layer(decoder_output)
     
     # softmax function to output the probability distribution over the vocabulary
     return torch.log_softmax(output, dim=-1)

In [38]:
#Step 9: Create and build a Transformer

In [48]:
class Transformer(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, source_embed: EmbeddingLayer, target_embed: EmbeddingLayer, source_pos: PositionalEncoding, target_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()

        self.source_embed = source_embed
        self.source_pos = source_pos
        self.encoder = encoder

        self.target_embed = target_embed
        self.target_pos = target_pos
        self.decoder = decoder

        self.projection_layer = projection_layer

    def encode(self, encoder_input, encoder_mask):
        encoder_input = self.source_embed(encoder_input)
        encoder_input = self.source_pos(encoder_input)
        encoder_output = self.encoder(encoder_input, encoder_mask)
        return encoder_output

    def decode(self, encoder_output, encoder_mask, decoder_input, decoder_mask):
        decoder_input = self.target_embed(decoder_input)
        decoder_input = self.target_pos(decoder_input)
        decoder_output = self.decoder(decoder_input, encoder_output, encoder_mask, decoder_mask)
        return decoder_output

    def project(self, decoder_output):
        return self.projection_layer(decoder_output)

def build_model(source_vocab_size: int, target_vocab_size: int, source_seq_len: int, target_seq_len: int, d_model: int=512, num_blocks: int=6, num_heads: int=8, dropout_rate: float=0.1, d_ff: int=2048) -> Transformer:
    # Create the embedding layers
    source_embed = EmbeddingLayer(d_model, source_vocab_size)
    target_embed = EmbeddingLayer(d_model, target_vocab_size)

    # Create the positional encoding layers
    source_pos = PositionalEncoding(d_model, source_seq_len, dropout_rate)
    target_pos = PositionalEncoding(d_model, target_seq_len, dropout_rate)

    # Create the encoder-block-list
    encoderblocklist = []
    for _ in range(num_blocks):
        multihead_attention = MultiHeadAttention(d_model, num_heads, dropout_rate)
        feed_forward = FeedForward(d_model, d_ff, dropout_rate)
        encoder_block = EncoderBlock(multihead_attention, feed_forward, dropout_rate)
        encoderblocklist.append(encoder_block)
    # Create the encoder
    encoder = Encoder(nn.ModuleList(encoderblocklist))

    # Create the decoder-block-list
    decoderblocklist = []
    for _ in range(num_blocks):
        masked_multihead_attention = MultiHeadAttention(d_model,num_heads, dropout_rate)
        cross_multihead_attention = MultiHeadAttention(d_model, num_heads, dropout_rate)
        feed_forward = FeedForward(d_model, d_ff, dropout_rate)
        decoder_block = DecoderBlock(masked_multihead_attention, cross_multihead_attention, feed_forward, dropout_rate)
        decoderblocklist.append(decoder_block)
    # Create the decoder
    decoder = Decoder(nn.ModuleList(decoderblocklist))

    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, target_vocab_size)

    # Now that we've initialized all the required blocks of transformer, we can now inititiate a model
    model = Transformer(encoder, decoder, source_embed, target_embed, source_pos, target_pos, projection_layer)

    # For the first time, we'll initialize the model parameters using xavier uniform method. Once training begings the parameters will be updated by the network
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model

In [50]:

# Let's build the the final model.
model = build_model(tokenizer_en.get_vocab_size(), tokenizer_ta.get_vocab_size(),max_seq_len, max_seq_len, d_model=512).to(device)

# Let's look at the architecture that we've just build ourself
print(model)

Transformer(
  (source_embed): EmbeddingLayer(
    (embedding): Embedding(512, 30000)
  )
  (source_pos): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (encoderblocklist): ModuleList(
      (0-5): 6 x EncoderBlock(
        (multihead_attention): MultiHeadAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (W_q): Linear(in_features=512, out_features=512, bias=True)
          (W_k): Linear(in_features=512, out_features=512, bias=True)
          (W_v): Linear(in_features=512, out_features=512, bias=True)
          (W_o): Linear(in_features=512, out_features=512, bias=True)
        )
        (feed_forward): FeedForward(
          (layer_1): Linear(in_features=512, out_features=2048, bias=True)
          (activation_1): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (add_and_norm_list): ModuleList(
          (0-1): 2 x A

In [51]:
#Step 10: Training and validation of our build LLM model

In [58]:
def run_validation(model, validation_ds, tokenizer_en, tokenizer_my, max_seq_len, device, print_msg, global_step):
    model.eval()
    count = 0

    with torch.no_grad():
        for batch in validation_ds:
            count += 1
            encoder_input = batch["encoder_input"].to(device)
            encoder_mask = batch["encoder_mask"].to(device)

            cls_id = tokenizer_ta.token_to_id('[CLS]')
            sep_id = tokenizer_ta.token_to_id('[SEP]')

            # Computing the output of the encoder for the source sequence
            encoder_output = model.encode(encoder_input, encoder_mask)
            # for prediction task, the first token that goes in decoder input is the [CLS] token
            decoder_input = torch.empty(1, 1, device=device).fill_(cls_id).type_as(encoder_input).to(device)
            # since we need to keep adding the output back to the input until the [SEP] - end token is received.
            while True:
                # check if the max length is received
                if decoder_input.size(1) == max_seq_len:
                    break

                # recreate mask each time the new output is added the decoder input for next token prediction
                decoder_mask = causal_mask(decoder_input.size(1)).type_as(encoder_mask).to(device)

                # apply projection only to the next token
                out = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)

                # apply projection only to the next token
                prob = model.project(out[:, -1])

                # select the token with highest probablity which is a greedy search implementation
                _, next_word = torch.max(prob, dim=1)
                decoder_input = torch.cat(
                    [decoder_input, torch.empty(1, 1, device=device).type_as(encoder_input).fill_(next_word.item()).to(device)], dim=1
                )
                # check if the new token is the end of token
                if next_word == sep_id:
                    break
            # final output is the concatinated decoder input till the end token is reached
            model_out = decoder_input.squeeze(0)

            source_text = batch["source_text"][0]
            target_text = batch["target_text"][0]
            model_out_text = tokenizer_ta.decode(model_out.detach().cpu().numpy())

            # Print the source, target and model output
            print_msg('-'*55)
            # print_msg(f"{f'SOURCE: ':>12}{source_text}")
            # print_msg(f"{f'TARGET: ':>12}{target_text}")
            # print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")
            print_msg(f'Source Text: {source_text}')
            print_msg(f'Target Text: {target_text}')
            print_msg(f'Predicted by TamilGPT: {model_out_text}')

            if count == 2:
                break

def train_model(preload_epoch=None):
    # The entire training, validation cycle will run for 20 cycles or epochs.
    EPOCHS = 10
    initial_epoch = 0
    global_step = 0

    # Adam is one of the most commonly used optimization algorithms that hold the current state and will update the parameters based on the computed gradients.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, eps=1e-9)

    # If the preload_epoch is not none, that means the training will start with the weights, optimizer that has been last saved and start with preload epoch + 1
    if preload_epoch is not None:
      model_filename = f"./tamilgpt/model_{preload_epoch}.pt"
      state = torch.load(model_filename)
      model.load_state_dict(state['model_state_dict'])
      initial_epoch = state['epoch'] + 1
      optimizer.load_state_dict(state['optimizer_state_dict'])
      global_step = state['global_step']

    # The CrossEntropyLoss loss function computes the difference between the projection output and target label.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_en.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    for epoch in range(initial_epoch, EPOCHS):
        # torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:
            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)
            target_label = batch['target_label'].to(device) # (B, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            projection_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(projection_output.view(-1, tokenizer_ta.get_vocab_size()), target_label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # VALIDATION BLOCK STARTS HERE [Runs every epoch after the training block is complete]
        run_validation(model, val_dataloader, tokenizer_en, tokenizer_my, max_seq_len, device, lambda msg: batch_iterator.write(msg), global_step)

        # Save the model at the end of every epoch
        model_filename = f"./tamilgpt/model_{epoch}.pt"
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)


     

# Train our model
train_model(preload_epoch=None)
     


Processing Epoch 00:   0%|                                                                                                                                                                                    | 0/22702 [00:00<?, ?it/s]


RuntimeError: Expected a 'cpu' device type for generator but found 'cuda'