In [8]:
# Import the important libraries
import os

import torch
import torch.nn as nn
import torch.optim as optim
from transformer_model import build_transformer
# from datasets import load_dataset #hugging face datasets
from torch.utils.data import DataLoader, Dataset
import tqdm

In [9]:
# Huggingface datasets and tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

ModuleNotFoundError: No module named 'datasets'

## Defining a Custom Dataset
    - The purpose is to wrap the raw translation pairs in a map-style PyTorch dataset.
    - Tokenize the sentences (source language and target language).
    - Pad every sentence to a fixed length (`max_seq_len`).

In [None]:
def cml(size):
    """Build a causal (look-ahead) mask for a sequence of ``size`` tokens.

    The masks in this notebook use 1 for "may attend" and 0 for "blocked"
    (the padding masks are built with ``!= pad`` and combined with this mask
    through ``&``), so the causal mask must keep the lower triangle: position
    ``i`` may look at positions ``j <= i`` only.

    Args:
        size: sequence length.

    Returns:
        Int tensor of shape ``(1, size, size)`` whose entry ``[0, i, j]`` is 1
        when ``j <= i`` and 0 otherwise.
    """
    # torch.tril keeps the diagonal and everything below it, i.e. the current
    # token and its past; the strictly-upper (future) part stays 0.
    mask = torch.tril(torch.ones((1, size, size), dtype=torch.int))
    return mask

In [9]:
# Scratch check of torch.triu: start from an all-ones (1, 5, 5) tensor and
# keep only the entries strictly above the main diagonal.
a = torch.ones(1, 5, 5)
print(a)
a.triu(diagonal=1)

tensor([[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]])


tensor([[[0., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0.]]])

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that turns translation pairs into fixed-length tensors.

    Each record of ``data`` is expected to look like
    ``{'translation': {<src_len>: str, <tgt_len>: str}}``.

    Args:
        data: indexable collection of translation records.
        src_len: key of the source language inside ``'translation'`` (e.g. ``'en'``).
        tgt_len: key of the target language inside ``'translation'`` (e.g. ``'hi'``).
        src_tokenizer: tokenizer for source sentences (needs ``encode(...).ids``).
        tgt_tokenizer: tokenizer for target sentences (needs ``encode(...).ids``
            and ``token_to_id``).
        max_seq_len: length every returned sequence is padded to.
    """

    def __init__(self,data, src_len, tgt_len, src_tokenizer, tgt_tokenizer, max_seq_len):
        self.data = data
        self.src_len = src_len
        self.tgt_len = tgt_len
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_seq_len = max_seq_len

        # Special-token ids as 1-element int64 tensors so they can be concatenated directly.
        # NOTE(review): the ids are looked up in the target tokenizer and reused for the
        # source side; this assumes both vocabularies give the special tokens the same ids.
        self.sos_token_id = torch.tensor([tgt_tokenizer.token_to_id('<sos>')], dtype=torch.int64)
        self.eos_token_id = torch.tensor([tgt_tokenizer.token_to_id('<eos>')], dtype=torch.int64)
        self.pad_token_id = torch.tensor([tgt_tokenizer.token_to_id('<pad>')], dtype=torch.int64)

    def __len__(self):
        """Number of translation pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tensors for pair ``idx``.

        Returns:
            dict with
            - ``'encoder_input'``: ``<sos> + source ids + <eos> + padding``, shape ``(max_seq_len,)``
            - ``'decoder_input'``: ``<sos> + target ids + padding``, shape ``(max_seq_len,)``
            - ``'label'``: ``target ids + <eos> + padding``, shape ``(max_seq_len,)``
            - ``'encoder_mask'``: int mask, 1 on non-padding positions, shape ``(1, 1, max_seq_len)``
            - ``'decoder_mask'``: non-padding mask AND ``cml(max_seq_len)``,
              shape ``(1, max_seq_len, max_seq_len)``
            - ``'src_sent'`` / ``'tgt_sent'``: the raw sentences.

        Raises:
            ValueError: if either sentence does not fit into ``max_seq_len``
                together with its special tokens.
        """
        # Get the source and target sentence
        pair = self.data[idx]['translation']
        src_sent = pair[self.src_len]
        tgt_sent = pair[self.tgt_len]

        # Tokenize the source and target sentence
        encoder_sen = self.src_tokenizer.encode(src_sent).ids
        decoder_sen = self.tgt_tokenizer.encode(tgt_sent).ids

        # The encoder input carries two special tokens (<sos> and <eos>); the decoder
        # input (<sos> only) and the label (<eos> only) carry one each.
        src_pad_len = self.max_seq_len - len(encoder_sen) - 2
        tgt_pad_len = self.max_seq_len - len(decoder_sen) - 1

        # constraint check
        if src_pad_len < 0 or tgt_pad_len < 0:
            raise ValueError("The sentence is too long")

        # Explicit int64 keeps empty sentences from turning into float tensors.
        src_ids = torch.tensor(encoder_sen, dtype=torch.int64)
        tgt_ids = torch.tensor(decoder_sen, dtype=torch.int64)

        encoder_input = torch.cat([
            self.sos_token_id,
            src_ids,
            self.eos_token_id,
            self.pad_token_id.repeat(src_pad_len),
        ], dim=0)
        decoder_input = torch.cat([
            self.sos_token_id,
            tgt_ids,
            self.pad_token_id.repeat(tgt_pad_len),
        ], dim=0)

        # label for the decoder: same tokens shifted by one, so only <eos> is added
        label = torch.cat([
            tgt_ids,
            self.eos_token_id,
            self.pad_token_id.repeat(tgt_pad_len),
        ], dim=0)

        # check the max_seq_len equal
        assert encoder_input.size(0) == self.max_seq_len
        assert decoder_input.size(0) == self.max_seq_len
        assert label.size(0) == self.max_seq_len

        # return all the outputs
        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'label': label,
            # (1, 1, max_seq_len)
            'encoder_mask': (encoder_input != self.pad_token_id).unsqueeze(0).unsqueeze(0).int(),
            # (1, 1, max_seq_len) & (1, max_seq_len, max_seq_len) -> (1, max_seq_len, max_seq_len)
            'decoder_mask': (decoder_input != self.pad_token_id).unsqueeze(0).unsqueeze(0).int() & cml(decoder_input.size(0)),
            'src_sent': src_sent,
            'tgt_sent': tgt_sent

        }

In [None]:
def get_sentences(data, leng):
    """Lazily yield the ``leng``-language sentence of every record in ``data``.

    Each record is read as ``record['translation'][leng]``.
    """
    yield from (record['translation'][leng] for record in data)

In [None]:
def tokenizer_train(data, leng):
    """Return a word-level tokenizer for language ``leng``, training it if needed.

    The tokenizer is cached in ``tokenizer_<leng>.json`` in the working
    directory: when that file exists it is loaded as-is, otherwise a new
    tokenizer is trained on the ``leng`` sentences of ``data`` and saved there.

    Args:
        data: iterable of records shaped like ``{'translation': {leng: str}}``.
        leng: language key, e.g. ``"en"``.

    Returns:
        The loaded or freshly trained ``Tokenizer``.
    """
    tokenizer_path = f"tokenizer_{leng}.json"

    # Reuse a previously trained tokenizer so re-running the notebook is cheap.
    if os.path.exists(tokenizer_path):
        return Tokenizer.from_file(tokenizer_path)

    tokenizer1 = Tokenizer(WordLevel(unk_token="<unk>"))
    tokenizer1.pre_tokenizer = Whitespace()
    # "<unk>" must be part of the vocabulary for the model's unk_token to be usable;
    # it is appended last so "<pad>", "<sos>" and "<eos>" keep their positions.
    trainer1 = WordLevelTrainer(special_tokens=["<pad>", "<sos>", "<eos>", "<unk>"])
    # Training is driven by the tokenizer; the trainer is only its configuration.
    tokenizer1.train_from_iterator(get_sentences(data, leng), trainer=trainer1)
    tokenizer1.save(tokenizer_path)

    return tokenizer1




In [None]:
# Load the IITB English-Hindi parallel corpus (train split) from the Hugging Face Hub.
# The Hub repository id is a single "<owner>/<name>" string.
eng_hin_dataset = load_dataset('cfilt/iitb-english-hindi', split='train')
# One word-level tokenizer per language (loaded from disk when already trained).
src_tokenizer = tokenizer_train(eng_hin_dataset, "en")
tgt_tokenizer = tokenizer_train(eng_hin_dataset, "hi")

In [None]:
# Split the dataset 80:20 with sklearn's train_test_split and a fixed seed.
# The split is done on row indices and the rows are then picked with
# Dataset.select, so train_data / val_data stay row-indexable datasets
# (CustomDataset reads data[idx]['translation']). Passing the dataset object
# itself to sklearn is expected to hand back column batches instead.

from sklearn.model_selection import train_test_split
train_idx, val_idx = train_test_split(
    list(range(len(eng_hin_dataset))), test_size=0.2, random_state=42
)
train_data = eng_hin_dataset.select(train_idx)
val_data = eng_hin_dataset.select(val_idx)

In [None]:
# Wrap both splits; every sequence is padded to 350 tokens (the max_seq_len).
train_dataset = CustomDataset(
    data=train_data,
    src_len='en',
    tgt_len='hi',
    src_tokenizer=src_tokenizer,
    tgt_tokenizer=tgt_tokenizer,
    max_seq_len=350,
)
val_dataset = CustomDataset(
    data=val_data,
    src_len='en',
    tgt_len='hi',
    src_tokenizer=src_tokenizer,
    tgt_tokenizer=tgt_tokenizer,
    max_seq_len=350,
)

In [None]:
batch_size = 32
# Training batches are shuffled; validation is served one sentence at a time, in order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1, shuffle=False)

NameError: name 'DataLoader' is not defined

In [None]:
# Model / training hyper-parameters, defined as plain variables because the
# cells below refer to them by these names.
num_heads = 8        # attention heads
num_epochs = 20      # training epochs
max_seq_len = 350    # padded sequence length
d_model = 512        # embedding / model width
d_ff = 2048          # feed-forward width
dropout = 0.1        # dropout probability

In [None]:
# Build the transformer; vocabulary sizes come from the trained tokenizers and
# both sequence lengths share the same padded length.
model = build_transformer(
    src_vocab_size=len(src_tokenizer.get_vocab()),
    tgt_vocab_size=len(tgt_tokenizer.get_vocab()),
    src_seq_len=max_seq_len,
    tgt_seq_len=max_seq_len,
    d_model=d_model,
    num_heads=num_heads,
    d_ff=d_ff,
    dropout=dropout,
)

In [None]:
# Use CUDA when it is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Wrap the model for data-parallel execution, then move it to the chosen device.
model = nn.DataParallel(model).to(device)

In [None]:
# Cross-entropy over the target vocabulary: padding positions are ignored and
# the targets are label-smoothed.
criterion = nn.CrossEntropyLoss(
    ignore_index=tgt_tokenizer.token_to_id('<pad>'),
    label_smoothing=0.1,
)
optimizer = optim.Adam(params=model.parameters(), lr=10**-4, eps=1e-9)
num_epochs = 20

In [None]:
# Token ids -> text
def decode_tokens(tokens, tokenizer):
    """Turn a sequence of token ids into text with ``tokenizer``, dropping special tokens."""
    text = tokenizer.decode(tokens, skip_special_tokens=True)
    return text

In [None]:
# def training(model, criterion, optimizer, train_loader, tokenizer, epoch, total_epoch):
#     model.train()
#     train_loss = 0
#     # use the tqdm for the progress bar
#     for idx, data in enumerate(tqdm(train_loader)):
#         encoder_input = data['encoder_input'].to(device)
#         decoder_input = data['decoder_input'].to(device)
#         target = data['label'].to(device)
#         encoder_mask = data['encoder_mask'].to(device)
#         decoder_mask = data['decoder_mask'].to(device)

#         optimizer.zero_grad()
#         encoder_output = model.encode(encoder_input, encoder_mask)
#         decoder_output = model.decode(encoder_output, decoder_input,encoder_mask,  decoder_mask)
#         output = model.linear(decoder_output)
#         loss = criterion(output.view(-1, len(tokenizer.get_vocab())), target.view(-1))
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad(set_to_none = True)
#         train_loss += loss.item()


In [None]:
import torch
from tqdm import tqdm

def training(model, criterion, optimizer, train_loader, tokenizer, epoch, total_epoch, device):
    """Run one training epoch and return the mean loss per sample.

    Args:
        model: transformer exposing ``encode``, ``decode`` and ``linear``;
            it may be wrapped in ``torch.nn.DataParallel``.
        criterion: loss called with ``(N, vocab)`` logits and ``(N,)`` target ids.
        optimizer: optimizer over the model parameters.
        train_loader: iterable of batch dicts with the keys ``'encoder_input'``,
            ``'decoder_input'``, ``'label'``, ``'encoder_mask'`` and ``'decoder_mask'``.
        tokenizer: not used any more (the vocabulary size is read from the
            logits); kept so existing calls keep working.
        epoch: current epoch, only used in the progress-bar label.
        total_epoch: number of epochs, only used in the progress-bar label.
        device: device the batch tensors are moved to.

    Returns:
        float: sample-weighted mean of the batch losses, or ``0.0`` when
        ``train_loader`` yields no batches.
    """
    # DataParallel only forwards the call to forward(); the custom encode / decode /
    # linear methods live on the wrapped module, so reach through the wrapper.
    # (Calling them on the inner module runs the step without data-parallel splitting.)
    net = model.module if isinstance(model, torch.nn.DataParallel) else model

    model.train()
    train_loss = 0.0
    total_samples = 0

    # Use tqdm for progress tracking
    loop = tqdm(train_loader, desc=f"Training Epoch [{epoch}/{total_epoch}]", leave=True)

    for data in loop:
        encoder_input = data['encoder_input'].to(device)
        decoder_input = data['decoder_input'].to(device)
        target = data['label'].to(device)
        encoder_mask = data['encoder_mask'].to(device)
        decoder_mask = data['decoder_mask'].to(device)

        optimizer.zero_grad()  # Reset gradients before backpropagation

        encoder_output = net.encode(encoder_input, encoder_mask)
        decoder_output = net.decode(encoder_output, decoder_input, encoder_mask, decoder_mask)
        output = net.linear(decoder_output)

        # Flatten to (batch * seq, vocab). The vocabulary size is taken from the
        # logits themselves so it cannot disagree with the tokenizer that was passed in.
        loss = criterion(output.reshape(-1, output.size(-1)), target.reshape(-1))
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        batch_size = encoder_input.size(0)
        train_loss += loss.item() * batch_size
        total_samples += batch_size

        # Update tqdm progress bar with loss
        loop.set_postfix(train_loss=train_loss / total_samples)

    return train_loss / total_samples if total_samples else 0.0


In [None]:
# def validation(model, criterion, val_loader, tokenizer, max_seq_len epoch, total_epoch):
#     model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for idx, data in enumerate(val_loader):
#             encoder_input = data['encoder_input'].to(device)
#             encoder_mask = data['encoder_mask'].to(device)
#             target = data['label'].to(device)
            

#             # check the batch size of encoder_input
#             assert encoder_input.size(0) == 1, "at least one sentence should be there"

#             # get the encoder_output
#             encoder_output = model.encode(encoder_input, encoder_mask)
#             # decoder input is the sos token
#             decoder_input = torch.empty(1,1).fill_(tokenizer.token_to_id('<sos>').type_as(encoder_input).to(device))
            
#             # create a while loop to generate the output
#             while True:
#                 if decoder_input.size(1) == max_seq_len:
#                     break
#                 decoder_mask = cml(decoder_input.size(1)).type_as(encoder_mask).to(device)
#                 decoder_output = model.decode(encoder_output, decoder_input, encoder_mask, decoder_mask)
#                 output = model.linear(decoder_output[:,-1])
#                 # get the last token
#                 _, next_word = torch.max(output, dim = 1)
#                 predicted_out = torch.cat([decoder_input, torch.empty(1,1).fill_(tokenizer.token_to_id(nex_word.item())).to(device)], dim = 1)

#                 if next_word.item() == tokenizer.token_to_id('<eos>'):
#                     break
            
#             # get the target sentence
#             predicted_sen = predicted_out.squeeze(0)








In [None]:
def validation(model, criterion, val_loader, tokenizer, max_seq_len, epoch, total_epoch, device):
    """Evaluate the model on ``val_loader`` and print greedy translations.

    For every sample two things are computed under ``torch.no_grad()``:

    * the loss, teacher-forced exactly like in training (the batch's own
      ``'decoder_input'`` / ``'decoder_mask'`` against ``'label'``);
    * a greedy auto-regressive translation, which is printed next to the
      source and reference sentences.

    Args:
        model: transformer exposing ``encode``, ``decode`` and ``linear``;
            it may be wrapped in ``torch.nn.DataParallel``.
        criterion: loss called with ``(N, vocab)`` logits and ``(N,)`` target ids.
        val_loader: iterable of batch dicts of batch size 1 with the keys
            ``'encoder_input'``, ``'encoder_mask'``, ``'decoder_input'``,
            ``'decoder_mask'`` and ``'label'``; ``'src_sent'`` / ``'tgt_sent'``
            are used for printing when present.
        tokenizer: target-language tokenizer (``token_to_id`` and ``decode``).
        max_seq_len: maximum length of a generated sequence, ``<sos>`` included.
        epoch: current epoch, only used in the progress-bar label.
        total_epoch: number of epochs, only used in the progress-bar label.
        device: device the batch tensors are moved to.

    Returns:
        float: mean loss per sample, or ``0.0`` when ``val_loader`` is empty.

    Raises:
        AssertionError: if a batch holds more than one sample.
    """
    # DataParallel does not expose encode / decode / linear; use the wrapped module.
    net = model.module if isinstance(model, torch.nn.DataParallel) else model

    model.eval()
    val_loss = 0.0
    total_samples = 0

    sos_id = tokenizer.token_to_id('<sos>')
    eos_id = tokenizer.token_to_id('<eos>')

    with torch.no_grad():
        loop = tqdm(val_loader, desc=f"Validation Epoch [{epoch}/{total_epoch}]", leave=True)

        for data in loop:
            encoder_input = data['encoder_input'].to(device)
            encoder_mask = data['encoder_mask'].to(device)
            target = data['label'].to(device)

            batch_size = encoder_input.size(0)
            assert batch_size == 1, "Batch size must be 1 for auto-regressive decoding"

            encoder_output = net.encode(encoder_input, encoder_mask)

            # Loss: teacher-forced pass so the logits line up position by position
            # with the full-length label (a single decoding step cannot be compared to it).
            forced_input = data['decoder_input'].to(device)
            forced_mask = data['decoder_mask'].to(device)
            forced_logits = net.linear(net.decode(encoder_output, forced_input, encoder_mask, forced_mask))
            loss = criterion(forced_logits.reshape(-1, forced_logits.size(-1)), target.reshape(-1))
            val_loss += loss.item() * batch_size
            total_samples += batch_size

            # Greedy decoding: start from <sos> and feed every prediction back in.
            decoder_input = torch.full((batch_size, 1), sos_id, dtype=torch.long, device=device)
            while decoder_input.size(1) < max_seq_len:
                steps = decoder_input.size(1)
                # Lower-triangular causal mask in the same dtype as the other masks.
                decoder_mask = torch.tril(torch.ones((steps, steps), device=device)).unsqueeze(0).type_as(encoder_mask)
                decoder_output = net.decode(encoder_output, decoder_input, encoder_mask, decoder_mask)
                output = net.linear(decoder_output[:, -1])  # Get last token predictions

                next_word = torch.argmax(output, dim=1)
                # Growing decoder_input is what lets the next step see this token
                # and what eventually trips the max_seq_len stop condition.
                decoder_input = torch.cat([decoder_input, next_word.unsqueeze(1)], dim=1)

                if next_word.item() == eos_id:
                    break
            predicted_out = decoder_input

            # Prefer the raw sentences carried by the batch; `tokenizer` is the
            # target-side one and is only a fallback for the source ids.
            if 'src_sent' in data:
                source_sentence = data['src_sent'][0]
            else:
                source_sentence = tokenizer.decode(encoder_input.squeeze(0).tolist())

            # Convert predicted token IDs to sentence
            predicted_sentence = tokenizer.decode(predicted_out.squeeze(0).tolist())

            if 'tgt_sent' in data:
                target_sentence = data['tgt_sent'][0]
            else:
                target_sentence = tokenizer.decode(target.squeeze(0).tolist())

            # Print source, predicted, and target sentences
            print("Source Sentence:  ", source_sentence)
            print("Predicted Sentence:", predicted_sentence)
            print("Target Sentence:   ", target_sentence)

            # Update tqdm progress bar
            loop.set_postfix(val_loss=val_loss / total_samples)

    return val_loss / total_samples if total_samples else 0.0


In [None]:
# training and validation loop
def training_model(model, criterion, optimizer, train_loader, val_loader, num_epochs, device=None, max_seq_len=None):
    """Train for ``num_epochs`` epochs, validating and reporting after each one.

    Args:
        model: the transformer passed on to ``training`` / ``validation``.
        criterion: loss function.
        optimizer: optimizer over the model parameters.
        train_loader: loader of training batches.
        val_loader: loader of validation batches.
        num_epochs: number of epochs to run.
        device: device for the batches; defaults to the device of the model's
            first parameter.
        max_seq_len: maximum generated length during validation; defaults to
            ``val_loader.dataset.max_seq_len``.
    """
    if device is None:
        device = next(model.parameters()).device
    if max_seq_len is None:
        max_seq_len = val_loader.dataset.max_seq_len

    for epoch in range(1, num_epochs + 1):
        # The loss is computed over the target vocabulary, so both steps get the
        # target tokenizer; both also need the device as their last argument.
        train_loss = training(model, criterion, optimizer, train_loader, tgt_tokenizer, epoch, num_epochs, device)
        val_loss = validation(model, criterion, val_loader, tgt_tokenizer, max_seq_len, epoch, num_epochs, device)

        # Both values are already per-sample means; no further normalisation.
        print(f"Epoch: {epoch}, Train Loss: {train_loss}, Val Loss: {val_loss}")