* In this assignment you will be using the entire transformer architecture for a translation task.
* We will use just one encoder layer and one decoder layer.
* You can copy the encoder and decoder modules from the previous assignments. You are going to translate a few sentences from **English to Tamil**
  * Source language: English
  * Target language: Tamil

* You may experiment with a target language of your choice to check the implementation. (You may use Google Translate for that.)

* We need to install torchdata and torchtext (which take about 3 minutes to finish installing) for tokenizing the text.
* We already defined useful functions for the tokenization of texts




In [2]:
!pip install torchdata==0.6.0 # to be compatible with torch 2.0
!pip install portalocker==2.0.0
!pip install -U torchtext==0.15.1

* Let's import all required libraries

In [16]:
import torch
from torch import Tensor

import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.nn.functional import one_hot

import torch.optim as optim

#text lib
import torchtext

# tokenizer
from torchtext.data.utils import get_tokenizer

#build vocabulary
from torchtext.vocab import vocab
from torchtext.vocab import build_vocab_from_iterator

# get input_ids (numericalization)
from torchtext.transforms import VocabTransform, LabelToIndex

# get embeddings
from torch.nn import Embedding

from  pprint import pprint
from yaml import safe_load
import copy
import numpy as np
import requests
import math

# Preparing Data

* Source and target text

In [3]:
src_text = """The most famous ruler of ancient India was Emperor Ashoka.
It was during his period that Buddhism spread to different parts of Asia.
Ashoka gave up war after seeing many people grieving death after the Kalinga war.
He embraced Buddhism and then devoted his life to spread the message of peace and dharma.
His service for the cause of public good was exemplary.
He was the first ruler to give up war after victory.
He was the first to build hospitals for animals.
He was the first to lay roads."""

In [4]:
tar_text = """பண்டைய இந்திய அரசர்களில் பேரும் புகழும் பெற்ற அரசர் அசோகர் ஆவார்.
இவரது ஆட்சியில் தான் புத்த மதம் ஆசியாவின் பல்வேறு பகுதிகளுக்குப் பரவியது.
கலிங்கப் போருக்குப் பின் பல உயிர்கள் மடிவதைக் கண்டு வருந்தி, போர் தொடுப்பதைக் கைவிட்டார்.
அதற்குப் பிறகு புத்த சமயத்தைத் தழுவி, அமைதியையும் அறத்தையும் பரப்புவதற்காகத் தன் வாழ்வையே அர்ப்பணித்தார்.
பொதுமக்களுக்கு அவர் ஆற்றிய சேவை முன் மாதிரியாக விளங்கியது.
வெற்றிக்குப் பின் போரைத் துறந்த முதல் அரசர் அசோகர்தான்.
உலகிலேயே முதன்முதலாக விலங்குகளுக்கும் தனியே மருத்துவமனை அமைத்துத் தந்தவரும் அசோகரே ஆவார்.
 இன்றும் அவர் உருவாக்கிய சாலைகளை நாம் பயன்படுத்திக்கொண்டு இருக்கிறோம்."""

* Tokenize and build vocabulary using a simple tokenization algorithm

In [2]:
# do not edit this cell
def seq_len(seq):
  """Return the number of space-separated tokens in ``seq``.

  Only the single-space character separates tokens, so a leading, trailing or
  doubled space adds an empty token to the count, and an empty string counts
  as one token.
  """
  # There is always one more token than there are separators; this equals
  # len(seq.split(' ')). The former ``strip('')`` call stripped nothing.
  return seq.count(' ') + 1

# Find the longest source and target sentence (in space-separated tokens) to
# decide the context length of the encoder and decoder.
# The text is split as-is: the former ``strip('')`` call removed nothing.
src_raw_seq = src_text.split('\n')
src_max_seq_len = max(seq_len(sentence) for sentence in src_raw_seq)
print('Source max_seq_length:  ',src_max_seq_len)

tar_raw_seq = tar_text.split('\n')
tar_max_seq_len = max(seq_len(sentence) for sentence in tar_raw_seq)
print('Target max_seq_length: ',tar_max_seq_len)

* We encourage you to go through the code given below to understand the typical functionalities of Tokenizer object (If you want, you can skip)

In [9]:
# do not edit this cell
class Tokenizer(object):
  """Space-splitting tokenizer backed by a torchtext vocabulary built from ``text``.

  ``text`` is a newline-separated corpus: every line is one sentence and every
  sentence is split on single spaces. The vocabulary reserves the specials
  ``<pad>``, ``<start>``, ``<end>`` and ``<unk>`` (in that order) and maps
  out-of-vocabulary words to ``<unk>``.

  The vocabulary is built lazily on first use and then cached; call
  ``build_vocab()`` again after changing ``self.text`` to refresh it.
  """

  def __init__(self,text):
    self.text = text
    self.vocab_size = None  # set by build_vocab()
    self.vocab = None       # set by build_vocab()

  @staticmethod
  def word_tokenizer(seq):
    """Split ``seq`` on single spaces and return the list of tokens.

    Splitting is on ' ' only (not on arbitrary whitespace), so a leading or
    doubled space yields an empty-string token.
    """
    return seq.split(' ')

  def _sentences(self):
    """Return the corpus as a list of lines, ignoring surrounding whitespace."""
    # Shared by get_tokens() and encode_batch() so that both see the same
    # sentences (a trailing newline must not create an extra empty sentence).
    return self.text.strip().split('\n')

  def get_tokens(self):
    """Yield the token list of every sentence in the corpus."""
    for sentence in self._sentences():
      yield self.word_tokenizer(sentence)

  def build_vocab(self):
    """(Re)build the vocabulary from the corpus, store it on the instance and return it."""
    self.vocab = build_vocab_from_iterator(self.get_tokens(),
                                  min_freq=1,specials=['<pad>','<start>','<end>','<unk>'])
    self.vocab.set_default_index(self.vocab['<unk>']) # index of OOV
    self.vocab_size = len(self.vocab)
    return self.vocab

  def _get_vocab(self):
    """Return the cached vocabulary, building it on first use."""
    # Rebuilding the vocabulary for every encode()/decode() call rescans the
    # whole corpus each time; build once and reuse.
    if self.vocab is None:
      self.build_vocab()
    return self.vocab

  def encode(self,sentence):
    """Return the int64 id tensor of ``sentence`` wrapped in ``<start>`` ... ``<end>``."""
    v = self._get_vocab()
    token_ids = [v['<start>'],
                 *v.lookup_indices(self.word_tokenizer(sentence)),
                 v['<end>']]
    return torch.tensor(token_ids,dtype=torch.int64)

  def decode(self,ids):
    """Map a 1-D tensor of token ids back to a space-joined string of tokens."""
    # Fetch the index-to-token table once instead of once per id.
    itos = self._get_vocab().get_itos()
    return ' '.join(itos[token_id] for token_id in ids.tolist())

  def encode_batch(self,batch_size,max_seq_len):
    """Encode every sentence of the corpus into one zero-padded int64 matrix.

    Returns a tensor of shape ``(batch_size, max_seq_len + 2)``; the two extra
    columns hold the ``<start>`` and ``<end>`` tokens. Row ``i`` holds the ids
    of sentence ``i`` followed by padding.
    """
    # 0 is the id of '<pad>' because the specials are inserted first.
    batch_data = torch.zeros(size=(batch_size,max_seq_len+2),dtype=torch.int64) # +2 for special tokens
    for i,sentence in enumerate(self._sentences()):
      token_ids = self.encode(sentence)
      batch_data[i,0:len(token_ids)] = token_ids
    return batch_data



* It is always good to check the implementation

In [None]:
# Number of rows in a batch; the source and target corpora above each contain eight lines.
batch_size = 8

In [10]:
# you can play with this
# Build a tokenizer over the English corpus, then print the ids of one sentence
# (wrapped in <start>/<end>) and of the whole zero-padded batch.
src_tokenizer = Tokenizer(src_text)
print(src_tokenizer.encode('The most famous ruler of ancient India was Emperor Ashoka.'))
print(src_tokenizer.encode_batch(batch_size,src_max_seq_len))

tensor([ 1, 27, 49, 39, 15,  8, 28, 24,  5, 22, 20,  2])
tensor([[ 1, 27, 49, 39, 15,  8, 28, 24,  5, 22, 20,  2,  0,  0,  0,  0,  0,  0],
        [ 1, 25,  5, 36, 14, 53, 58, 11, 16,  6, 35, 50,  8, 21,  2,  0,  0,  0],
        [ 1, 19, 40, 17, 18,  9, 56, 47, 52, 43, 32,  9,  4, 26, 61,  2,  0,  0],
        [ 1,  7, 37, 11, 12, 59, 33, 14, 46,  6, 16,  4, 48,  8, 51, 12, 34,  2],
        [ 1, 23, 57, 13,  4, 31,  8, 54, 42,  5, 38,  2,  0,  0,  0,  0,  0,  0],
        [ 1,  7,  5,  4, 10, 15,  6, 41, 17, 18,  9, 60,  2,  0,  0,  0,  0,  0],
        [ 1,  7,  5,  4, 10,  6, 30, 44, 13, 29,  2,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  7,  5,  4, 10,  6, 45, 55,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0]])


In [11]:
# you can play with this
# Build a tokenizer over the Tamil corpus, then print the ids of one sentence
# (wrapped in <start>/<end>) and of the whole zero-padded batch.
tar_tokenizer = Tokenizer(tar_text)
print(tar_tokenizer.encode('பண்டைய இந்திய அரசர்களில் பேரும் புகழும் பெற்ற அரசர் அசோகர் ஆவார்.'))
print(tar_tokenizer.encode_batch(batch_size,tar_max_seq_len))

tensor([ 1, 44, 22, 16, 53, 51, 52,  4, 11,  6,  2])
tensor([[ 1, 44, 22, 16, 53, 51, 52,  4, 11,  6,  2,  0,  0],
        [ 1, 25, 20, 39,  8, 59, 19, 49, 43, 47,  2,  0,  0],
        [ 1, 30, 55,  7, 48, 26, 58, 29, 65, 57, 41, 31,  2],
        [ 1, 13, 50,  8, 32, 38, 14, 18, 46, 37, 66, 17,  2],
        [ 1, 54,  5, 21, 34, 64, 61, 68,  2,  0,  0,  0,  0],
        [ 1, 69,  7, 56, 40, 63,  4, 12,  2,  0,  0,  0,  0],
        [ 1, 28, 62, 67, 36, 60, 15, 35, 10,  6,  2,  0,  0],
        [ 1,  9, 23,  5, 27, 33, 42, 45, 24,  2,  0,  0,  0]])


* Let's load the token ids of the words in the sentences of source and target languages

In [13]:
# do not edit this cell
# x: source (English) token ids, y: target (Tamil) token ids. Each is an int64
# tensor with one zero-padded row per sentence and max_seq_len+2 columns.
x = src_tokenizer.encode_batch(batch_size,src_max_seq_len)
y = tar_tokenizer.encode_batch(batch_size,tar_max_seq_len)

* we have appended zeros to sentences that are shorter than max-seq-len
* We have to ignore computing loss over those padded tokens
* You have to take care of that in the cell below

In [14]:
# your code goes here
# Teacher forcing: the decoder is fed y[:, t] and trained to predict y[:, t+1],
# so the labels are the target ids shifted left by one position.
# NOTE(review): assumes the model emits one prediction per decoder position —
# confirm against the Decoder implementation.
# Padded positions (id 0, i.e. <pad>) and the final column are set to -100, the
# default ignore_index of nn.CrossEntropyLoss, so they add nothing to the loss.
label = torch.full_like(y, -100)
label[:, :-1] = y[:, 1:]
label[label == 0] = -100

* Define the context lengths for encoder and decoder

In [None]:
# do not edit this cell
# Context length = longest sentence plus the <start> and <end> tokens that
# Tokenizer.encode adds around every sentence.
enc_ctxt_len = src_max_seq_len+2
dec_ctxt_len = tar_max_seq_len+2

# Load configuration file

In [21]:
# do not edit this cell
# Download and parse the encoder configuration (YAML).
config_url = "https://raw.githubusercontent.com/Arunprakash-A/LLM-from-scratch-PyTorch/main/config_files/enc_config.yml"
# Fail fast on network problems: without a timeout the call can hang forever,
# and without the status check an HTTP error page would be parsed as YAML.
response = requests.get(config_url, timeout=30)
response.raise_for_status()
config = safe_load(response.content.decode("utf-8"))
pprint(config)

{'input': {'batch_size': 10, 'embed_dim': 32, 'seq_len': 8, 'vocab_size': 10},
 'model': {'d_ff': 128,
           'd_model': 32,
           'dk': 4,
           'dq': 4,
           'dv': 4,
           'n_heads': 8,
           'n_layers': 6}}


In [19]:
# do not edit this cell
# vocab_size is None until the tokenizer has built its vocabulary (encode/encode_batch do that).
src_vocab_size =src_tokenizer.vocab_size
# Replaces the earlier constant with the actual number of rows in x.
batch_size = x.shape[0]
# Embedding width, read from the loaded configuration.
embed_dim = config['input']['embed_dim']

In [20]:
# do not edit this cell
# Attention hyper-parameters read from the loaded configuration. dq, dk, dv and
# heads are wrapped in tensors, while dmodel and d_ff stay plain Python values.
dq = torch.tensor(config['model']['dq'])
dk = torch.tensor(config['model']['dk'])
dv = torch.tensor(config['model']['dv'])
dmodel = embed_dim  # model width equals the embedding width
heads = torch.tensor(config['model']['n_heads'])
d_ff = config['model']['d_ff']

In [None]:
# do not edit this cell
# Download and parse the decoder configuration (YAML); this rebinds `config`.
config_url = "https://raw.githubusercontent.com/Arunprakash-A/LLM-from-scratch-PyTorch/main/config_files/dec_config.yml"
# Fail fast on network problems: without a timeout the call can hang forever,
# and without the status check an HTTP error page would be parsed as YAML.
response = requests.get(config_url, timeout=30)
response.raise_for_status()
config = safe_load(response.content.decode("utf-8"))
pprint(config)

{'input': {'batch_size': 10, 'embed_dim': 32, 'seq_len': 8, 'vocab_size': 12},
 'model': {'d_ff': 128,
           'd_model': 32,
           'dk': 4,
           'dq': 4,
           'dv': 4,
           'n_heads': 8,
           'n_layers': 6}}


In [None]:
# do not edit this cell
# Size of the target vocabulary (special tokens included); vocab_size is None
# until the tokenizer has built its vocabulary.
tar_vocab_size = tar_tokenizer.vocab_size

# Encoder

 * You can copy paste the required code from the previous assignments

In [None]:
# Placeholders: replace each `pass` with the implementation from the previous
# assignments. The Transformer class in this notebook constructs
# Encoder(src_vocab_size,dmodel,dq,dk,dv,d_ff,heads,num_layers) and calls it
# with the position-encoded source embeddings.
class MHA(nn.Module):
  pass

class Prediction(nn.Module):
  pass

class EncoderLayer(nn.Module):
  pass

class Encoder(nn.Module):
  pass


# Decoder

In [None]:
# Placeholders: replace each `pass` with the implementation from the previous
# assignments. The Transformer class in this notebook constructs
# Decoder(tar_vocab_size,dmodel,dq,dk,dv,d_ff,heads,target_mask,num_layers) and
# calls it with (encoder output, target embeddings).
class MHCA(nn.Module):
  pass

class MHMA(nn.Module):
  pass

class DecoderLayer(nn.Module):
  pass

class Decoder(nn.Module):
  pass

# Positional Embedding

 * You may take the code directly from any source.

In [None]:
# Placeholder: replace `pass` with a positional-encoding module. The Transformer
# class in this notebook constructs it as PositionalEncoding(dmodel) and calls
# it on an embedding tensor.
class PositionalEncoding(nn.Module):
    pass

# Generate target mask

  * We will be passing the causal mask for the decoder layer as one of its arguments

In [None]:
# Causal mask for the decoder: 0 on and below the diagonal, -inf strictly above
# it, so row i only leaves columns 0..i unmasked.
mask = torch.triu(torch.full((dec_ctxt_len,dec_ctxt_len), float('-inf')), diagonal=1)
print(mask)

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0.,

# Transformer

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder model: embeds source/target ids and chains Encoder and Decoder.

  Args:
    src_vocab_size: size of the source vocabulary.
    tar_vocab_szie: size of the target vocabulary (the misspelled name is kept
      so existing callers keep working).
    src_seq_len, tar_seq_len: accepted for interface compatibility; not used here.
    dmodel: embedding width, also passed to the encoder and decoder.
    dq, dk, dv, d_ff, heads, num_layers: forwarded unchanged to Encoder and Decoder.
    target_mask: forwarded to Decoder.
  """

  def __init__(self,src_vocab_size,tar_vocab_szie,src_seq_len,tar_seq_len,dmodel,dq,dk,dv,d_ff,heads,target_mask,num_layers=1):
    super(Transformer,self).__init__()
    # Use the constructor arguments instead of the same-named notebook globals
    # (`tar_vocab_size`, `embed_dim`), so the model is sized by what the caller passes.
    tar_vocab_size = tar_vocab_szie
    self.src_embeddings = nn.Embedding(src_vocab_size,dmodel)
    self.tar_embeddings = nn.Embedding(tar_vocab_size,dmodel)
    self.pos_embeddings = PositionalEncoding(dmodel)
    self.encoder = Encoder(src_vocab_size,dmodel,dq,dk,dv,d_ff,heads,num_layers)
    self.decoder = Decoder(tar_vocab_size,dmodel,dq,dk,dv,d_ff,heads,target_mask,num_layers)

  def forward(self,src_token_ids,tar_token_ids):
    """Encode the source ids, then decode against the target ids; return the decoder output."""
    enc_out = self.encoder(self.pos_embeddings(self.src_embeddings(src_token_ids)))
    # Target embeddings get positional information too, mirroring the source side.
    # NOTE(review): assumes Decoder, like Encoder, expects position-encoded
    # embeddings and does not add positions itself — confirm.
    tar_embeds = self.pos_embeddings(self.tar_embeddings(tar_token_ids))
    out = self.decoder(enc_out,tar_embeds)
    return out

In [None]:
# Instantiate the model with the default num_layers=1; `mask` is the causal target mask.
model = Transformer(src_vocab_size,tar_vocab_size,enc_ctxt_len,dec_ctxt_len,dmodel,dq,dk,dv,d_ff,heads,mask)

In [None]:
# Cross-entropy with PyTorch defaults: targets equal to -100 (the default
# ignore_index) are skipped and the loss is averaged over the remaining ones.
criterion = nn.CrossEntropyLoss()
# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [None]:
def train(src_token_ids,tar_token_ids,labels,epochs=1000):
  """Train the module-level ``model`` on one fixed batch and return the loss history.

  Uses the module-level ``model``, ``criterion`` and ``optimizer``.

  Args:
    src_token_ids: source token ids passed to the model.
    tar_token_ids: target token ids passed to the model as decoder input.
    labels: integer class ids to predict; flattened before the loss is computed.
    epochs: number of optimisation steps on the batch.

  Returns:
    A list with one float per epoch: the loss value of that step.
  """
  loss_trace = []
  for epoch in range(epochs):
    optimizer.zero_grad()
    out = model(src_token_ids,tar_token_ids)
    # CrossEntropyLoss wants (N, classes) scores against (N,) class ids, so fold
    # every leading dimension of the output into one.
    # NOTE(review): assumes `out` holds unnormalised scores with the vocabulary
    # on the last axis and one entry per element of `labels` — confirm.
    loss = criterion(out.reshape(-1,out.size(-1)),labels.reshape(-1))
    loss.backward()
    optimizer.step()
    loss_trace.append(loss.item())
  return loss_trace

In [1]:
# Run 1000 training steps on the single batch (x, y) with `label` as the targets.
train(x,y,label,1000)

## Run the model AutoRegressively

In [None]:
@torch.inference_mode()
def inference(test_input):
  '''
  Run the model in autoregressive fashion and store the output at each time step in a list

  Placeholder: the decoding loop is not implemented yet, so this currently
  returns None. Runs under torch.inference_mode().

  NOTE(review): the display loop in this notebook passes the result to
  tar_tokenizer.decode, which calls .tolist() on it — so the finished version
  presumably returns a 1-D tensor of target token ids; confirm.
  '''

  return None

* Modify the code below to suit your implementation
* Display the original and translated sentence (with all the special tokens)
* Note that, the second half of the second sentence is poorly translated
*  Same goes for 3rd and 4th sentence
* All other sentences are properly translated

In [None]:
# For every source sentence (one row of x) print the decoded source text and
# then the decoded output of inference() for that row.
for token_ids in x:
  print(src_tokenizer.decode(token_ids))
  print(tar_tokenizer.decode(inference(token_ids)))

<start> The most famous ruler of ancient India was Emperor Ashoka. <end> <pad> <pad> <pad> <pad> <pad> <pad>
<start> பண்டைய இந்திய அரசர்களில் பேரும் புகழும் பெற்ற அரசர் அசோகர் ஆவார். <pad> <pad> <pad>
<start> It was during his period that Buddhism spread to different parts of Asia. <end> <pad> <pad> <pad>
<start> இவரது ஆட்சியில் தான் புத்த மதம் ஆசியாவின் புத்த ஆற்றிய அசோகரே பின் <unk> பேரும்
<start> Ashoka gave up war after seeing many people grieving death after the Kalinga war. <end> <pad> <pad>
<start> கலிங்கப் தனியே மடிவதைக் பின் வாழ்வையே தழுவி, பெற்ற அரசர்களில் அரசர்களில் அரசர்களில் அரசர்களில் அரசர்களில்
<start> He embraced Buddhism and then devoted his life to spread the message of peace and dharma. <end>
<start> அதற்குப் பிறகு புத்த சமயத்தைத் தழுவி, அமைதியையும் அறத்தையும் பரப்புவதற்காகத் தன் வாழ்வையே பெற்ற அசோகர்
<start> His service for the cause of public good was exemplary. <end> <pad> <pad> <pad> <pad> <pad> <pad>
<start> பொதுமக்களுக்கு அவர் ஆற்றிய சேவை முன் மாதிரியாக விளங்கி