In [1]:
# Sanity check: confirm PyTorch imports and report whether a CUDA device is usable.
import torch 
torch.cuda.is_available()

True

<img src="transformer_architecture.png" width=380 >

=> For sequence-to-sequence tasks we previously had the Extended Neural GPU, ByteNet and ConvS2S.
=> In these models, the number of operations required to relate distant data points grows with that distance.
=> The number of operations needed to relate signals from two arbitrary positions grows **linearly** for ConvS2S and **logarithmically** for ByteNet.
=> This limits how far apart two related words in a sentence can be, because it becomes difficult to learn dependencies between distant positions.

### Transformer fixes it 
=> In the Transformer architecture, the computation needed to relate distant data points is reduced: relating any two input/output positions takes a **constant number of operations**.
=> The cost is **reduced effective resolution**: attention produces a weighted average over positions, which blurs the individual contributions. The solution was to introduce Multi-Head Attention.

In [2]:
import torch 
import torch.nn as nn

class EncoderDecoder(nn.Module):
    """Generic encoder-decoder wrapper for sequence-to-sequence models.

    The class only wires its five components together; each component is
    supplied by the caller.

    Args:
        encoder: called as ``encoder(embedded_src, src_mask)``.
        decoder: called as ``decoder(embedded_tgt, memory, src_mask, tgt_mask)``.
        src_embed: called on the raw source batch before encoding.
        tgt_embed: called on the raw target batch before decoding.
        generator: stored on the module but not used by ``forward``;
            callers apply it to the decoder output themselves.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        # nn.Module.__init__ must run before any sub-module is assigned;
        # without it, the assignments below raise AttributeError.
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder; returns the encoder output."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and decode it against ``memory`` (the encoder output)."""
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Take in and process masked source and target sequences.

        Returns the decoder output; the generator is not applied here.
        """
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)


In [3]:
##########################################################
# TINKERING WITH ENCODER BLOCKS 
##########################################################
import torch 
import torch.nn as nn 

# Toy hyper-parameters for the experiments
VOCAB_SIZE: int = 5  # how many distinct token ids exist
D_MODEL: int = 2  # width of the vector each token id is turned into

# Reproducibility: fix the RNG seed and pin cuDNN to deterministic behaviour
torch.manual_seed(70404)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

class EncoderBlock(nn.Module):
    """Toy encoder: an embedding lookup followed by a single linear layer.

    Each token is processed independently of its neighbours; there is no
    attention and no positional information.

    Args:
        vocab_size: number of rows in the embedding table. Defaults to the
            notebook-level ``VOCAB_SIZE`` when omitted.
        d_model: size of every token vector (also the in/out width of the
            linear layer). Defaults to the notebook-level ``D_MODEL``.
    """

    def __init__(self, vocab_size=None, d_model=None):
        super(EncoderBlock, self).__init__()

        # Fall back to the notebook constants so `EncoderBlock()` keeps working,
        # while allowing a real vocabulary size to be passed in explicitly.
        if vocab_size is None:
            vocab_size = VOCAB_SIZE
        if d_model is None:
            d_model = D_MODEL

        # lookup table (converting integers ----> vectors)
        self.embedding = nn.Embedding(vocab_size, d_model)

        # the brain that simply learns patterns for now
        self.nnl1 = nn.Linear(d_model, d_model)

    def forward(self, x, debug:bool=True):
        """Embed the token ids in ``x`` and pass them through the linear layer.

        Args:
            x: integer tensor of token ids; the notebook calls this with both
               a (batch, seq) tensor and a plain (seq,) tensor.
            debug: when True, print the intermediate tensors and shapes.

        Returns:
            Tensor with the shape of ``x`` plus a trailing ``d_model`` axis.
        """
        # The flag is also stored on the instance, as the original code did.
        self.debug = debug
        if debug:
            print("\n--- [ENCODER START] ---")
            print(f"1. Raw Input (Token IDs):\n   {x}")
            print(f"   Shape: {x.shape}")

        # STEP 1: EMBEDDING
        # We look up the vector for each integer.
        x = self.embedding(x)
        if debug:
            print(f"\n2. After Embedding (Integers -> Vectors):")
            print(f"Vector after embedding: {x.tolist()}")
            print(f"Shape: {x.shape}")

        # STEP 2: PROCESSING (The Neural Network)
        # The model multiplies the embedding by weights to "understand" it.
        x = self.nnl1(x)  # passing through the linear layer 1
        if debug:
            print(f"\n3. After Linear Layer (The 'Thinking' Step):")
            # Flatten the leading axes so the first *token vector* is shown for
            # batched and unbatched input alike (x[0][0] is a scalar for 1-D input).
            first_token = x.reshape(-1, x.shape[-1])[0]
            print(f"First token transformed to: {first_token.tolist()}")
            print(f"Output of Linear layer: {x.tolist()}")
            print(f"   Shape: {x.shape}")

        return x  # returning the processed vectors
    

# Build one EncoderBlock with its default constructor arguments; later cells reuse this instance.
encoder : EncoderBlock = EncoderBlock()

In [4]:
# en_tokens
en_tokens = torch.tensor([[1,2,3]]) # batch of 1 sentences

# running the encoder
# we don't need gradients for this test, so disable autograd tracking for the call
with torch.no_grad():
    encoder_output = encoder(en_tokens)
encoder_output


--- [ENCODER START] ---
1. Raw Input (Token IDs):
   tensor([[1, 2, 3]])
   Shape: torch.Size([1, 3])

2. After Embedding (Integers -> Vectors):
Vector after embedding: [[[-0.05909673497080803, 0.353969544172287], [0.2251233160495758, -0.27265700697898865], [-1.3874330520629883, 1.1006890535354614]]]
Shape: torch.Size([1, 3, 2])

3. After Linear Layer (The 'Thinking' Step):
First token transformed to: [-0.31872737407684326, 0.33177289366722107]
Output of Linear layer: [[[-0.31872737407684326, 0.33177289366722107], [-0.6656067371368408, 0.36282414197921753], [0.03372323513031006, 0.7046524882316589]]]
   Shape: torch.Size([1, 3, 2])


tensor([[[-0.3187,  0.3318],
         [-0.6656,  0.3628],
         [ 0.0337,  0.7047]]], grad_fn=<ViewBackward0>)

In [5]:
# Inspect the embedding lookup table: one row per token id, one column per model dimension.
encoder.embedding.weight

Parameter containing:
tensor([[ 0.2030,  1.0051],
        [-0.0591,  0.3540],
        [ 0.2251, -0.2727],
        [-1.3874,  1.1007],
        [-1.9443, -0.2719]], requires_grad=True)

In [6]:
# Inspect the weight matrix of the linear layer (its bias is not displayed here).
encoder.nnl1.weight

Parameter containing:
tensor([[ 0.0615,  0.5815],
        [-0.4142, -0.2374]], requires_grad=True)

##### Problems with this approach
Let us say, [1,2,3] is ['I', 'am', 'dog'].<br>
Here, after embedding with d_model=2, we get [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]] (just an example).<br>
Now, the embedding vector for 'dog'(3) is [0.5, 0.6]<br>
1. The embedding vector for 'dog' = [0.5, 0.6] --> [0.9, 0.6] (after passing through linear layer) 

In [7]:
# Token ids held as float32 so the per-element byte size can be inspected.
dummy_input = torch.tensor([[1, 2, 3]], dtype=torch.float32)

# total bytes = number of elements * bytes per element
print(f"Size of dummy input in memory: {dummy_input.numel() * dummy_input.element_size()} bytes")

Size of dummy input in memory: 12 bytes


In [8]:
# tunnel problem
# let us say [1,2,3] = ['I', 'am', 'dog']
embed_output = encoder.embedding(dummy_input.long())
# label the output with the ids that were actually embedded,
# not with a tensor that happens to exist from another cell
print(f"Embedding Output for: {dummy_input.long().tolist()}")
for token_vector in embed_output[0]:
    print(token_vector.tolist())

print("\nPassing this embedded matrix into the linear layer...")
ll1_output = encoder.nnl1(embed_output)
for token_vector in ll1_output[0]:
    print(token_vector.tolist())

print("\nNeural net Weights:")
for weight_row in encoder.nnl1.weight:
    print(weight_row.tolist())

print("\nEmbedding Lookup Table:")
for table_row in encoder.embedding.weight:
    print(table_row.tolist())

Embedding Output for: [[1, 2, 3]]
[-0.05909673497080803, 0.353969544172287]
[0.2251233160495758, -0.27265700697898865]
[-1.3874330520629883, 1.1006890535354614]

Passing this embedded matrix into the linear layer...
[-0.31872737407684326, 0.33177289366722107]
[-0.6656067371368408, 0.36282414197921753]
[0.03372323513031006, 0.7046524882316589]

Neural net Weights:
[0.061545372009277344, 0.5814815163612366]
[-0.4141703248023987, -0.23740896582603455]

Embedding Lookup Table:
[0.20301564037799835, 1.005134105682373]
[-0.05909673497080803, 0.353969544172287]
[0.2251233160495758, -0.27265700697898865]
[-1.3874330520629883, 1.1006890535354614]
[-1.944254755973816, -0.27188217639923096]


In [9]:
# Same experiment with the token order reversed ([3,2,1]): each token's
# vectors come out unchanged, only their order follows the input.
dummy_input_reverse = torch.tensor([[3,2,1]], dtype=torch.float32)
embed_output = encoder.embedding(dummy_input_reverse.long())
print(f"Embedding Output for: {dummy_input_reverse.tolist()}")
print(*(vec.tolist() for vec in embed_output[0]), sep="\n")

print("\nPassing this embedded matrix into the linear layer...")
ll1_output = encoder.nnl1(embed_output)
print(*(vec.tolist() for vec in ll1_output[0]), sep="\n")

print("\nNeural net Weights:")
print(*(row.tolist() for row in encoder.nnl1.weight), sep="\n")

print("\nEmbedding Lookup Table:")
print(*(row.tolist() for row in encoder.embedding.weight), sep="\n")

Embedding Output for: [[3.0, 2.0, 1.0]]
[-1.3874330520629883, 1.1006890535354614]
[0.2251233160495758, -0.27265700697898865]
[-0.05909673497080803, 0.353969544172287]

Passing this embedded matrix into the linear layer...
[0.03372323513031006, 0.7046524882316589]
[-0.6656067371368408, 0.36282414197921753]
[-0.31872737407684326, 0.33177289366722107]

Neural net Weights:
[0.061545372009277344, 0.5814815163612366]
[-0.4141703248023987, -0.23740896582603455]

Embedding Lookup Table:
[0.20301564037799835, 1.005134105682373]
[-0.05909673497080803, 0.353969544172287]
[0.2251233160495758, -0.27265700697898865]
[-1.3874330520629883, 1.1006890535354614]
[-1.944254755973816, -0.27188217639923096]


In [10]:
# Two toy sentences that end with the same token id (2 = 'dog').
iamdog:torch.Tensor = torch.tensor([0,1,2]) # 0='i', 1='am', 2='dog'
heisgod:torch.Tensor = torch.tensor([3,4,2]) # 3='he', 4='is', 2='dog'  (variable name kept; the sentence is "he is dog")
# intuitively, dog in both sentence have different meaning
# in sentence 1, dog is a pet
# in sentence 2, dog is an insult
# we need the sentences to be different in order to learn different patterns for the word 'dog' in both sentences.

# but here is it not like that, as seen below
# run each sentence through the encoder once and keep the vector at position 2 ('dog')
dog_in_iamdog = encoder(iamdog, debug=False)[2]
dog_in_heisgod = encoder(heisgod, debug=False)[2]
print("Dog representation in 'I am dog':", dog_in_iamdog)
print("Dog representation in 'He is dog':", dog_in_heisgod)

# element-wise equality of the two vectors
print(torch.equal(dog_in_iamdog, dog_in_heisgod))
#"The 'dog' token has the same representation in both sentences, which is a problem for learning different meanings."

Dog representation in 'I am dog': tensor([-0.6656,  0.3628], grad_fn=<SelectBackward0>)
Dog representation in 'He is dog': tensor([-0.6656,  0.3628], grad_fn=<SelectBackward0>)
True


In [11]:
import pandas as pd 

# Load the pre-tokenised training split; the preview shows `en` and `hi` columns of token-id sequences.
df :pd.DataFrame = pd.read_parquet("../datasets/iitb/tokenised/train_tokens.parquet")
df.head()

Unnamed: 0,en,hi
0,"[44, 78, 1597, 1927, 5197, 1599, 2317, 1585, 1...","[1788, 1830, 2778, 1563, 2457, 3771, 2016, 258..."
1,"[2345, 2381, 72, 1578, 1546, 2345, 72, 1585, 1...","[4656, 1954, 1530, 1592, 1777, 932, 2457, 3771..."
2,"[1668, 1751, 5422, 1745, 90, 76, 1551, 1612, 9...","[1571, 5305, 479, 4523, 1520, 1614, 2025, 480,..."
3,"[1668, 1751, 5422, 1745, 90, 76, 1551, 1612, 9...","[447, 479, 1796, 479, 4523, 1520, 1614, 2025, ..."
4,"[934, 4053, 1526, 1745, 90, 76, 3455, 1634, 16...","[2091, 1827, 487, 963, 1066, 444, 1828, 1539, ..."


In [12]:
# Number of rows (sentence pairs) in the tokenised training frame.
len(df)

1659083

In [13]:
# Token ids of the first English sentence, converted to a plain Python list.
en_vector = df.iloc[0]['en'].tolist()

In [14]:
# Trying to make the dataloader 
from torch.utils.data import Dataset, DataLoader
from typing import Any

# creating a custom dataset class 
class IITBDataset(Dataset):
    """Custom dataset for IITB English-Hindi translation data.

    Wraps a DataFrame whose ``en`` and ``hi`` columns each hold one sequence
    of token ids per row (a NumPy array or a plain list).
    """

    def __init__(self, dataframe:pd.DataFrame) -> None:
        # The frame is stored by reference, not copied.
        self.df = dataframe

    def __len__(self) -> int:
        """Number of sentence pairs."""
        return len(self.df)

    def __getitem__(self, index) -> tuple[torch.Tensor, torch.Tensor]:
        """returns en_vector, hi_vector for the given index

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self.df)
        if not -size <= index < size:
            raise IndexError("Index out of range")

        # Read each cell straight from its column rather than building the
        # whole row twice. torch.tensor copies its input, so the returned
        # tensors never alias the data stored in the DataFrame.
        en_ids = self.df['en'].iloc[index]
        hi_ids = self.df['hi'].iloc[index]
        return (
            torch.tensor(en_ids, dtype=torch.long),
            torch.tensor(hi_ids, dtype=torch.long),
        )
   
# Wrap the tokenised frame and show the first (en, hi) pair of id tensors as a smoke test.
dataset = IITBDataset(df)
dataset[0]

(tensor([  44,   78, 1597, 1927, 5197, 1599, 2317, 1585, 1615, 4844, 1797,   80,
         1863]),
 tensor([1788, 1830, 2778, 1563, 2457, 3771, 2016, 2583, 2470, 1584,  975, 1558,
         3570, 3903]))

In [15]:
# Peek at the tokenised English column; this shows the id sequences themselves,
# the per-row lengths are computed in a separate cell.
df['en']

0          [44, 78, 1597, 1927, 5197, 1599, 2317, 1585, 1...
1          [2345, 2381, 72, 1578, 1546, 2345, 72, 1585, 1...
2          [1668, 1751, 5422, 1745, 90, 76, 1551, 1612, 9...
3          [1668, 1751, 5422, 1745, 90, 76, 1551, 1612, 9...
4          [934, 4053, 1526, 1745, 90, 76, 3455, 1634, 16...
                                 ...                        
1659078    [1668, 5148, 3898, 1088, 3569, 51, 1529, 2092,...
1659079    [2145, 925, 89, 2100, 1826, 1088, 1519, 5148, ...
1659080    [981, 2119, 1675, 2251, 1556, 1754, 1738, 1707...
1659081    [1668, 51, 41, 934, 4282, 1759, 2525, 4299, 16...
1659082    [981, 2208, 1845, 1707, 4701, 4430, 1743, 1519...
Name: en, Length: 1659083, dtype: object

In [16]:
import copy
# Per-row token counts for both languages.
# `assign` returns a new frame with the two columns replaced, so `df` is left
# untouched without deep-copying the whole dataset first.
len_df = df.assign(
    en=df['en'].map(len),
    hi=df['hi'].map(len),
)
len_df.head()

Unnamed: 0,en,hi
0,13,14
1,15,14
2,18,17
3,15,18
4,15,22


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-sentence token counts.
len_df.describe()

Unnamed: 0,en,hi
count,1659083.0,1659083.0
mean,23.58597,24.29923
std,26.4401,27.0571
min,0.0,0.0
25%,6.0,6.0
50%,16.0,16.0
75%,32.0,33.0
max,4573.0,4014.0


In [18]:
# Report the 95th and 99th percentile sentence lengths for each language.
for heading, column in (("--- English Stats ---", "en"), ("\n--- Hindi Stats ---", "hi")):
    print(heading)
    for label, fraction in (("95th", 0.95), ("99th", 0.99)):
        print(f"{label} Percentile: {len_df[column].quantile(fraction)}")

--- English Stats ---
95th Percentile: 69.0
99th Percentile: 117.0

--- Hindi Stats ---
95th Percentile: 72.0
99th Percentile: 124.0


In [19]:
import tokenizers
from tokenizers import Tokenizer
from tokenizers import decoders

pad_index:int = 1  # id used for padding (passed as pad_id together with pad_token='[PAD]')
max_seq_length:int = 125  # NOTE(review): presumably chosen just above the 99th-percentile lengths (117 en / 124 hi)

tokeniser : Tokenizer = Tokenizer.from_file("../datasets/iitb/tokeniser/bpe_tokeniser_32k.json")
tokeniser.decoder = decoders.BPEDecoder(suffix="</w>")
# Pad on the right up to a fixed length and truncate anything longer.
# pad_id now comes from pad_index, so the constant has a single source of truth.
tokeniser.enable_padding(direction='right', pad_id=pad_index, pad_token='[PAD]', length=max_seq_length)
tokeniser.enable_truncation(max_length=max_seq_length)

# Smoke test: encode one sentence and check the round trip and the padded length.
iamdog: tokenizers.Encoding = tokeniser.encode("[SOS] I am dog [EOS]")
iamdog_tokens, iamdog_ids = iamdog.tokens, iamdog.ids


print(f'decoded output: {tokeniser.decode(iamdog_ids)}')
print(f'length of tokenised vector: {len(iamdog_tokens)}')

decoded output: I am dog
length of tokenised vector: 125


In [29]:
# Load the raw sentence pairs and wrap every cell in [SOS] ... [EOS] markers.
df = pd.read_parquet(
    "../datasets/iitb/raw/train.parquet", engine="pyarrow"
).map(lambda cell: f"[SOS] {cell} [EOS]")
df.head()

Unnamed: 0,en,hi
0,[SOS] Give your application an accessibility w...,[SOS] अपने अनुप्रयोग को पहुंचनीयता व्यायाम का ...
1,[SOS] Accerciser Accessibility Explorer [EOS],[SOS] एक्सेर्साइसर पहुंचनीयता अन्वेषक [EOS]
2,[SOS] The default plugin layout for the bottom...,[SOS] निचले पटल के लिए डिफोल्ट प्लग-इन खाका [EOS]
3,[SOS] The default plugin layout for the top pa...,[SOS] ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका [EOS]
4,[SOS] A list of plugins that are disabled by d...,[SOS] उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप...


In [31]:
# First English/Hindi pair after wrapping with the [SOS]/[EOS] markers.
# NOTE(review): `[0]` here is a label lookup on the index, not a positional one — confirm the index starts at 0.
df['en'][0],df['hi'][0]

('[SOS] Give your application an accessibility workout [EOS]',
 '[SOS] अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें [EOS]')