<center style="font-size: 50px; font-family: Times New Roman; font-weight: bold">Transformer from scratch using Pytorch</center>

<div style="text-align: center; background-color:white;">
    <img src="https://production-media.paperswithcode.com/methods/new_ModalNet-21.jpg" alt="a title" width="300" height="100">
</div>

In [1]:
# ignore all the unwanted warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load modules/libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import numpy as np

<center style="font-size: 25px; font-family: Times New Roman; font-weight: bold">Translation Dataset</center>

In [98]:
english = ["Hello, how are you?"]
hindi = ["नमस्ते, आप कैसे हैं?"]
    
src_corpus = " ".join(english)
tgt_corpus = " ".join(hindi)
    
src_vocabulary = src_corpus.split()
src_dictionary = {word: pos for pos, word in enumerate(src_vocabulary)}

tgt_vocabulary = tgt_corpus.split()
tgt_dictionary = {word: pos for pos, word in enumerate(tgt_vocabulary)}

r_tgt_dictionary = {i:word for word, i in tgt_dictionary.items()}

<center style="font-size: 25px; font-family: Times New Roman; font-weight: bold">Encoder Input Components</center>

In [52]:
d_model = 8                                                 # embedding dimension

### Tokenization

In [51]:
# Tokens
ip_sne = english[0]
ip_tkns = ip_sne.split()
num_tkns = [src_dictionary[tkn] for tkn in ip_tkns]
print(f"Word Tokens: {ip_tkns}")
print(f"Word to Index: {num_tkns}")

Word Tokens: ['Hello,', 'how', 'are', 'you?']
Word to Index: [0, 1, 2, 3]


### Word Embedding

In [54]:
vocab_size = len(src_vocabulary)                            # Total words in vocabulary
ebdg_lyr = nn.Embedding(vocab_size, d_model)                # generates the embedding for given token    

wrd_ebds = []                                               # all tokens' embedding
for tkn in num_tkns:
    ebd = ebdg_lyr(torch.tensor(tkn))
    wrd_ebds.append(ebd)

print(f"{ip_tkns[0]} embedding: {wrd_ebds[0].tolist()}")
print()
print(f"{ip_tkns[1]} embedding: {wrd_ebds[1].tolist()}")

Hello, embedding: [-0.9397754669189453, -0.751417875289917, 1.698652744293213, -1.1321967840194702, -0.19192473590373993, -0.33042582869529724, -0.10228903591632843, -0.26329025626182556]

how embedding: [0.1551840752363205, 1.466767430305481, 1.4544599056243896, 0.4266636371612549, -0.4274609386920929, 0.5144870281219482, -0.06327193975448608, -1.214971661567688]


### Positional Encoding

<div style="text-align: center; background-color:white;">
    <img src="https://miro.medium.com/v2/resize:fit:1044/1*fX8TN02pB5G1pLNsJIC6QA.png" alt="a title" width="300" height="100">
</div>

In [56]:
psnl_encs = []                                                        # positional encodings of word embeddings

for pos, ebd in enumerate(wrd_ebds):
    d_enc = []                                                        # encoding by dimensions 
    for i in range(0, d_model//2):                                    # 0 <= i < d_model/2
        deno = math.pow(10000, (2*i)/d_model)                         # denominator term => 10000^(2i/d_model)
        evn_pos_v = np.sin(pos/deno)                                  # sine value for even positions
        d_enc.append(evn_pos_v)                
        
        odd_pos_v = np.cos(pos/deno)                                  # cosine value for odd positions
        d_enc.append(odd_pos_v)
    
    psnl_encs.append(d_enc)                                           # add each word initial positional encodings         
psnl_encs = torch.tensor(psnl_encs)


for pos, wrd_ebd in enumerate(wrd_ebds):
    psnl_encs[pos] = psnl_encs[pos] + wrd_ebd                         # positional encoding = word embedding + initial positional encoding
    
print(f"{ip_tkns[0]} Positional Encoding: {psnl_encs[0].tolist()}")
print()
print(f"{ip_tkns[1]} Positional Encoding: {psnl_encs[1].tolist()}")

Hello, Positional Encoding: [-0.9397754669189453, 0.248582124710083, 1.698652744293213, -0.13219678401947021, -0.19192473590373993, 0.6695741713047028, -0.10228903591632843, 0.7367097437381744]

how Positional Encoding: [0.996655060044217, 2.0070697361736207, 1.5542933222712179, 1.4216678024392806, -0.41746110535792624, 1.5144370285386135, -0.06227193992115274, -0.2149721615676463]


<center style="font-size: 25px; font-family: Times New Roman; font-weight: bold">Encoder Mechanism Components</center>

### Self Attention

<div style="text-align: center; background-color:white;">
    <img src="https://miro.medium.com/v2/resize:fit:920/1*3eAlr3AEQpoZ9voSp1TstQ.png" alt="a title" width="300" height="100">
</div>

In [57]:
##### Wq, Wk, Wv matrix of parameters for generating the Q, K, V vectors
Wq = nn.Linear(d_model, d_model, dtype=torch.float64)
Wk = nn.Linear(d_model, d_model, dtype=torch.float64)
Wv = nn.Linear(d_model, d_model, dtype=torch.float64)

Q, K, V = [], [], []                                                   # Matrix for all Query, Key and Value vectors
for ebd in psnl_encs:
    q = Wq(ebd)
    Q.append(q)
    
    k = Wk(ebd)
    K.append(k)
    
    v = Wv(ebd)
    V.append(v)

Q = torch.stack(Q, dim=0)                                              # Stacked all Query vector as matrix
K = torch.stack(K, dim=0)                                              # Stacked all Key vector as matrix
V = torch.stack(V, dim=0)                                              # Stacked all Value vector as matrix

d_k = d_model                                                          # dimensionality of Key vector

# calculate QK_T
QK_T = torch.matmul(Q, torch.t(K))                
sld_QK_T = QK_T/math.sqrt(d_k)                                         # scaled dot product of QK_T by sqrt(d_k)
soft_QK_T = torch.softmax(sld_QK_T, dim=-1)                            # apply softmax
self_attn = torch.matmul(soft_QK_T, V)         

print(f"Self Attention on {ip_tkns[0]}: {self_attn[0].tolist()}")
print()
print(f"Self Attention on {ip_tkns[1]}: {self_attn[1].tolist()}")

Self Attention on Hello,: [-0.11540083527637228, -0.7728133724431447, -0.3088951026179383, -0.5082695126283093, -1.24005988583988, -0.5787977238577567, 0.46306246460804634, 0.014817102869968141]

Self Attention on how: [-0.11636640240634882, -0.7819564715518782, -0.3233943365215331, -0.523987304255661, -1.268926464963557, -0.5916178181033684, 0.4697221668608989, 0.03930105218907754]


### MultiHead Attention

<div style="text-align: center; background-color:white;">
    <img src="https://production-media.paperswithcode.com/methods/multi-head-attention_l1A3G7a.png" alt="a title" width="300" height="100">
</div>

#### For simplicity we assume only 2 heads of Self Attention for Multihead Attention

In [59]:
'''*****************************************************: First Head :*****************************************************'''
##### Wq, Wk, Wv matrix of parameters for generating the Q, K, V vectors
Wq_1 = nn.Linear(d_model, d_model, dtype=torch.float64)
Wk_1 = nn.Linear(d_model, d_model, dtype=torch.float64)
Wv_1 = nn.Linear(d_model, d_model, dtype=torch.float64)

Q_1, K_1, V_1 = [], [], []                                                   # Matrix for all Query, Key and Value vectors
for ebd in psnl_encs:
    q = Wq_1(ebd)
    Q_1.append(q)
    
    k = Wk_1(ebd)
    K_1.append(k)
    
    v = Wv_1(ebd)
    V_1.append(v)

Q_1 = torch.stack(Q_1, dim=0)                                              # Stacked all Query vector as matrix
K_1 = torch.stack(K_1, dim=0)                                              # Stacked all Key vector as matrix
V_1 = torch.stack(V_1, dim=0)                                              # Stacked all Value vector as matrix

d_k = d_model                                                          # dimensionality of Key vector

# calculate QK_T
QK_T_1 = torch.matmul(Q_1, torch.t(K_1))                
sld_QK_T_1 = QK_T_1/math.sqrt(d_k)                                         # scaled dot product of QK_T by sqrt(d_k)
soft_QK_T_1 = torch.softmax(sld_QK_T_1, dim=-1)                            # apply softmax
self_attn_1 = torch.matmul(soft_QK_T_1, V_1)         
'''***************************************************: First Head End :***************************************************'''

'''*****************************************************: Second Head :****************************************************'''
##### Wq, Wk, Wv matrix of parameters for generating the Q, K, V vectors
Wq_2 = nn.Linear(d_model, d_model, dtype=torch.float64)
Wk_2 = nn.Linear(d_model, d_model, dtype=torch.float64)
Wv_2 = nn.Linear(d_model, d_model, dtype=torch.float64)

Q_2, K_2, V_2 = [], [], []                                                   # Matrix for all Query, Key and Value vectors
for ebd in psnl_encs:
    q = Wq_2(ebd)
    Q_2.append(q)
    
    k = Wk_2(ebd)
    K_2.append(k)
    
    v = Wv_2(ebd)
    V_2.append(v)

Q_2 = torch.stack(Q_2, dim=0)                                              # Stacked all Query vector as matrix
K_2 = torch.stack(K_2, dim=0)                                              # Stacked all Key vector as matrix
V_2 = torch.stack(V_2, dim=0)                                              # Stacked all Value vector as matrix

d_k = d_model                                                          # dimensionality of Key vector

# calculate QK_T
QK_T_2 = torch.matmul(Q_2, torch.t(K_2))                
sld_QK_T_2 = QK_T_2/math.sqrt(d_k)                                         # scaled dot product of QK_T by sqrt(d_k)
soft_QK_T_2 = torch.softmax(sld_QK_T_2, dim=-1)                            # apply softmax
self_attn_2 = torch.matmul(soft_QK_T_2, V)         
'''***************************************************: Second Head End :**************************************************'''

# concatenate both the self attention embeddings
ct_attn = torch.cat((self_attn_1, self_attn_2), 1)

# apply linear transformation
W_c = nn.Linear(ct_attn.shape[1], d_model, dtype=torch.float64)

multihead_attn = W_c(ct_attn)

print(f"MultiHead Attention on {ip_tkns[0]}: {multihead_attn[0].tolist()}")
print()
print(f"MultiHead Attention on {ip_tkns[1]}: {multihead_attn[1].tolist()}")

MultiHead Attention on Hello,: [-0.023973938862265287, 0.04155674399469244, -0.02479283062500319, -0.07631544042817404, 0.2267751287121317, -0.3596756759025933, 0.14273770078297232, -0.09438905317559945]

MultiHead Attention on how: [-0.04706596701729221, 0.07895212595552134, -0.004185196714070014, -0.11031068988956347, 0.2675477194032071, -0.336279049440434, 0.17618500779841367, -0.1270560167661626]


### MultiHead Attention + Add & Norm

In [60]:
# Normalize(O/P MultiHead Attention + Positional Encodings)
add_attn = multihead_attn + psnl_encs

# normalization layer
norm_l = nn.LayerNorm(d_model, dtype=torch.float64)

norm_ebds = norm_l(add_attn)

print(f"Normalized Embedding of {ip_tkns[0]}: {norm_ebds[0].tolist()}")
print()
print(f"Normalized Embedding of {ip_tkns[1]}: {norm_ebds[1].tolist()}")

Normalized Embedding of Hello,: [-1.6938680200926948, 0.08920712711974482, 2.0569092391444808, -0.6198930291081951, -0.2738224553422061, 0.11730604151198197, -0.2658615070045281, 0.5900226037714165]

Normalized Embedding of how: [0.13801448012055995, 1.5329489579299709, 0.87513217492175, 0.5820730264082029, -1.2115892606973535, 0.41857539705642693, -0.8877508193796517, -1.4474039563599042]


### Feed Forward + Addition & Normalization

In [61]:
# 2 layer feed forward network
# 1 layer activation ---> relu
# 2 lauer activation ---> linear
fc1 = nn.Linear(d_model, 2048, dtype=torch.float64)
fc2 = nn.Linear(2048, d_model, dtype=torch.float64)
fc1_atv = nn.ReLU()

fc_ebds = []
for ebd in norm_ebds:
    fc_ebd = fc2(fc1_atv(fc1(ebd)))
    
    fc_ebds.append(fc_ebd)
fc_ebds = torch.stack(fc_ebds, dim=0)

# Normalized Embeddings + FC Network Embedding
ebds = norm_ebds + fc_ebds

# Normalized Embeddings
norm_l = nn.LayerNorm(d_model, dtype=torch.float64)
e_norm_ebds = norm_l(ebds)
print(f"Normalized Embedding of {ip_tkns[0]}: {e_norm_ebds[0].tolist()}")
print()
print(f"Normalized Embedding of {ip_tkns[1]}: {e_norm_ebds[1].tolist()}")

Normalized Embedding of Hello,: [-1.5977213990549342, 0.2631307972251235, 1.9878160556149234, -0.7780540218213827, 0.09604315129400895, 0.333180174899737, -0.7243160447433891, 0.4199212865859132]

Normalized Embedding of how: [-0.08706379767271669, 1.5220678891994877, 0.9713555333059934, 0.6688357751029529, -0.891943047104795, 0.393775352442499, -1.3710556094616986, -1.2059720958117226]


In [62]:
e_norm_ebds   # final encoder output

tensor([[-1.5977,  0.2631,  1.9878, -0.7781,  0.0960,  0.3332, -0.7243,  0.4199],
        [-0.0871,  1.5221,  0.9714,  0.6688, -0.8919,  0.3938, -1.3711, -1.2060],
        [-1.2574, -1.0808, -0.3093,  1.5834,  0.8345,  1.0326, -0.9321,  0.1292],
        [-1.4462, -1.2725, -0.2780, -0.1909,  1.8371,  0.2640,  0.8085,  0.2780]],
       dtype=torch.float64, grad_fn=<NativeLayerNormBackward0>)

<center style="font-size: 25px; font-family: Times New Roman; font-weight: bold">Decoder Mechanism Components</center>

### Output Input(Only for Training)

In [64]:
# same as encoder input

# Tokens
op_ip_sne = hindi[0] 
op_ip_tkns = op_ip_sne.split()
num_tkns = [tgt_dictionary[tkn] for tkn in op_ip_tkns]
print(f"Word Tokens: {op_ip_tkns}")
print(f"Word to Index: {num_tkns}")

print("\n********************************************************************************************************************\n")

vocab_size = len(tgt_vocabulary)                                                        # Total words in vocabulary
ebdg_lyr = nn.Embedding(vocab_size, d_model)            # generates the embedding for given token    

wrd_ebds = []                                           # all tokens' embedding
for tkn in num_tkns:
    ebd = ebdg_lyr(torch.tensor(tkn))
    wrd_ebds.append(ebd)

print(f"{op_ip_tkns[0]} embedding: {wrd_ebds[0].tolist()}")
print(f"{op_ip_tkns[1]} embedding: {wrd_ebds[1].tolist()}")

print("\n********************************************************************************************************************\n")

psnl_encs = []         # positional encodings of word embeddings

for pos, ebd in enumerate(wrd_ebds):
    d_enc = []              # encoding by dimensions 
    for i in range(0, d_model//2):          # 0 <= i < d_model/2
        deno = math.pow(10000, (2*i)/d_model)            # denominator term => 10000^(2i/d_model)
        evn_pos_v = np.sin(pos/deno)        # sine value for even positions
        d_enc.append(evn_pos_v)                
        
        odd_pos_v = np.cos(pos/deno)        # cosine value for odd positions
        d_enc.append(odd_pos_v)
    
    psnl_encs.append(d_enc)             # add each word initial positional encodings         
psnl_encs = torch.tensor(psnl_encs)


for pos, wrd_ebd in enumerate(wrd_ebds):
    psnl_encs[pos] = psnl_encs[pos] + wrd_ebd         # positional encoding = word embedding + initial positional encoding
    
print(f"{op_ip_tkns[0]} Positional Encoding: {psnl_encs[0].tolist()}")
print(f"{op_ip_tkns[1]} Positional Encoding: {psnl_encs[1].tolist()}")

Word Tokens: ['नमस्ते,', 'आप', 'कैसे', 'हैं?']
Word to Index: [0, 1, 2, 3]

********************************************************************************************************************

नमस्ते, embedding: [0.16302567720413208, -0.13589109480381012, -0.3482878804206848, 0.545319676399231, 0.6449902057647705, 0.6906088590621948, 1.2094029188156128, 0.946586549282074]
आप embedding: [-0.0340677835047245, -1.6508357524871826, -0.36994829773902893, 1.3680638074874878, 1.79978346824646, -0.2353883981704712, 0.8821811079978943, -0.6579354405403137]

********************************************************************************************************************

नमस्ते, Positional Encoding: [0.16302567720413208, 0.8641089051961899, -0.3482878804206848, 1.545319676399231, 0.6449902057647705, 1.6906088590621948, 1.2094029188156128, 1.946586549282074]
आप Positional Encoding: [0.807403201303172, -1.1105334466190429, -0.2701148810922008, 2.3630679727655135, 1.8097833015806266, 0.76456160

### Masked (MultiHead Attention)

##### Same as in Encoder. There is only one difference, when we scale dot product of Query vectors and Key vectors, we apply mask matrix on resultant dot product matrix also.

In [67]:
# create a mask matrix (for inference only)
mask = torch.full((len(op_ip_tkns), len(op_ip_tkns)), fill_value=float('-inf'), dtype=torch.float64)
mask = torch.triu(mask, diagonal=1)

In [68]:
'''*****************************************************: First Head :*****************************************************'''
##### Wq, Wk, Wv matrix of parameters for generating the Q, K, V vectors
Wq_1 = nn.Linear(d_model, d_model, dtype=torch.float64)
Wk_1 = nn.Linear(d_model, d_model, dtype=torch.float64)
Wv_1 = nn.Linear(d_model, d_model, dtype=torch.float64)

Q_1, K_1, V_1 = [], [], []                                                   # Matrix for all Query, Key and Value vectors
for ebd in psnl_encs:
    q = Wq_1(ebd)
    Q_1.append(q)
    
    k = Wk_1(ebd)
    K_1.append(k)
    
    v = Wv_1(ebd)
    V_1.append(v)

Q_1 = torch.stack(Q_1, dim=0)                                              # Stacked all Query vector as matrix
K_1 = torch.stack(K_1, dim=0)                                              # Stacked all Key vector as matrix
V_1 = torch.stack(V_1, dim=0)                                              # Stacked all Value vector as matrix

d_k = d_model                                                          # dimensionality of Key vector

# calculate QK_T
QK_T_1 = torch.matmul(Q_1, torch.t(K_1))                
sld_QK_T_1 = QK_T_1/math.sqrt(d_k)                                         # scaled dot product of QK_T by sqrt(d_k)

# mask matrix apply
sld_QK_T_1 = sld_QK_T_1 + mask

soft_QK_T_1 = torch.softmax(sld_QK_T_1, dim=-1)                            # apply softmax
self_attn_1 = torch.matmul(soft_QK_T_1, V_1)         
'''***************************************************: First Head End :***************************************************'''

'''*****************************************************: Second Head :****************************************************'''
##### Wq, Wk, Wv matrix of parameters for generating the Q, K, V vectors
Wq_2 = nn.Linear(d_model, d_model, dtype=torch.float64)
Wk_2 = nn.Linear(d_model, d_model, dtype=torch.float64)
Wv_2 = nn.Linear(d_model, d_model, dtype=torch.float64)

Q_2, K_2, V_2 = [], [], []                                                   # Matrix for all Query, Key and Value vectors
for ebd in psnl_encs:
    q = Wq_2(ebd)
    Q_2.append(q)
    
    k = Wk_2(ebd)
    K_2.append(k)
    
    v = Wv_2(ebd)
    V_2.append(v)

Q_2 = torch.stack(Q_2, dim=0)                                              # Stacked all Query vector as matrix
K_2 = torch.stack(K_2, dim=0)                                              # Stacked all Key vector as matrix
V_2 = torch.stack(V_2, dim=0)                                              # Stacked all Value vector as matrix

d_k = d_model                                                          # dimensionality of Key vector

# calculate QK_T
QK_T_2 = torch.matmul(Q_2, torch.t(K_2))                
sld_QK_T_2 = QK_T_2/math.sqrt(d_k)                                   # scaled dot product of QK_T by sqrt(d_k)

# mask matrix apply
sld_QK_T_2 = sld_QK_T_2 + mask

soft_QK_T_2 = torch.softmax(sld_QK_T_2, dim=-1)                            # apply softmax
self_attn_2 = torch.matmul(soft_QK_T_2, V_2)         
'''***************************************************: Second Head End :**************************************************'''

# concatenate both the self attention embeddings
ct_attn = torch.cat((self_attn_1, self_attn_2), 1)

# apply linear transformation
W_c = nn.Linear(ct_attn.shape[1], d_model, dtype=torch.float64)

multihead_attn = W_c(ct_attn)

print(f"MultiHead Attention on {op_ip_tkns[0]}: {multihead_attn[0].tolist()}")
print()
print(f"MultiHead Attention on {op_ip_tkns[1]}: {multihead_attn[1].tolist()}")

MultiHead Attention on नमस्ते,: [0.19808594961956605, 0.16539107406989303, 0.16047227468059666, 0.8443892117467038, 0.3126989019256736, -0.7121538366547243, -0.29480363633604906, 0.3437514303539883]

MultiHead Attention on आप: [0.4279334990665706, 0.08269069198801696, 0.14926783603924776, 0.5763281096211011, 0.22916804844579108, -0.715859526398316, -0.12981713852731874, 0.3195667444414096]


### Addition & Normalization

In [69]:
# Normalize(O/P MultiHead Attention + Positional Encodings)
add_attn = multihead_attn + psnl_encs

# normalization layer
norm_l = nn.LayerNorm(d_model, dtype=torch.float64)

norm_ebds = norm_l(add_attn)

print(f"Normalized Embedding of {sne_tkns[0]}: {norm_ebds[0].tolist()}")
print()
print(f"Normalized Embedding of {sne_tkns[1]}: {norm_ebds[1].tolist()}")

Normalized Embedding of This: [-0.8926722070909469, -0.0759974170179957, -1.5633826268360298, 1.5859829586956378, -0.16374001195503973, -0.13836703981955928, -0.21638960381250613, 1.4645659478364395]

Normalized Embedding of were: [0.3563367313873898, -1.5672253138521985, -0.7963352643316063, 1.804681423117092, 1.039359297566334, -0.6522290366756707, -0.05331017057177913, -0.13127766663956061]


### Cross Attention

##### Cross attention takes Key and Value vector from encoder and apply the attention on Q(decoders previous step output), K, V vectors

In [71]:
encoder_ip = e_norm_ebds                  # encoder input

In [77]:
'''*****************************************************: First Head :*****************************************************'''
##### Wq, Wk, Wv matrix of parameters for generating the Q, K, V vectors
Wq = nn.Linear(d_model, d_model, dtype=torch.float64)
Wk = nn.Linear(d_model, d_model, dtype=torch.float64)
Wv = nn.Linear(d_model, d_model, dtype=torch.float64)

Q_1, K_1, V_1 = [], [], []
for ebd in encoder_ip:
    k = Wk(ebd)
    K_1.append(k)
    
    v = Wv(ebd)
    V_1.append(v)
    
for ebd in norm_ebds:
    q = Wq(q)
    Q_1.append(q)
    
Q_1 = torch.stack(Q_1, dim=0)                                              # Stacked all Query vector as matrix
K_1 = torch.stack(K_1, dim=0)                                              # Stacked all Key vector as matrix
V_1 = torch.stack(V_1, dim=0)                                              # Stacked all Value vector as matrix

d_k = K_1.size()[1]                                                        # dimensionality of Key vector

# calculate QK_T
QK_T_1 = torch.matmul(Q_1, torch.t(K_1))                
sld_QK_T_1 = QK_T_1/math.sqrt(d_k)                                         # scaled dot product of QK_T by sqrt(d_k)

soft_QK_T_1 = torch.softmax(sld_QK_T_1, dim=-1)                            # apply softmax
self_attn_1 = torch.matmul(soft_QK_T_1, V_1)         
'''***************************************************: First Head End :***************************************************'''

'''*****************************************************: Second Head :*****************************************************'''
##### Wq, Wk, Wv matrix of parameters for generating the Q, K, V vectors
Wq = nn.Linear(d_model, d_model, dtype=torch.float64)
Wk = nn.Linear(d_model, d_model, dtype=torch.float64)
Wv = nn.Linear(d_model, d_model, dtype=torch.float64)

Q_2, K_2, V_2 = [], [], []
for ebd in encoder_ip:
    k = Wk(ebd)
    K_2.append(k)
    
    v = Wv(ebd)
    V_2.append(v)
    
for ebd in norm_ebds:
    q = Wq(q)
    Q_2.append(q)
    
Q_2 = torch.stack(Q_2, dim=0)                                              # Stacked all Query vector as matrix
K_2 = torch.stack(K_2, dim=0)                                              # Stacked all Key vector as matrix
V_2 = torch.stack(V_2, dim=0)                                              # Stacked all Value vector as matrix

d_k = K_2.size()[1]                                                        # dimensionality of Key vector

# calculate QK_T
QK_T_2 = torch.matmul(Q_2, torch.t(K_2))                
sld_QK_T_2 = QK_T_2/math.sqrt(d_k)                                         # scaled dot product of QK_T by sqrt(d_k)

soft_QK_T_2 = torch.softmax(sld_QK_T_2, dim=-1)                            # apply softmax
self_attn_2 = torch.matmul(soft_QK_T_2, V_2)         
'''***************************************************: Second Head End:***************************************************'''

# concatenate both the self attention embeddings
ct_attn = torch.cat((self_attn_1, self_attn_2), 1)

# apply linear transformation
W_c = nn.Linear(ct_attn.shape[1], d_model, dtype=torch.float64)

multihead_attn = W_c(ct_attn)

print(f"MultiHead Attention on {op_ip_tkns[0]}: {multihead_attn[0].tolist()}")
print()
print(f"MultiHead Attention on {op_ip_tkns[1]}: {multihead_attn[1].tolist()}")

# Normalize(O/P MultiHead Attention + Positional Encodings)
add_attn = multihead_attn + norm_ebds

# normalization layer
norm_l = nn.LayerNorm(d_model, dtype=torch.float64)

norm_ebds = norm_l(add_attn)

print(f"Normalized Embedding of {op_ip_tkns[0]}: {norm_ebds[0].tolist()}")
print()
print(f"Normalized Embedding of {op_ip_tkns[1]}: {norm_ebds[1].tolist()}")

MultiHead Attention on नमस्ते,: [0.19412608319423683, -0.4134067730647788, -0.44157129144725205, 0.276438487789221, -0.06861453459434408, -0.13484692236640433, -0.04966532369715215, -0.26276689295352923]

MultiHead Attention on आप: [0.2425023851522156, -0.3920071292916776, -0.4463137353767231, 0.25749483058432004, -0.07412023114123495, -0.14043064094672436, -0.07044489287986477, -0.26863468648471495]
Normalized Embedding of नमस्ते,: [-0.5305889123318833, -0.34122555433786445, -1.7134495770644247, 1.7881876715772989, -0.10848512403299257, -0.1454804602121999, -0.13899845623729706, 1.190040412639363]

Normalized Embedding of आप: [0.5891429782951291, -1.5324944693208842, -0.9381677538780204, 1.8028192951038151, 0.8930312145483321, -0.5649513779909473, -0.010168978360420324, -0.23921090839700387]


### Feed Forward + Addition & Normalization

In [79]:
# 2 layer feed forward network
# 1 layer activation ---> relu
# 2 lauer activation ---> linear
fc1 = nn.Linear(d_model, 2048, dtype=torch.float64)
fc2 = nn.Linear(2048, d_model, dtype=torch.float64)
fc1_atv = nn.ReLU()

fc_ebds = []
for ebd in norm_ebds:
    fc_ebd = fc2(fc1_atv(fc1(ebd)))
    
    fc_ebds.append(fc_ebd)
fc_ebds = torch.stack(fc_ebds, dim=0)

# Normalized Embeddings + FC Network Embedding
ebds = norm_ebds + fc_ebds

# Normalized Embeddings
norm_l = nn.LayerNorm(d_model, dtype=torch.float64)
d_norm_ebds = norm_l(ebds)
print(f"Normalized Embedding of {op_ip_tkns[0]}: {d_norm_ebds[0].tolist()}")
print()
print(f"Normalized Embedding of {op_ip_tkns[1]}: {d_norm_ebds[1].tolist()}")

Normalized Embedding of नमस्ते,: [-0.7225607102665331, -0.8623719294065467, -1.3742873563282956, 1.8788633493526363, -0.08738754894528272, -0.07391608981899199, 0.10538202841485675, 1.1362782569981573]

Normalized Embedding of आप: [0.5968812457273729, -1.7655305946243558, -0.4309792128108882, 1.6987930322288112, 0.9172430057869435, -0.7017817356034783, 0.031860181011967276, -0.3464859217163721]


### Linear + Softmax Layer

In [82]:
d_norm_ebds

tensor([[-0.7226, -0.8624, -1.3743,  1.8789, -0.0874, -0.0739,  0.1054,  1.1363],
        [ 0.5969, -1.7655, -0.4310,  1.6988,  0.9172, -0.7018,  0.0319, -0.3465],
        [ 1.9808, -0.7952,  0.1023, -0.4037,  0.3948, -1.6221, -0.2892,  0.6323],
        [-0.8660, -1.7977, -0.1495,  1.7839,  0.0571,  0.5224, -0.2422,  0.6920]],
       dtype=torch.float64, grad_fn=<NativeLayerNormBackward0>)

In [91]:
fc = nn.Linear(d_model, vocab_size, dtype=torch.float64)
prbs = torch.softmax(fc(d_norm_ebds), dim=-1)


max_indices = torch.argmax(prbs, dim=1)
print(f"Maximum probability index: {max_indices}")

Maximum probability index: tensor([1, 3, 0, 1])


<center style="font-size: 25px; font-family: Times New Roman; font-weight: bold">Output Probability Index, Maping to Words</center>

In [101]:
trnl = ""
for index in max_indices.tolist():
    trnl += r_tgt_dictionary[index] + " "

print(f"English Sentence: {english[0]}")
print(f"Hindi Sentence(Orignal): {hindi[0]}")
print()
print(f"Hindi Sentence(Generated): {trnl}")

English Sentence: Hello, how are you?
Hindi Sentence(Orignal): नमस्ते, आप कैसे हैं?

Hindi Sentence(Generated): आप हैं? नमस्ते, आप 
