In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every bare expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
%autosave 20

Autosaving every 20 seconds


In [3]:
import torch
import torch.nn as nn
import math

In [4]:
# Toy sizes shared by the demo cells in this notebook.
batch_size = 4  # number of sequences in the random id batch
seq_len = 16  # number of token ids per sequence
d_model = 8  # width of each embedding vector
vocab_size = 100  # ids are drawn from [0, vocab_size)

# Input Embedding

In [5]:
class InputEmbeddings(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(d_model).

    Args:
        d_model: Width of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        # Keep both sizes on the module so they can be inspected later.
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Map ids of shape (batch, seq_len) to vectors of shape (batch, seq_len, d_model)."""
        # The sqrt(d_model) factor follows the scaling described in the paper.
        scale = math.sqrt(self.d_model)
        token_vectors = self.embedding(x)
        return token_vectors * scale

In [6]:
# Random integer ids in [0, vocab_size) with shape (batch_size, seq_len).
# NOTE(review): no seed is set in the visible cells, so these ids differ between runs.
x = torch.randint(0,vocab_size,(batch_size,seq_len))
x.shape
x

torch.Size([4, 16])

tensor([[45, 10, 47, 10,  1, 85, 94, 84,  7, 57, 56, 90, 88, 26, 82, 70],
        [74, 16, 34, 68, 93, 68, 88, 45, 54, 93, 76, 65, 35, 70, 72, 10],
        [71, 46, 96, 30, 70, 47, 23, 47, 35, 24, 80, 91, 29, 41, 53, 19],
        [21, 82, 24, 74, 70, 60, 57, 52,  9, 66, 98, 53, 16, 80,  5, 59]])

In [7]:
# Embed the random ids; the result gains a trailing d_model axis.
embeddings = InputEmbeddings(d_model, vocab_size)
output_emb = embeddings(x)
output_emb.shape
output_emb

torch.Size([4, 16, 8])

tensor([[[-3.0857,  2.5561, -1.5502,  4.6947,  2.7813,  1.6331,  3.3028,
          -1.7598],
         [-0.4488, -2.3459,  0.1917, -2.5710, -5.4087, -0.9784,  2.5236,
           2.5344],
         [-0.5505,  0.2772,  1.8915,  6.7931,  2.9333, -2.7175, -5.6576,
           4.5879],
         [-0.4488, -2.3459,  0.1917, -2.5710, -5.4087, -0.9784,  2.5236,
           2.5344],
         [-1.7179,  6.2368, -3.7011, -1.4160,  0.6128, -2.5922,  1.8761,
           3.8129],
         [ 4.5298,  0.4563,  2.1949,  2.5172, -0.1886,  0.5180,  1.7574,
           3.7915],
         [ 1.3536,  4.3405,  1.1270,  0.1371, -1.5373,  1.4282, -6.5935,
          -0.8932],
         [ 1.0093, -7.3339,  3.7362, -0.4357, -1.0572,  2.8729, -4.1980,
           6.2552],
         [ 1.6804,  2.9969,  0.3324,  0.1212, -0.5262, -1.2266, -1.4652,
          -2.2411],
         [ 4.2105,  4.4562,  0.9855,  5.0401,  2.8744, -2.0868, -0.9742,
           1.2223],
         [ 2.9400,  1.3464,  0.3139, -0.6258, -2.9078, -1.7433,  1.294

In [8]:
# A bare nn.Embedding with the same sizes, used to inspect the lookup on its own.
e = nn.Embedding(num_embeddings= vocab_size,embedding_dim=d_model)

In [9]:
# Example of raw text input.
# NOTE(review): x1 is not used by any visible cell; nn.Embedding expects integer ids,
# so these strings would first have to be mapped to ids.
x1 = [["i","Like","Math"],["I","Like","Music"]]

In [10]:
# Multiplying by the scalar sqrt(d_model) leaves the embedding shape unchanged.
e(x).shape
(e(x) * (math.sqrt(d_model))).shape

torch.Size([4, 16, 8])

torch.Size([4, 16, 8])

# Positional Encoding

In [17]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then apply dropout.

    The table is built once in the constructor and stored as the buffer ``pe``
    of shape (1, seq_len, d_model). Even columns hold
    ``sin(pos / 10000 ** (2i / d_model))`` and odd columns hold the matching
    cosine. Both even and odd ``d_model`` are supported.

    Args:
        d_model: Width of each encoding vector.
        seq_len: Number of positions stored in the table.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Table to fill in: (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Positions 0 .. seq_len - 1 as a column vector: (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000 ** (2i / d_model), one entry per even column, computed via exp/log
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # pos / 10000 ** (2i / d_model) for every (position, even column) pair
        angles = position * div_term
        # Sine goes into the even columns
        pe[:, 0::2] = torch.sin(angles)
        # Cosine goes into the odd columns. For odd d_model there is one fewer odd
        # column than even columns, so the last angle column is left out.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading axis of size 1 so the table broadcasts over the batch: (1, seq_len, d_model)
        pe = pe.unsqueeze(0)
        # A buffer is stored with the module but is not a trainable parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.shape[1]`` rows of the table to ``x`` and apply dropout.

        ``x`` is expected to be (batch, seq_len, d_model); the result has the same shape.
        """
        # The buffer does not require grad, so the slice needs no explicit detaching.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

In [18]:
# Inspect the building blocks used inside PositionalEncoding.__init__.
output_emb.shape[1]
# Even column indices 0, 2, ..., which feed div_term
torch.arange(0,d_model,2).float() 
# Position indices 0 .. seq_len - 1
torch.arange(0,seq_len,dtype=torch.float)
# Empty (seq_len, d_model) table that the encodings are written into
torch.zeros(seq_len,d_model)
# NOTE(review): same expression as the line above; it is displayed a second time.
torch.zeros(seq_len,d_model)

16

tensor([0., 2., 4., 6.])

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15.])

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [19]:
# Add positional encodings to the scaled embeddings.
# NOTE(review): p is never switched to eval() in the visible cells, so dropout is
# active here and o_p_embeding contains randomly zeroed entries.
p = PositionalEncoding(d_model=d_model,seq_len=seq_len,dropout=0.1)
o_p_embeding = p(output_emb)

In [20]:
# Shape after adding the positional encodings.
o_p_embeding.shape

torch.Size([4, 16, 8])

In [21]:
## Layer Normalization

In [22]:
# Display the nn.Parameter class, which LayerNormalization uses for alpha and bias.
nn.Parameter

torch.nn.parameter.Parameter

In [23]:
class LayerNormalization(nn.Module):
    """Normalise the last dimension to zero mean and unit standard deviation.

    A single learnable scale (``alpha``) and a single learnable shift (``bias``)
    are applied to the whole normalised tensor.

    Args:
        eps: Constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self,eps: float = 10**-6):
        super().__init__()
        self.eps = eps
        # One-element parameters: scale starts at 1, shift starts at 0.
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self,x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` computed over the last axis."""
        # keepdim=True keeps the reduced axis so the statistics broadcast against x.
        mean = torch.mean(x, dim=-1, keepdim=True)
        std = torch.std(x, dim=-1, keepdim=True)
        centered = x - mean
        denom = std + self.eps
        return self.alpha * centered / denom + self.bias

In [24]:
# Initial values used for LayerNormalization's alpha (ones) and bias (zeros).
torch.ones(1)
torch.zeros(1)

tensor([1.])

tensor([0.])

In [25]:
# Compare one vector (first sequence, first position) after and before normalisation.
lm = LayerNormalization()
lm(o_p_embeding)[0][0]
o_p_embeding[0][0]

tensor([-1.7178,  0.6342, -0.6251,  1.3915,  0.3598,  0.3073,  0.5444, -0.8942],
       grad_fn=<SelectBackward0>)

tensor([-3.4286,  3.9512, -0.0000,  6.3274,  3.0903,  2.9257,  3.6697, -0.8442],
       grad_fn=<SelectBackward0>)

In [26]:
# Feed Forward Network 

In [27]:
class FeedForwardBlock(nn.Module):
    """Two linear layers with ReLU and dropout in between.

    The block computes ``linear_2(dropout(relu(linear_1(x))))``, expanding the
    last dimension from ``d_model`` to ``d_ff`` and projecting it back.

    Args:
        d_model: Size of the input and output features.
        d_ff: Size of the hidden layer.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self,d_model: int,d_ff : int , dropout : float)-> None :
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self,x):
        """Apply the block to ``x``; the output has the same shape as the input."""
        hidden = self.linear_1(x)
        hidden = torch.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

In [28]:
# Feed-forward block with d_model=8, d_ff=12 and dropout 0.1.
l = FeedForwardBlock(8,12,0.1)

In [29]:
# Normalise, then run the feed-forward block; the shape is unchanged.
l(lm(o_p_embeding)).shape

torch.Size([4, 16, 8])

In [31]:
class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: ``x + dropout(sublayer(norm(x)))``.

    Args:
        features: Feature width of the input. Kept for interface compatibility;
            it is not needed by the normalisation layer used here.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self,features,dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # LayerNormalization's only constructor argument is `eps`. Passing
        # `features` positionally would silently use the feature count as
        # epsilon, so the layer is built with its default eps instead.
        self.norm = LayerNormalization()

    def forward(self,x,sublayer):
        """Normalise ``x``, run ``sublayer`` on it, apply dropout, and add the original ``x``."""
        return x + self.dropout(sublayer(self.norm(x)))

In [None]:
class MultiheadAttentionBlock(nn.Module):
    def __init__(self,d_model:int,h:int,dropout:float)->None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divisible by h"
        
        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query,key,value,mask,dropout):
        
