In [4]:
import torch
import torch.nn as nn
import math 

In [6]:
class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of entries in the embedding table.
    """

    def __init__(self, d_model:int, vocab_size:int):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model

        # Lookup table with one d_model-wide row per vocabulary id.
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embeddings of the ids in ``x`` multiplied by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale

### Positional Encoding 
A positional encoding vector is added to each word embedding so that the model knows the position of each word in the sentence.

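For the even dimensions (`2i`) of each word's embedding vector we use $PE_{(pos,\,2i)} = \sin\left(\dfrac{pos}{10000^{2i/d_{model}}}\right)$, where `pos` is the position of the word in the sentence and `i` indexes the sine/cosine dimension pairs.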

For the odd dimensions (`2i+1`) we use $PE_{(pos,\,2i+1)} = \cos\left(\dfrac{pos}{10000^{2i/d_{model}}}\right)$.


In [12]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to embeddings, then applies dropout.

    The encoding table is computed once in the constructor and stored as a
    buffer, so it is saved with the model but is not a trainable parameter.

    Args:
        d_model: Size of each embedding vector.
        seq_len: Maximum sequence length covered by the precomputed table.
        dropout: Dropout probability applied after the encodings are added.
    """

    def __init__(self, d_model:int, seq_len: int, dropout:float ):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Encoding table of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)

        # Column vector of positions 0..seq_len-1, shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        # 1 / 10000^(2i / d_model), computed in log space for numerical stability
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Sine on the even dimensions
        pe[:, 0::2] = torch.sin(position * div_term)

        # Cosine on the odd dimensions; when d_model is odd there is one fewer
        # odd column than even columns, so trim div_term to match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Add a leading batch dimension -> (1, seq_len, d_model)
        pe = pe.unsqueeze(0)

        # Buffers are saved in the state dict but receive no gradient
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.shape[1]`` positions to ``x`` and apply dropout.

        ``x`` is indexed as (batch, seq_len, d_model); its sequence length must
        not exceed the ``seq_len`` given to the constructor.
        """
        # Slice the table to the actual sequence length; the buffer does not
        # require grad, so the encodings stay fixed during training.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

### Layer Normalization

Suppose we have a batch of sentences stored as vectors, labelled item 1 to item 4. For each item we compute its own mean and variance, and then use them to calculate new, normalized values for that item. This is layer normalization.

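$$\hat{x}_j = \frac{x_j - \mu_j}{\sqrt{\sigma_j^2 + \epsilon}}$$

where $\mu_j$ and $\sigma_j^2$ are the mean and variance of item $j$, and $\epsilon$ is a small constant that keeps the division numerically stable.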
 
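We also introduce two learnable parameters: gamma (multiplicative, multiplied by the normalized x; called `alpha` in the code below) and beta (additive, added to the result; called `bias` in the code below). The model uses them to amplify or shift the normalized values when needed.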

In [13]:
class LayerNormalization(nn.Module):
    """Normalizes the last dimension to zero mean and unit std, then applies a learnable scale and shift.

    Args:
        eps: Small constant added to the standard deviation to avoid
            division by zero.
    """

    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # Multiplicative parameter, starts at 1 so the layer is initially a pure normalization
        self.alpha = nn.Parameter(torch.ones(1))
        # Additive parameter, starts at 0 so a fresh layer does not shift its output
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` computed over the last dimension."""
        # keepdim=True keeps the reduced axis so the statistics broadcast against x
        mean = x.mean(dim = -1, keepdim = True)
        std = x.std(dim = -1, keepdim = True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

### Feed Forward Layer
Fully connected layers used in both the encoder and decoder

In [21]:
class FeedForwardBlock(nn.Module):
    """Two-layer fully connected block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model:int , d_ff:int, dropout: float):
        super().__init__()

        self.linear_1 = nn.Linear(d_model, d_ff) # W1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 and b2

    def forward(self, x):
        """Map ``x`` from d_model to d_ff and back to d_model along the last dimension."""
        # Expand with linear_1 first, project back with linear_2 last
        hidden = torch.relu(self.linear_1(x))
        return self.linear_2(self.dropout(hidden))

### Multihead Attention

In [23]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Feature size of the inputs and of the output.
        h: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``h``.
    """

    def __init__(self, d_model:int, h:int, dropout:float):
        super().__init__()
        self.d_model = d_model
        self.h = h

        assert d_model % h == 0, 'd_model is not divisible by the number of heads'

        # Feature size handled by each head
        self.d_k = d_model // h

        # Projections applied to the query, key and value inputs
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        # Output projection applied after the heads are concatenated
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute scaled dot-product attention.

        Args:
            query, key, value: Tensors whose last dimension is the per-head size.
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` receive (effectively) zero attention weight.
            dropout: Optional dropout module applied to the attention weights.

        Returns:
            A tuple ``(weights @ value, weights)``, where ``weights`` are the
            attention weights after softmax (and after dropout, if given).
        """
        d_k = query.shape[-1]

        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # masked_fill is out-of-place, so its result must be kept; the large
            # negative value becomes ~0 after the softmax.
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim = -1)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask=None):
        """Apply multi-head attention.

        Args:
            q, k, v: Tensors indexed as (batch, seq_len, d_model).
            mask: Optional mask forwarded to :meth:`attention`.

        Returns:
            Tensor of shape (batch, seq_len_q, d_model). The attention weights
            of the last call are kept in ``self.attention_scores``.
        """
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        # Split Q, K, V into h heads: (batch, seq, d_model) -> (batch, h, seq, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1,2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1,2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1,2)

        x, self.attention_scores = MultiHeadAttention.attention(query, key, value, mask, self.dropout)

        # Merge the heads back: (batch, h, seq, d_k) -> (batch, seq, h * d_k).
        # contiguous() is required because view() cannot operate on the transposed layout.
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return self.w_o(x)

### Residual Connection

In [24]:
class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: ``x + dropout(sublayer(norm(x)))``.

    Args:
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, dropout:float):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Normalize ``x``, run ``sublayer`` on it, apply dropout, and add the result back to ``x``."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch
        

### Encoder Block

In [25]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block, each inside a residual connection.

    Args:
        self_attention_block: Attention module, called as ``(q, k, v, mask)``.
        feed_forward: Feed-forward module, called with a single tensor.
        dropout: Dropout probability used by both residual connections.
    """

    def __init__(self, self_attention_block: MultiHeadAttention, feed_forward: FeedForwardBlock, dropout:float):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward
        self.residual_connection = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Run ``x`` through the attention and feed-forward sublayers using ``src_mask``."""
        # The residual wrapper hands the sublayer its normalized input, so the
        # lambda must use its own argument for query, key and value rather than
        # closing over the un-normalized outer ``x``.
        x = self.residual_connection[0](x, lambda a: self.self_attention_block(a, a, a, src_mask))
        x = self.residual_connection[1](x, self.feed_forward_block)
        return x

The Encoder is a stack of encoder blocks, so we can have up to `N` encoder blocks inside the encoder.

In [26]:
class Encoder(nn.Module):
    """Applies a sequence of layers in order, followed by a final layer normalization.

    Args:
        layers: The layers to apply; each is called as ``layer(x, mask)``.
    """

    def __init__(self, layers:nn.ModuleList):
        super().__init__()

        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask=None):
        """Pass ``x`` through every layer with the same ``mask`` and normalize the result.

        Args:
            x: Input tensor handed to the first layer.
            mask: Mask forwarded unchanged to every layer; defaults to ``None``.
        """
        for layer in self.layers:
            x = layer(x, mask)

        return self.norm(x)

### Decoder