In [15]:
import copy
import math

import numpy as np

In [16]:
import torch
import torch.nn as nn

In [17]:
class InputEmbedding(nn.Module):
    """Token-id embedding lookup scaled by sqrt(d_model).

    Args:
        d_model: Length of each embedding vector.
        vocab_size: Number of distinct token ids (rows of the lookup table).
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        # nn.Embedding expects (num_embeddings, embedding_dim): one row of
        # length d_model for every id in the vocabulary.
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Look up embeddings for integer ids and scale them.

        Args:
            x: Integer tensor of token ids in ``[0, vocab_size)``.

        Returns:
            Tensor shaped like ``x`` with one trailing axis of size
            ``d_model`` appended, multiplied by ``sqrt(d_model)``.
        """
        # Use the value stored on the instance; a bare `d_model` is not
        # defined inside this method.
        return self.embedding(x) * math.sqrt(self.d_model)

In [18]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to its input, then dropout.

    The table follows
        PE[pos, 2i]   = sin(pos / 10000^(2i / d_model))
        PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
    and is stored as a buffer, so it is not a trainable parameter but moves
    with the module across devices and is included in ``state_dict()``.

    Args:
        d_model: Size of the last axis of the inputs to ``forward``.
        seq_length: Number of positions precomputed in the table; inputs to
            ``forward`` may be at most this long.
        dropout: Dropout probability applied after adding the encodings.
    """

    def __init__(self, d_model, seq_length, dropout = 0):
        super().__init__()
        self.d_model = d_model
        self.seq_length = seq_length
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_length, d_model)
        # Column vector of positions: [seq_length, 1].
        positions = torch.arange(0, seq_length, dtype=torch.float32).unsqueeze(1)
        # arange(0, d_model, 2) already enumerates the values 2i, so it is
        # used directly as the exponent numerator (no extra factor of 2).
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float32)
        # Broadcasts to [seq_length, ceil(d_model / 2)].
        angles = positions / torch.pow(10000.0, even_dims / d_model)

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the angles to match.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading axis of size 1 broadcasts over the batch:
        # [1, seq_length, d_model]. Buffers never receive gradients.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Add position encodings to ``x`` and apply dropout.

        Args:
            x: Tensor of shape ``[batch, length, d_model]`` with
                ``length <= seq_length``.

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: If ``x`` has more positions than the table holds.
        """
        length = x.shape[1]
        if length > self.seq_length:
            raise ValueError(
                f"input has {length} positions but the encoding table only "
                f"covers {self.seq_length}"
            )
        # Slice the table so inputs shorter than seq_length also work.
        x = x + self.pe[:, :length, :]
        return self.dropout(x)

In [19]:
class LayerNormalization(nn.Module):
    """Layer normalization over the last axis with a scalar scale and shift.

    Args:
        epsilon: Small constant added to the variance for numerical
            stability.
    """

    def __init__(self, epsilon=1e-5):
        super().__init__()
        self.epsilon = epsilon
        self.gamma = nn.Parameter(torch.ones(1))  # Scale
        self.beta = nn.Parameter(torch.zeros(1))  # Shift

    def forward(self, x):
        """Normalize each feature vector of ``x`` to zero mean, unit variance.

        Args:
            x: Tensor whose last axis holds the features to normalize.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Statistics are taken over the last (feature) axis. Reducing over
        # dim=1 would mix values across the second axis (the sequence axis
        # for [batch, seq, features] input) instead of normalizing each
        # feature vector.
        mean = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance, matching the usual layer-norm
        # definition; it also stays finite when the last axis has size 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        return self.gamma * (x - mean) / torch.sqrt(var + self.epsilon) + self.beta


In [20]:
class FeedForward(nn.Module):
    """Two-layer position-wise network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Size of the input and output feature axis.
        dff: Size of the hidden layer.
        dropout: Dropout probability applied after the ReLU. Defaults to
            0.0 (no dropout).
    """

    def __init__(self, d_model, dff, dropout=0.0):
        super().__init__()
        self.forward1 = nn.Linear(d_model, dff)
        # `dropout` is now an explicit argument; previously the name was
        # undefined in this scope.
        self.dropout = nn.Dropout(dropout)
        self.forward2 = nn.Linear(dff, d_model)

    def forward(self, x):
        """Apply the network to the last axis of ``x``.

        Args:
            x: Tensor whose last axis has size ``d_model``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        hidden = torch.relu(self.forward1(x))
        return self.forward2(self.dropout(hidden))

In [231]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with full-width (d_model) heads.

    Every head owns its own query/key/value projections of size
    ``d_model -> d_model``. The head outputs are concatenated along the last
    axis and projected back to ``d_model``.

    Args:
        d_model: Size of the input and output feature axis.
        heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, heads, dropout = 0.5):
        super().__init__()
        self.d_model = d_model
        self.heads = heads

        self.w_q = nn.ModuleList(nn.Linear(d_model, d_model) for _ in range(heads))
        self.w_k = nn.ModuleList(nn.Linear(d_model, d_model) for _ in range(heads))
        self.w_v = nn.ModuleList(nn.Linear(d_model, d_model) for _ in range(heads))

        # The concatenation of `heads` outputs, each d_model wide, has
        # heads * d_model features (not d_model * d_model).
        self.w_o = nn.Linear(heads * d_model, d_model)

        self.softmax = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embeded_layer):
        """Run self-attention over ``embeded_layer``.

        Args:
            embeded_layer: Tensor of shape ``[..., length, d_model]``; it is
                used as the source of queries, keys and values.

        Returns:
            Tensor of the same shape as ``embeded_layer``.
        """
        # Each head works in a d_model-wide space, hence the sqrt(d_model)
        # scaling of the dot products.
        scale = math.sqrt(self.d_model)
        attention_outputs = []

        for w_q, w_k, w_v in zip(self.w_q, self.w_k, self.w_v):
            query = w_q(embeded_layer)
            key = w_k(embeded_layer)
            value = w_v(embeded_layer)

            # [..., length, length] similarity between every pair of positions.
            similarity = torch.matmul(query, torch.transpose(key, -2, -1)) / scale

            weights = self.dropout(self.softmax(similarity))
            attention_outputs.append(torch.matmul(weights, value))

        concat_matrix = torch.cat(attention_outputs, -1)
        return self.w_o(concat_matrix)
        

        
        

In [233]:
# Smoke test: 2 heads, d_model = 2, dropout disabled, on a random
# [4, 3, 2] input. The result should have the same shape as the input.
m = MultiHeadAttention(2, 2, 0)
x = torch.rand(4, 3, 2, dtype=torch.float)
# Call the module itself instead of .forward() so that any hooks
# registered on it are run as well.
m(x)

tensor([[[-0.0623, -0.6217],
         [-0.0622, -0.6216],
         [-0.0623, -0.6217]],

        [[-0.0728, -0.7456],
         [-0.0727, -0.7453],
         [-0.0738, -0.7495]],

        [[-0.0659, -0.7181],
         [-0.0672, -0.7192],
         [-0.0676, -0.7199]],

        [[-0.0620, -0.6872],
         [-0.0628, -0.6876],
         [-0.0621, -0.6873]]], grad_fn=<ViewBackward0>)

In [271]:
class ResidualConnection(nn.Module):
    """Add-and-normalize step: ``LayerNormalization(x1 + Dropout(x2))``.

    Args:
        d_model: Accepted for signature compatibility; not used by this
            implementation.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, d_model ,dropout):
        super().__init__()
        self.ln = LayerNormalization()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x1, x2):
        """Combine a skip input with a sublayer output.

        Args:
            x1: The residual (skip) input.
            x2: The sublayer output to be regularized and added to ``x1``.

        Returns:
            The normalized sum of ``x1`` and the dropped-out ``x2``.
        """
        regularized = self.dropout(x2)
        summed = x1 + regularized
        return self.ln(summed)

In [255]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and feed-forward, each followed by
    its own residual connection.

    Args:
        attention: Callable applied to the block input (self-attention).
        rc: Residual-connection module called as ``rc(skip, sublayer_out)``.
            It is used after the attention sublayer; an independent deep
            copy of it is used after the feed-forward sublayer so the two
            sublayers do not share normalization parameters.
        ff: Callable applied to the output of the first residual connection.
    """

    def __init__(self, attention, rc, ff):
        super().__init__()
        self.attention = attention
        self.rc = rc
        self.ff = ff
        # Separate instance for the second sublayer; reusing `rc` for both
        # would tie their learnable parameters together.
        self.rc_ff = copy.deepcopy(rc)

    def forward(self, x):
        """Apply attention and feed-forward sublayers to ``x``.

        Args:
            x: Input accepted by ``attention``.

        Returns:
            Output of the second residual connection.
        """
        attended = self.attention(x)
        x = self.rc(x, attended)

        transformed = self.ff(x)
        return self.rc_ff(x, transformed)
        
        

In [257]:
class Encoder(nn.Module):
    """Embedding, positional encoding, then a stack of ``n`` encoder blocks.

    Args:
        embedding: Callable mapping the raw input to embeddings.
        pos_encoding: Callable applied to the embeddings.
        attention: Template attention module for the blocks.
        rc: Template residual-connection module for the blocks.
        ff: Template feed-forward module for the blocks.
        n: Number of encoder blocks in the stack.

    ``attention``, ``rc`` and ``ff`` are treated as templates: every block
    receives its own deep copy, so the layers do not share weights.
    """

    def __init__(self, embedding, pos_encoding, attention, rc, ff, n= 6):
        super().__init__()
        self.embedding = embedding
        self.pos_encoding = pos_encoding

        # Passing the same instances to every block would register one set
        # of parameters n times, i.e. all layers would share weights.
        self.encoder_blocks = nn.ModuleList(
            EncoderBlock(
                copy.deepcopy(attention),
                copy.deepcopy(rc),
                copy.deepcopy(ff),
            )
            for _ in range(n)
        )

    def forward(self,x):
        """Encode ``x`` through the embedding and every block in order.

        Args:
            x: Input accepted by ``embedding``.

        Returns:
            Output of the last encoder block.
        """
        x = self.pos_encoding(self.embedding(x))

        for block in self.encoder_blocks:
            x = block(x)
        return x

In [259]:
# Structural smoke test: build an Encoder from integer placeholders just to
# inspect how its sub-blocks are registered. The placeholders are not
# callable, so calling forward() on this instance would fail.
Encoder(embedding=0, pos_encoding=0, attention=0, rc=0, ff=0)

<class 'torch.nn.modules.container.ModuleList'>


Encoder(
  (encoder_blocks): ModuleList(
    (0-5): 6 x EncoderBlock()
  )
)

In [262]:
# torch.manual_seed(44)

In [264]:
# a = torch.rand((4, 3, 3))
# a

In [266]:
# sm = nn.Softmax(dim = -1)

In [268]:
# sm(a)