In [1]:
import math
import numpy as np

In [2]:
import torch
import torch.nn as nn

In [3]:
class InputEmbedding(nn.Module):

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(d_model, vocab_size)
        # shape = [num of words * dimension of embedding layer]

    def forward(self, x):
        return self.embedding(x) * math.sqrt(d_model)
        # dimension same

In [4]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, seq_length, dropout = 0):
        super().__init__()
        self.d_model = d_model
        self.seq_length = seq_length
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(self.seq_length, self.d_model)  # To get the matrix of dimension as of embedding layer
        positions = torch.arange(0, self.seq_length, dtype = torch.float32).unsqueeze(1)  # matrix of [seq_length x 1]
        div_term = (positions /(torch.pow(10000, 2 * torch.arange(0, d_model, 2).float() /self.d_model))) #to calculate say (angle)  pos/(10000^(2i/dmodel))
        pe[:, 0::2] = torch.sin(div_term)   #Apply sine formula in even positions
        pe[:, 1::2] = torch.cos(div_term)   # Appply cosine formula in odd positions
        
        self.pe = pe.unsqueeze(0)  # for batches dimension [1 x seq_length x d_model]

        # self.register_buffer('pe', self.pe) # By adding this in register buffer this stores pe too while saving the model without considering it as a learning parameter
                

    def forward(self, x):
        x = x + self.pe.required_grad(False)  #To make it not to learn
        return self.dropout(x)
    # def forward(self, ..):
        # pe = torch.zeros()

In [5]:
class LayerNormalization(nn.Module):
    def __init__(self, epsilon=1e-5):
        super().__init__()
        self.epsilon = epsilon
        self.gamma = nn.Parameter(torch.ones(1))  # Scale
        self.beta = nn.Parameter(torch.zeros(1))  # Shift

    def forward(self, x):
        mean = x.mean(dim=1, keepdim=True)
        var = x.var(dim=1, keepdim=True) 
        return self.gamma * (x - mean) / torch.sqrt(var + self.epsilon) + self.beta


In [6]:
class FeedForward(nn.Module):

    def __init__(self, d_model, dff):
        super().__init__()
        self.forward1 = nn.Linear(d_model, dff)
        self.dropout = nn.Dropout(dropout)
        self.forward2 = nn.Linear(dff, d_model)

    def forward(self, x):
        return self.forward2(self.dropout(torch.relu(self.forward1(x))))

In [269]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, heads, dropout = 0.5):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.heads = heads
        self.dropout = dropout

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        self.w_o = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embeded_layer):

        # print(embeded_layer)

        query = self.w_q(embeded_layer)
        # print(query)
        key = self.w_k(embeded_layer)
        # print(key)
        value = self.w_v(embeded_layer)
        # print(value)
        # print(query)
        # print(torch.transpose(key, 0,1))

        print(value.shape)
        print(key.shape)

        similarity = torch.matmul(query, torch.transpose(key, -2, -1))  / math.sqrt(self.d_model)
        # print(similarity)

        sim = self.softmax(similarity)
        print(sim.shape)
        # print(sim)
        sim = self.dropout(sim)

        final = torch.matmul(sim, value)
        return final
        

        
        

In [271]:
m = MultiHeadAttention(2, 1, 0)
x = torch.rand(4,3,2, dtype = torch.float)
m.forward(x)

torch.Size([4, 3, 2])
torch.Size([4, 3, 2])
torch.Size([4, 3, 3])


tensor([[[-0.2519,  0.6355],
         [-0.2464,  0.6360],
         [-0.2481,  0.6358]],

        [[-0.1795,  0.5754],
         [-0.1793,  0.5755],
         [-0.1805,  0.5747]],

        [[-0.2397,  0.6525],
         [-0.2436,  0.6531],
         [-0.2484,  0.6538]],

        [[-0.1735,  0.5218],
         [-0.1839,  0.5254],
         [-0.1746,  0.5224]]], grad_fn=<UnsafeViewBackward0>)

In [257]:
torch.manual_seed(44)

<torch._C.Generator at 0x7d538c168d50>

In [243]:
a = torch.rand((4, 3, 3))
a

tensor([[[0.7196, 0.7307, 0.8278],
         [0.1343, 0.6280, 0.7297],
         [0.2882, 0.2112, 0.9836]],

        [[0.8722, 0.9650, 0.7837],
         [0.8076, 0.0608, 0.7226],
         [0.3354, 0.5350, 0.7117]],

        [[0.7979, 0.2785, 0.8947],
         [0.6694, 0.8950, 0.4479],
         [0.4788, 0.3541, 0.0467]],

        [[0.7471, 0.7821, 0.4296],
         [0.1128, 0.8413, 0.4978],
         [0.3593, 0.7470, 0.0869]]])

In [219]:
sm = nn.Softmax(dim = -1)

In [221]:
sm(a)

tensor([[[0.2566, 0.3410, 0.4023],
         [0.2519, 0.4897, 0.2584],
         [0.3432, 0.2442, 0.4127]],

        [[0.4097, 0.3154, 0.2750],
         [0.3533, 0.4340, 0.2127],
         [0.2992, 0.3959, 0.3049]],

        [[0.3284, 0.2620, 0.4095],
         [0.2967, 0.4014, 0.3019],
         [0.3303, 0.3583, 0.3114]],

        [[0.2864, 0.3279, 0.3857],
         [0.3272, 0.3400, 0.3328],
         [0.2196, 0.3593, 0.4211]]])