In [1]:
import torch
import torch.nn as nn 
import math 



In [4]:
class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocabu_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocabu_size: int):
        super().__init__()
        self.vocabu_size = vocabu_size
        self.d_model = d_model
        # Lookup table with one d_model-sized row per vocabulary entry.
        self.embedding = nn.Embedding(num_embeddings=vocabu_size, embedding_dim=d_model)

    def forward(self, x):
        """Look up the embeddings for the indices in ``x`` and scale them by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale
    

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a sequence, then applies dropout.

    The encoding table is precomputed once for ``seq_len`` positions:

        pe[pos, 2i]   = sin(pos / 10000^(2i / d_model))
        pe[pos, 2i+1] = cos(pos / 10000^(2i / d_model))

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        seq_len: Number of positions to precompute encodings for.
        dropout: Dropout probability applied to the sum of input and encoding.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Encoding table of shape (seq_len, d_model).
        pe = torch.zeros(seq_len, d_model)
        # Position indices as a column vector: (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / d_model), computed in log space for numerical stability.
        # The scaling must happen INSIDE exp(): exp(2i * -ln(10000) / d_model).
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim div_term to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Add a leading batch dimension: (1, seq_len, d_model).
        pe = pe.unsqueeze(0)
        # A buffer is saved with the module and moved by .to(), but it is not a
        # parameter, so it is not updated during backprop.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``, then apply dropout.

        ``x`` must have dimension 1 of at most ``seq_len`` and a last dimension
        of ``d_model`` so that it broadcasts against the (1, seq_len, d_model) table.
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)
    


In [1]:
#import torch

#pos = torch.arange(0, 5)          # positions: 0..4
#print("pos:", pos)
#print("pos.shape:", pos.shape)     # (5,)

#pos_u1 = pos.unsqueeze(1)          # add dim at index 1
#print("\npos_u1:\n", pos_u1)
#print("pos_u1.shape:", pos_u1.shape)  # (5, 1)

#pos_u0 = pos.unsqueeze(0)          # add dim at index 0
#print("\npos_u0:\n", pos_u0)
#print("pos_u0.shape:", pos_u0.shape)  # (1, 5)

pos: tensor([0, 1, 2, 3, 4])
pos.shape: torch.Size([5])

pos_u1:
 tensor([[0],
        [1],
        [2],
        [3],
        [4]])
pos_u1.shape: torch.Size([5, 1])

pos_u0:
 tensor([[0, 1, 2, 3, 4]])
pos_u0.shape: torch.Size([1, 5])


In [None]:
class LayerNormalization(nn.Module):
    """Layer normalization over the trailing dimensions of the input.

    Normalizes over the last ``len(parameters_shape)`` dimensions of ``x`` using
    the mean and the biased variance, then applies a learnable elementwise
    scale (``gamma``) and shift (``beta``).

    Args:
        parameters_shape: Shape of the trailing dimensions to normalize over.
            A single int ``n`` is accepted as shorthand for ``(n,)``.
        eps: Small constant added to the variance before the square root to
            avoid division by zero.
        verbose: When True, print the intermediate mean, standard deviation and
            normalized tensor on every forward pass (debugging aid). Defaults
            to False so that normal use does not dump tensors to stdout.
    """

    def __init__(self, parameters_shape, eps=1e-5, verbose=False):
        super().__init__()
        # Accept a bare int for the common "normalize the last dimension" case;
        # otherwise len(parameters_shape) in forward() would raise TypeError.
        if isinstance(parameters_shape, int):
            parameters_shape = (parameters_shape,)
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.verbose = verbose
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, x):
        """Return ``gamma * (x - mean) / sqrt(var + eps) + beta``."""
        # Indices of the trailing dimensions being normalized, e.g. [-2, -1].
        dims = list(range(-len(self.parameters_shape), 0))
        mean = x.mean(dim=dims, keepdim=True)
        var = ((x - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (x - mean) / std
        if self.verbose:
            print(f"Mean \n ({mean.size()}) : \n{mean}")
            print(f"Standard Deviation \n ({std.size()}) : \n{std}")
            print(f"y \n ({y.size()})= \n {y}")
        out = self.gamma * y + self.beta
        return out
        


In [None]:
class MultiheadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Projects the query/key/value inputs with learned linear layers, splits the
    projections into ``n_heads`` heads of size ``d_k = d_model // n_heads``,
    runs scaled dot-product attention per head, concatenates the heads and
    applies a final output projection.

    Args:
        d_model: Feature size of the inputs and outputs.
        n_heads: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, n_heads, dropout) -> None:
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        assert d_model % n_heads == 0, "Model dimensions is not divisible by number of heads"
        self.d_k = d_model // n_heads
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            query, key, value: Tensors whose last two dimensions are
                (sequence length, d_k).
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` receive (effectively) zero attention weight.
            dropout: Optional dropout module applied to the attention weights.

        Returns:
            A tuple ``(output, attention_weights)``.
        """
        d_k = query.shape[-1]
        # True division: floor division (//) would quantize the scores.
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Use the most negative finite value of the dtype so the fill also
            # works in half precision, where a literal like -1e11 overflows.
            fill_value = torch.finfo(attention_scores.dtype).min
            attention_scores = attention_scores.masked_fill(mask == 0, fill_value)
        # Softmax is always applied, whether or not a mask was given.
        attention_scores = attention_scores.softmax(dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        qkv = attention_scores @ value
        return qkv, attention_scores

    def _split_heads(self, x):
        """(batch, seq_len, d_model) -> (batch, n_heads, seq_len, d_k)."""
        return x.view(x.shape[0], x.shape[1], self.n_heads, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Apply multi-head attention.

        Args:
            q, k, v: Tensors of shape (batch, seq_len, d_model); ``k`` and ``v``
                must share a sequence length, which may differ from that of ``q``.
            mask: Optional mask, broadcastable to
                (batch, n_heads, q_seq_len, k_seq_len), or None.

        Returns:
            Tensor of shape (batch, q_seq_len, d_model). The attention weights
            of the last call are stored on ``self.attention_scores``.
        """
        # Linear projections followed by the split into heads.
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        x, self.attention_scores = self.attention(query, key, value, mask, self.dropout)

        # Combine all the heads together
        # (batch, n_heads, seq_len, d_k) --> (batch, seq_len, n_heads, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.n_heads * self.d_k)

        return self.w_o(x)
        