In [178]:
import torch
import torch.nn as nn
import math
class InputEmbeddings(nn.Module):
    """Embedding lookup whose result is multiplied by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Look up the embeddings for the indices in ``x`` and scale them."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale

# # Example usage
# net = InputEmbeddings(d_model=100, vocab_size=2000)

# # Generate random input tensor
# batch_size = 10
# seq_length = 20

# input_data = torch.randint(0,100,(batch_size, seq_length))  # randint already returns int64 ('long') token ids
# print(input_data.shape)
# # Pass the tensor input to the network
# output = net(input_data)

# print(output.shape)  # Output shape: (10, 20, 100), i.e. (batch_size, seq_length, d_model)

In [179]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to the input, then applies dropout.

    The table has shape (1, seq_length, d_model). Even feature columns hold
    sin(position * freq) and odd feature columns hold cos(position * freq),
    where freq = exp(-ln(10000) * i / d_model) for i = 0, 2, 4, ...

    Args:
        d_model: Size of the last (feature) dimension; may be even or odd.
        seq_length: Number of positions pre-computed in the table.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self,d_model:int,seq_length:int,dropout:float)->None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_length
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_length, d_model)
        position = torch.arange(0, seq_length, dtype=torch.float).unsqueeze(1)  # (seq_length, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (seq_length, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # There are only d_model // 2 odd columns; for an odd d_model that is one
        # fewer than the number of frequencies, so trim to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Stored as a buffer: moves with the module and is saved in state_dict,
        # but is not a parameter and therefore never receives gradients.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, seq_length, d_model)

    def forward(self,x):
        """Return ``dropout(x + pe[:, :x.shape[1], :])``.

        The table is sliced to the size of ``x``'s dimension 1 and broadcast
        over dimension 0.
        """
        # No debug printing here: forward runs on every batch.
        return self.dropout(x + self.pe[:, :x.shape[1], :])
# input = torch.randint(0,100,(10,20))
# em = InputEmbeddings(150,1200)
# pe = PositionalEncoding(150,20,0.2)
# out = em(input)
# print(out.shape)
# out = pe(out)
# print(out.shape)

In [202]:
class LayerNormalization(nn.Module):
    """Normalizes the last dimension using its mean and standard deviation.

    A single learnable scalar gain (``alpha``) and scalar offset (``bias``)
    are applied to the normalized values.

    Args:
        eps: Small constant added to the standard deviation so the division
            never hits zero.
    """

    def __init__(self,eps: float = 10**-6)->None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))   # multiplicative gain
        self.bias = nn.Parameter(torch.zeros(1))   # additive offset

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last dim."""
        mu = x.mean(dim=-1, keepdim=True)
        sigma = x.std(dim=-1, keepdim=True)
        centered = x - mu
        denom = sigma + self.eps
        return self.alpha * centered / denom + self.bias
# tensor = torch.randn(5,10)
# ln = LayerNormalization()
# out = ln(tensor)
# print(out.shape)
        

torch.Size([5, 10])


In [206]:
class FeedForwardBlock(nn.Module):
    """Two linear layers with ReLU and dropout in between.

    Args:
        d_model: Input and output feature size.
        dff: Hidden feature size between the two linear layers.
        dropout: Probability passed to ``nn.Dropout``.
    """

    def __init__(self,d_model:int, dff:int, dropout:float)->None:
        super().__init__()
        self.linear_1 = nn.Linear(in_features=d_model, out_features=dff)
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=dff, out_features=d_model)

    def forward(self,x):
        """Apply linear_1 -> ReLU -> dropout -> linear_2; last dim stays d_model."""
        hidden = self.linear_1(x)
        activated = torch.relu(hidden)
        regularized = self.dropout(activated)
        return self.linear_2(regularized)
# x = torch.randn(10,20,100)
# fw = FeedForwardBlock(100,1000,0.2)
# out = fw(x)
# print(out.shape)
        

torch.Size([10, 20, 100])


In [180]:
# Column vector of positions 0 .. seq_length-1 as floats, shape (seq_length, 1).
seq_length = 20
pos = torch.arange(seq_length, dtype=torch.float).reshape(-1, 1)
print(pos.shape)

torch.Size([20, 1])


In [181]:
# One frequency per even feature index: exp(-ln(10000) * i / d_model), i = 0, 2, 4, ...
d_model = 100
div_term = torch.exp(
    torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
print(div_term.shape)

torch.Size([50])


In [182]:
# Sine of the broadcast product of `pos` and `div_term` (both defined in earlier cells).
out = torch.sin(pos * div_term)
print(out.shape)
# Keep columns 0, 2, 4, ... of the current tensor.
out = out[:, ::2]
print(out.shape)
# Keep columns 1, 3, 5, ... of the ALREADY-sliced tensor, because `out` was rebound above.
out = out[:, 1::2]
print(out.shape)


torch.Size([20, 50])
torch.Size([20, 25])
torch.Size([20, 12])


In [183]:
# Broadcasting demo: a (4, 1) column times a (3,) vector gives a (4, 3) grid.
# The prints now use the names defined in this cell; the previous `a` / `b`
# were never assigned here and raised NameError on a fresh kernel.
# NOTE: this rebinds `pos`, replacing the float position column built earlier.
div = torch.tensor([[2],[3],[4],[1]])
print(div.shape)
pos = torch.tensor([0, 1, 2])
print(pos.shape)
print(div*pos)
print((div*pos).shape)

torch.Size([4, 1])
torch.Size([3])
tensor([[0, 2, 4],
        [0, 3, 6],
        [0, 4, 8],
        [0, 1, 2]])
torch.Size([4, 3])


In [194]:
# Mean over the last dimension; keepdim=True keeps a size-1 trailing axis.
x = torch.tensor([[1.0, 2.0, 6.0],
                  [3.0, 4.0, 7.0]])
print(x.shape)
out = torch.mean(x, dim=-1, keepdim=True)
print(out.shape)

torch.Size([2, 3])
torch.Size([2, 1])


In [195]:
# Standard deviation over the last dimension; keepdim=True keeps a size-1 trailing axis.
x = torch.tensor([[1.0, 2.0, 6.0],
                  [3.0, 4.0, 7.0]])
print(x.shape)
out = torch.std(x, dim=-1, keepdim=True)
print(out.shape)

torch.Size([2, 3])
torch.Size([2, 1])


In [197]:
# Wrapping a tensor in nn.Parameter turns on requires_grad, as the printed repr shows.
out = nn.Parameter(torch.ones((1,)))
print(out)
# Same for a zero tensor; `out` is rebound to the new parameter.
out = nn.Parameter(torch.zeros((1,)))
print(out)

Parameter containing:
tensor([1.], requires_grad=True)
Parameter containing:
tensor([0.], requires_grad=True)
