In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Notebook magic: autosave the notebook every 20 seconds.
%autosave 20

Autosaving every 20 seconds


In [2]:
import torch
import torch.nn as nn
import math


# Input Embedding

In [3]:
class InputEmbeddings(nn.Module):
    """Token-embedding lookup with the sqrt(d_model) scaling used by the Transformer.

    Args:
        d_model: Size of each embedding vector (the model dimension).
        vocab_size: Number of rows in the embedding table, i.e. how many
            distinct token ids can be looked up.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model  # Dimension of the model
        self.vocab_size = vocab_size  # Number of distinct token ids in the vocabulary
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Embed token ids and return both the raw and the scaled result.

        Args:
            x: Integer tensor of token ids, e.g. of shape (batch, seq_len).

        Returns:
            A 3-tuple ``(raw, scale, scaled)``:
              * ``raw``    -- ``self.embedding(x)``; x's shape with a trailing
                ``d_model`` axis, e.g. (batch, seq_len, d_model).
              * ``scale``  -- the Python float ``sqrt(d_model)``.
              * ``scaled`` -- ``raw * scale``, the value the paper feeds into
                the rest of the model.
        """
        scale = math.sqrt(self.d_model)
        # Look the embeddings up once and reuse the result for both tuple slots
        # instead of running the embedding layer twice.
        raw = self.embedding(x)
        return raw, scale, raw * scale

In [4]:
# Toy dimensions for a quick shape check of InputEmbeddings.
batch_size = 2
seq_len = 5
d_model = 16
vocab_size = 100

# Random token ids in [0, vocab_size), shape (batch_size, seq_len).
# NOTE(review): no torch.manual_seed call is visible in this notebook, so the
# ids and the embedding weights presumably differ on every run.
x = torch.randint(0, vocab_size, (batch_size, seq_len))
embeddings = InputEmbeddings(d_model, vocab_size)
# forward() returns a 3-tuple: (raw embeddings, sqrt(d_model), scaled embeddings).
output_emb = embeddings(x)

In [5]:
x.shape

torch.Size([2, 5])

In [6]:
output_emb[0].shape

torch.Size([2, 5, 16])

In [7]:
output_emb[1]

4.0

In [8]:
output_emb[2]

tensor([[[ 2.5196e+00,  3.5437e+00,  5.4313e-01,  1.9134e+00, -1.2026e+00,
           8.1953e+00, -1.6043e+00,  6.1331e-01, -2.8480e+00, -1.2120e+00,
          -1.4446e+00, -1.0584e-02,  1.6469e+00,  3.9262e+00, -1.5580e+00,
           4.3438e-01],
         [ 5.1470e+00, -1.1315e-01, -5.3884e+00, -1.3013e+00, -1.4926e+00,
           2.4929e+00,  2.5210e+00,  4.1980e-01, -5.1099e-02,  1.9285e+00,
           1.2127e+00,  3.2001e+00,  4.0178e+00,  1.2280e+00,  4.3481e+00,
           1.2110e+00],
         [-1.2578e+01,  2.2901e+00, -3.3737e-01, -2.1863e+00, -1.7225e+00,
          -2.7290e+00, -1.6102e+00, -2.7874e+00,  3.1315e+00, -2.7698e+00,
          -7.6416e-01, -4.0520e+00,  3.0876e+00,  5.4205e+00, -4.5514e+00,
           5.5078e+00],
         [-1.9107e-01,  6.0942e+00, -3.4973e+00, -1.3215e+00,  3.7744e-01,
          -2.0132e+00, -1.6465e+00,  3.9348e+00, -4.4714e+00, -8.2981e+00,
          -1.6041e+00, -8.9192e-01, -1.8988e+00,  1.9794e+00, -9.0373e+00,
           1.8060e+00],
    

# Positional Encoding

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to embeddings, then apply dropout.

    The table is built once in the constructor and stored as the buffer ``pe``
    of shape (1, seq_len, d_model):

        pe[pos, 2i]     = sin(pos / 10000^(2i / d_model))
        pe[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))

    Args:
        d_model: Size of the last (feature) axis of the inputs. Both even and
            odd values are supported.
        seq_len: Number of positions to precompute; inputs passed to
            ``forward`` must not be longer than this along axis 1.
        dropout: Probability handed to ``nn.Dropout``.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Encoding table of shape (seq_len, d_model), filled in below.
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions 0 .. seq_len - 1, shape (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # 1 / 10000^(2i / d_model) for every even column index 2i, evaluated as
        # exp(2i * -ln(10000) / d_model); shape (ceil(d_model / 2),).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Broadcast to one angle per (position, frequency) pair.
        angles = position * div_term  # (seq_len, ceil(d_model / 2))
        # Sine on even columns: sin(pos / 10000^(2i / d_model)).
        pe[:, 0::2] = torch.sin(angles)
        # Cosine on odd columns: cos(pos / 10000^(2i / d_model)). When d_model is
        # odd there is one fewer odd column than even columns, so drop the last
        # frequency to keep the shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading batch axis so the table broadcasts over the batch.
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)
        # Buffer: follows the module across .to()/state_dict() but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.shape[1]])``.

        Args:
            x: Tensor of shape (batch, length, d_model) with length <= seq_len.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The buffer was created without requires_grad, so no gradient flows
        # into it and no explicit requires_grad_(False) is needed.
        x = x + self.pe[:, :x.shape[1], :]  # (batch, length, d_model)
        return self.dropout(x)

In [None]:
# Toy dimensions for exercising PositionalEncoding.
d_model = 16
seq_len = 5
dropout = 0.1
# NOTE(review): no .eval() call is visible, so dropout presumably stays active
# and the forward pass below is stochastic.
pos_encoding = PositionalEncoding(d_model, seq_len, dropout)

# Create a sample input tensor (batch_size, seq_len, d_model)
# NOTE(review): this rebinds x, replacing the token-id tensor created in the
# embedding demo; the visible forward-pass cell feeds output_emb[2], not this x.
batch_size = 2
x = torch.randn(batch_size, seq_len, d_model)



In [9]:
# Scratch copy of the zero-initialised (seq_len, d_model) table that
# PositionalEncoding.__init__ allocates, kept at notebook level for inspection.
pe = torch.zeros(seq_len, d_model)

In [16]:
# Even-indexed columns of the scratch table (the ones the class fills with sine).
# Still all zeros here: no sin/cos assignment is applied to this notebook-level pe.
pe[:, 0::2]

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [17]:
# Odd-indexed columns of the scratch table (the ones the class fills with cosine).
# Still all zeros here: no sin/cos assignment is applied to this notebook-level pe.
pe[:, 1::2]

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [12]:
# Column vector of positions 0 .. seq_len - 1, shape (seq_len, 1); same
# expression as in PositionalEncoding.__init__.
position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

In [13]:
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.]])

In [14]:
# Per-frequency factors exp(2i * -ln(10000) / d_model) = 1 / 10000^(2i / d_model)
# for the even indices 2i = 0, 2, ..., one entry per sin/cos column pair.
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

In [15]:
div_term

tensor([1.0000e+00, 3.1623e-01, 1.0000e-01, 3.1623e-02, 1.0000e-02, 3.1623e-03,
        1.0000e-03, 3.1623e-04])

In [10]:
# NOTE(review): the recorded output (2, 5) is the token-id tensor from the
# embedding demo. The positional-encoding setup cell above rebinds x to
# torch.randn(batch_size, seq_len, d_model), so a top-to-bottom run would show
# a 3-D shape here -- the saved output reflects out-of-order execution.
x.shape

torch.Size([2, 5])

In [None]:
output_emb[2].shape

In [None]:

# Forward pass
# output_emb is the 3-tuple returned by InputEmbeddings.forward; index 2 is the
# embedding tensor already multiplied by sqrt(d_model).
output_pos_emb = pos_encoding(output_emb[2])

In [None]:
output_pos_emb.shape

In [None]:
output_pos_emb

In [None]:
output_emb