In [1]:
import torch
import torch.nn as nn
import math

In [2]:
class InputEmbeddings(nn.Module):
    """Token-ID -> vector lookup whose output is multiplied by sqrt(d_model).

    Attributes:
        d_model: width of each embedding vector.
        vocab_size: number of rows in the lookup table.
        embedding: the learnable ``nn.Embedding(vocab_size, d_model)`` table.
    """

    def __init__(self, d_model:int, vocab_size:int) -> None:
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        # One learnable row of size d_model per vocabulary entry.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self,x):
        """Look up the vectors for the token IDs in ``x`` and scale them.

        The result has the shape of ``x`` with a trailing ``d_model`` axis
        appended, and every value is multiplied by ``sqrt(d_model)``.
        """
        looked_up = self.embedding(x)
        scale = math.sqrt(self.d_model)
        return looked_up * scale

In [3]:
# Fix the RNG so the random token IDs and the freshly initialised embedding
# weights (and therefore everything printed below) are the same on every re-run.
torch.manual_seed(0)

d_model = 8
vocab_size = 20
batch_size, seq_len = 2, 5

layer = InputEmbeddings(d_model, vocab_size)

# Some random token IDs: shape (batch_size, seq_len), values in [0, vocab_size)
tokens = torch.randint(0, vocab_size, (batch_size, seq_len))
print("Token IDs:\n", tokens)

# Embeddings exactly as the layer returns them, i.e. already multiplied by sqrt(d_model)
out = layer(tokens)
assert out.shape == (batch_size, seq_len, d_model)
print("\nEmbeddings (with scaling):\n", out)

# To see their size, print the L2 norm of each token vector
norms = out.norm(dim=-1)
print("\nNorm of each embedding vector (after scaling):\n", norms)

Token IDs:
 tensor([[14,  1, 18, 14,  3],
        [ 6, 17, 18,  9, 12]])

Embeddings (with scaling):
 tensor([[[ 3.1498, -5.8949,  2.5421, -0.7695,  2.0498,  0.0136, -3.4912,
          -0.5024],
         [ 1.4175,  8.3697,  3.1052,  6.5225, -3.0622,  0.9387,  2.8692,
           4.7533],
         [ 1.8225,  1.1268, -2.9127,  1.5103,  1.2764,  2.2662, -1.6036,
          -2.1378],
         [ 3.1498, -5.8949,  2.5421, -0.7695,  2.0498,  0.0136, -3.4912,
          -0.5024],
         [-0.4442, -1.1071, -6.5675,  0.5003,  0.5041, -0.6278, -3.3942,
          -2.6709]],

        [[ 0.3182,  0.8019, -2.9850, -1.4625,  0.5113,  5.4729, -0.9389,
          -1.2096],
         [-6.7336, -0.8448,  2.8472, -3.8637,  2.2501,  1.1553,  3.4675,
           3.0849],
         [ 1.8225,  1.1268, -2.9127,  1.5103,  1.2764,  2.2662, -1.6036,
          -2.1378],
         [-1.8019,  1.9768,  2.5436, -1.2389,  2.1357,  0.1672,  1.7539,
          -3.1171],
         [-4.7433,  0.8906, -0.9285, -2.9469, -5.2080,  1.4

In [36]:
class PositionalEncodings(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of embeddings, then applies dropout.

    For position ``p`` and frequency index ``i`` the table holds
    ``sin(p / 10000**(2i / d_model))`` in column ``2i`` and
    ``cos(p / 10000**(2i / d_model))`` in column ``2i + 1``.
    The table is precomputed once for ``seq_len`` positions and stored as the
    buffer ``pe`` with shape ``(1, seq_len, d_model)``; it is not a parameter.

    Args:
        d_model: size of the last (feature) axis of the inputs. Even and odd
            values are both supported.
        seq_len: number of positions to precompute, i.e. the longest input
            ``forward`` can handle.
        dropout: dropout probability applied to the summed output.
    """

    def __init__(self, d_model:int, seq_len:int, dropout:float) -> None:

        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (ceil(d_model / 2),)
        angles = position * div_term # (seq_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd-indexed column than
        # even-indexed ones, so only the first d_model // 2 frequencies get a
        # cosine column. For even d_model this slice is the full table.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0) # (1, seq_len, d_model)

        self.register_buffer('pe',pe)


    def forward(self,x):
        """Return ``dropout(x + pe[:, :x.shape[1]])``.

        ``x`` is indexed as (batch, positions, features): axis 1 selects how
        many precomputed positions are used and the last axis must match
        ``d_model``. If ``x.shape[1]`` exceeds ``seq_len`` the addition cannot
        broadcast and PyTorch raises ``RuntimeError``.
        """
        # `pe` is a buffer, so it never requires grad; no explicit
        # requires_grad_(False) is needed on the slice.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

In [42]:
# 1. Build the module. The feature size is named once so the module and the
#    fake embeddings below cannot drift apart.
d_model = 8
pos_enc = PositionalEncodings(d_model=d_model, seq_len=10, dropout=0.1)

# 2. Example: a batch of 2 sentences, each with 5 tokens, already embedded
batch_size = 2
sentence_len = 5
x = torch.randn(batch_size, sentence_len, d_model)   # random "word embeddings"

print("x shape (before PE):", x.shape)

# 3. Pass through positional encoding
out = pos_enc(x)
print("out shape (after PE):", out.shape)

# Adding positional encodings must not change the tensor's shape.
assert out.shape == x.shape

x shape (before PE): torch.Size([2, 5, 8])
out shape (after PE): torch.Size([2, 5, 8])


In [43]:
import torch
import torch.nn as nn
import math

# ---- Your classes (unchanged) ----
class InputEmbeddings(nn.Module):
    """Embedding table for token IDs; outputs are scaled by sqrt(d_model)."""

    def __init__(self, d_model:int, vocab_size:int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        # Learnable table: vocab_size rows, each of width d_model.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model,
        )

    def forward(self,x):
        """Map token IDs ``x`` to their vectors, multiplied by sqrt(d_model)."""
        vectors = self.embedding(x)
        factor = math.sqrt(self.d_model)
        return vectors * factor

class PositionalEncodings(nn.Module):
    """Adds fixed sinusoidal positional encodings to embeddings, then applies dropout.

    Column ``2i`` of the table holds ``sin(p / 10000**(2i / d_model))`` and
    column ``2i + 1`` holds ``cos(p / 10000**(2i / d_model))`` for position ``p``.
    The table is precomputed for ``seq_len`` positions and registered as the
    buffer ``pe`` with shape ``(1, seq_len, d_model)``; it is not a parameter.

    Args:
        d_model: size of the feature axis (even or odd).
        seq_len: number of positions precomputed, i.e. the longest input
            ``forward`` can handle.
        dropout: dropout probability applied to the summed output.
    """

    def __init__(self, d_model:int, seq_len:int, dropout:float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model)                             # (max_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term                                   # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)                                # even dims
        # An odd d_model has one fewer odd dim than even dims, so only the
        # first d_model // 2 frequencies get a cosine column. For even
        # d_model the slice is the whole table.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])             # odd dims
        self.register_buffer('pe', pe.unsqueeze(0))                    # (1, max_len, d_model)

    def forward(self,x):
        """Return ``dropout(x + pe[:, :x.shape[1]])``.

        If ``x.shape[1]`` exceeds ``seq_len`` the addition cannot broadcast
        and PyTorch raises ``RuntimeError``.
        """
        # x: (batch, seq_len, d_model)
        # `pe` is a buffer and never requires grad, so the slice needs no
        # explicit requires_grad_(False).
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

# ---- Demo: different-length sentences with the same modules ----
torch.manual_seed(0)

d_model = 8
vocab_size = 50
max_len_for_pe = 10     # this is the maximum length we precomputed PE for
dropout = 0.0           # set to 0.1 if you want to see dropout in action

embed = InputEmbeddings(d_model, vocab_size)
posenc = PositionalEncodings(d_model, seq_len=max_len_for_pe, dropout=dropout)

# Sentence A: length 4 tokens
tokens_A = torch.tensor([[4, 7, 13, 2]])               # shape (batch=1, seq_len=4)
x_A = embed(tokens_A)                                  # (1, 4, d_model)
y_A = posenc(x_A)                                      # (1, 4, d_model)

# Sentence B: length 7 tokens
tokens_B = torch.tensor([[1, 12, 15, 6, 19, 3, 9]])    # shape (1, 7)
x_B = embed(tokens_B)                                  # (1, 7, d_model)
y_B = posenc(x_B)                                      # (1, 7, d_model)

print("Sentence A shapes -> tokens:", tokens_A.shape, "emb:", x_A.shape, "with PE:", y_A.shape)
print("Sentence B shapes -> tokens:", tokens_B.shape, "emb:", x_B.shape, "with PE:", y_B.shape)

# Peek at one token to see the effect of adding position
i = 0   # batch index
t = 1   # position index inside the sentence
print("\nSentence A, token at pos 1:")
print("  embedding only:      ", x_A[i, t])
print("  + positional encoding:", y_A[i, t])

# Safety: what happens if a sentence is LONGER than max_len_for_pe?
# The setup lives outside the `try` so that only the positional-encoding call
# can be reported as the "expected" failure; an unrelated error in token
# generation or embedding lookup will surface normally.
tokens_long = torch.randint(0, vocab_size, (1, 12))   # seq_len=12 > max_len_for_pe=10
x_long = embed(tokens_long)
try:
    y_long = posenc(x_long)                                # expected to fail
except RuntimeError as e:
    # The (1, 12, d_model) input cannot broadcast against the 10 precomputed
    # positions, which PyTorch reports as a RuntimeError.
    print("\nAs expected: seq_len > max_len causes an error:")
    print(" ", e)
else:
    print("\nUnexpected: seq_len > max_len did NOT raise; output shape:", y_long.shape)

Sentence A shapes -> tokens: torch.Size([1, 4]) emb: torch.Size([1, 4, 8]) with PE: torch.Size([1, 4, 8])
Sentence B shapes -> tokens: torch.Size([1, 7]) emb: torch.Size([1, 7, 8]) with PE: torch.Size([1, 7, 8])

Sentence A, token at pos 1:
  embedding only:       tensor([-1.2621,  2.1044,  4.3020,  9.6464, -4.3308, -3.4907,  5.1470, -1.5600],
       grad_fn=<SelectBackward0>)
  + positional encoding: tensor([-0.4206,  2.6447,  4.4018, 10.6414, -4.3208, -2.4907,  5.1480, -0.5600],
       grad_fn=<SelectBackward0>)

As expected: seq_len > max_len causes an error:
  The size of tensor a (12) must match the size of tensor b (10) at non-singleton dimension 1
