In [38]:
import torch
import torch.nn as nn
import math

class InputEmbedding(nn.Module):

    def __init__(self,d_model:int, vocab_size:int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self,x):
        return self.embedding(x) * math.sqrt(self.d_model)
    

In [39]:
input_embedding = InputEmbedding(d_model=512, vocab_size=10000)
input_embedding

InputEmbedding(
  (embedding): Embedding(10000, 512)
)

In [40]:
x = torch.randint(0, 10000, (1, 20))  # batch size of 32, sequence length of 20
x[0:10]

tensor([[6050, 3023, 3757, 8340, 1534, 6031, 6737, 9188, 6621, 3106, 4246, 3837,
         1123, 5032, 7876, 6707, 9585, 1238, 3706, 9414]])

In [41]:
input_embedding(x).shape

torch.Size([1, 20, 512])

In [42]:
position = torch.arange(0, 5, dtype=torch.float)
print(position)
print(position.shape)

tensor([0., 1., 2., 3., 4.])
torch.Size([5])


In [43]:
position = torch.arange(0, 5, dtype=torch.float).unsqueeze(1) 
print(position)
print(position.shape)

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.]])
torch.Size([5, 1])


In [44]:
torch.arange(0,5,2).float()

tensor([0., 2., 4.])

In [45]:
(-math.log(10000.0) / 5)

-1.8420680743952367

In [46]:
torch.arange(0,5,2).float() * (-math.log(10000.0) / 5)

tensor([-0.0000, -3.6841, -7.3683])

In [47]:
torch.exp(torch.arange(0,5,2).float() * (-math.log(10000.0) / 5))

tensor([1.0000e+00, 2.5119e-02, 6.3096e-04])

In [48]:
div_term = torch.exp(torch.arange(0,5,2).float() * (-math.log(10000.0) / 5))
print(div_term)

tensor([1.0000e+00, 2.5119e-02, 6.3096e-04])


In [49]:
seq_len = 6
d_model = 5

pe = torch.zeros(seq_len, d_model)
        
## create position tensor of shape (seq_len, 1)
position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) 

div_term = torch.exp(torch.arange(0,d_model,2).float() * (-math.log(10000.0) / d_model))

## apply sin to even indices in the array; 2i
pe[:, 0::2] = torch.sin(position * div_term)
print("before apply cos",pe)

pe[:, 1::2] = torch.cos(position * div_term)
print("after apply cos",pe)

before apply cos tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 8.4147e-01,  0.0000e+00,  2.5116e-02,  0.0000e+00,  6.3096e-04],
        [ 9.0930e-01,  0.0000e+00,  5.0217e-02,  0.0000e+00,  1.2619e-03],
        [ 1.4112e-01,  0.0000e+00,  7.5285e-02,  0.0000e+00,  1.8929e-03],
        [-7.5680e-01,  0.0000e+00,  1.0031e-01,  0.0000e+00,  2.5238e-03],
        [-9.5892e-01,  0.0000e+00,  1.2526e-01,  0.0000e+00,  3.1548e-03]])


RuntimeError: The expanded size of the tensor (2) must match the existing size (3) at non-singleton dimension 1.  Target sizes: [6, 2].  Tensor sizes: [6, 3]

In [None]:
pe[:, 1::2] = torch.cos(position * div_term)


RuntimeError: The expanded size of the tensor (2) must match the existing size (3) at non-singleton dimension 1.  Target sizes: [3, 2].  Tensor sizes: [3, 3]