## Positional Encoding

In [8]:
import torch
import torch.nn as nn

# Toy sizes, kept small so the intermediate tensors are easy to read when printed.
max_length: int = 10  # maximum sequence length (number of token positions) passed to the model
d_model: int = 6  # dimensionality of the model's embedding vectors

$$
PE(\text{k}, i) = \sin\bigg( \frac{ \text{k} }{10000^\frac{i}{d_{model}}} \bigg) \text{ when i is even}
$$

$$
PE(\text{k}, i) = \cos\bigg( \frac{ \text{k} }{10000^\frac{i-1}{d_{model}}} \bigg) \text{ when i is odd}
$$
Here:

* k: Position of an object in the input sequence

* i: Index of the dimension within the encoding vector, $0 \le i < d_{model}$

* d_model: Dimension of the output embedding space

* 10,000: A user-defined scaling constant; this value was chosen by the authors of [Attention is All You Need](https://arxiv.org/abs/1706.03762)


In [11]:
# One divisor per sin/cos pair: 10000 ** (i / d_model) for i = 0, 2, 4, ...
denominator = torch.pow(
    10000,
    torch.arange(0, d_model, 2, dtype=torch.float32) / d_model,
)
print(denominator)

tensor([  1.0000,  21.5443, 464.1590])


In [14]:
# Column vector of positions 0 .. max_length-1, shape (max_length, 1),
# so that dividing by `denominator` broadcasts across the frequency axis.
position = torch.arange(max_length).float().unsqueeze(1)
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])

In [15]:
# Sine values fill the even encoding indices, cosine values the odd ones;
# both use the same position / denominator angle.
even_PE = (position / denominator).sin()
odd_PE = (position / denominator).cos()

even_PE, odd_PE

(tensor([[ 0.0000,  0.0000,  0.0000],
         [ 0.8415,  0.0464,  0.0022],
         [ 0.9093,  0.0927,  0.0043],
         [ 0.1411,  0.1388,  0.0065],
         [-0.7568,  0.1846,  0.0086],
         [-0.9589,  0.2300,  0.0108],
         [-0.2794,  0.2749,  0.0129],
         [ 0.6570,  0.3192,  0.0151],
         [ 0.9894,  0.3629,  0.0172],
         [ 0.4121,  0.4057,  0.0194]]),
 tensor([[ 1.0000,  1.0000,  1.0000],
         [ 0.5403,  0.9989,  1.0000],
         [-0.4161,  0.9957,  1.0000],
         [-0.9900,  0.9903,  1.0000],
         [-0.6536,  0.9828,  1.0000],
         [ 0.2837,  0.9732,  0.9999],
         [ 0.9602,  0.9615,  0.9999],
         [ 0.7539,  0.9477,  0.9999],
         [-0.1455,  0.9318,  0.9999],
         [-0.9111,  0.9140,  0.9998]]))

In [21]:
# Pair each sine with its matching cosine along a new trailing axis.
stacked = torch.stack([even_PE, odd_PE], dim=-1)
stacked.shape

torch.Size([10, 3, 2])

In [18]:
# Collapse the (pair, sin/cos) axes so each row reads sin, cos, sin, cos, ...
stacked_PE = torch.reshape(stacked, (max_length, d_model))
stacked_PE

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

## Defining a Class

In [22]:
class PositionalEncoding(nn.Module):
  """Sinusoidal positional encoding.

  Builds a (max_length, d_model) table in which, for position k and
  encoding index i:

    * even i holds sin(k / 10000 ** (i / d_model))
    * odd i holds cos(k / 10000 ** ((i - 1) / d_model))

  Args:
    d_model: Number of columns in the encoding (embedding dimension).
    max_length: Number of rows in the encoding (positions 0 .. max_length-1).
  """

  def __init__(self, d_model, max_length):
    super().__init__()
    self.max_length = max_length
    self.d_model = d_model

  def forward(self):
    """Compute the positional-encoding table.

    Returns:
      A float32 tensor of shape (max_length, d_model).
    """
    # One divisor per sin/cos pair; for odd d_model this yields
    # ceil(d_model / 2) entries, i.e. one more cosine column than needed.
    denominator = torch.pow(10000, torch.arange(0, self.d_model, 2).float()/self.d_model)
    # Shape (max_length, 1) so the division broadcasts over the divisors.
    position = torch.arange(0, self.max_length).float().reshape(-1,1)
    even_PE = torch.sin(position/denominator)
    odd_PE = torch.cos(position/denominator)
    # Stacking on a new last axis and flattening interleaves the columns as
    # sin, cos, sin, cos, ...
    stacked = torch.stack((even_PE, odd_PE), dim=2)
    interleaved = stacked.flatten(start_dim=1)
    # Trim the surplus cosine column produced when d_model is odd; this is a
    # no-op for even d_model. A plain reshape to (max_length, d_model) would
    # raise for odd d_model because the element counts differ.
    stacked_PE = interleaved[:, :self.d_model]
    return  stacked_PE
  

# Keyword arguments make it explicit which size is which.
PE = PositionalEncoding(d_model=512, max_length=100)
# Call the module instance rather than .forward() so nn.Module's __call__
# machinery (e.g. registered hooks) is not bypassed.
PE()

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00],
        ...,
        [ 3.7961e-01, -9.2515e-01, -6.2536e-01,  ...,  9.9995e-01,
          1.0055e-02,  9.9995e-01],
        [-5.7338e-01, -8.1929e-01,  2.8505e-01,  ...,  9.9994e-01,
          1.0159e-02,  9.9995e-01],
        [-9.9921e-01,  3.9821e-02,  9.5015e-01,  ...,  9.9994e-01,
          1.0262e-02,  9.9995e-01]])