<a href="https://colab.research.google.com/github/DavoodSZ1993/Dive-into-Deep-Learning-Notes-/blob/main/11_2_attention_pooling_notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install d2l==1.0.0-alpha1.post0 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.0/93.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.2/121.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.9/84.9 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## 11.2 Attention Pooling by Similarity

* `torch.sort(input)`: Sorts the elements of the `input` tensor along a given dimension (the last one by default) in **ascending order** by value; pass `descending=True`, as in the cell below, to sort in descending order.

In [2]:
import torch

X = torch.tensor([[1, 2, 3, 4],
                  [5, 6, 7, 8],
                  [9, 10, 11, 12],
                  [13, 14, 15, 16]])
# torch.sort works along the last dimension by default, so every row is sorted
# independently; descending=True reverses the default ascending order.
# The values are bound to `sorted_values` rather than `sorted` so the Python
# builtin `sorted` is not shadowed for the rest of the notebook.
sorted_values, indices = torch.sort(X, descending=True)
sorted_values, indices

(tensor([[ 4,  3,  2,  1],
         [ 8,  7,  6,  5],
         [12, 11, 10,  9],
         [16, 15, 14, 13]]),
 tensor([[3, 2, 1, 0],
         [3, 2, 1, 0],
         [3, 2, 1, 0],
         [3, 2, 1, 0]]))

* `torch.rand(size)`: Returns a tensor filled with random numbers from a uniform distribution on the interval $[0, 1)$.

In [3]:
# Two samples drawn from the uniform distribution on [0, 1).
torch.rand(2)

tensor([0.1970, 0.4328])

* `torch.randn(size)`: Returns a tensor filled with random numbers from a normal distribution with mean 0 and variance 1 (also called the standard normal distribution).

In [4]:
# Two samples drawn from the standard normal distribution.
torch.randn(2)

tensor([1.3805, 1.2968])

In [5]:
# Toy setup: n random training locations scaled into [0, 5) and sorted
# ascending, plus the integer query points 0, 1, 2, 3.
n = 4
x_train, _ = (torch.rand(n) * 5).sort()
x_val = torch.arange(4)

In [6]:
# Turn the training points into a column vector and the query points into a
# row vector so that subtracting them broadcasts to one entry per pair.
x_train = torch.reshape(x_train, (-1, 1))
x_val = torch.reshape(x_val, (1, -1))
x_train, x_val

(tensor([[1.2338],
         [1.6897],
         [2.6793],
         [3.5953]]),
 tensor([[0, 1, 2, 3]]))

In [7]:
# Broadcasting a column (x_train) against a row (x_val) yields a matrix:
# each row corresponds to a key (training point), each column to a query.
dist = torch.sub(x_train, x_val)
dist

tensor([[ 1.2338,  0.2338, -0.7662, -1.7662],
        [ 1.6897,  0.6897, -0.3103, -1.3103],
        [ 2.6793,  1.6793,  0.6793, -0.3207],
        [ 3.5953,  2.5953,  1.5953,  0.5953]])

In [9]:
# Defining Kernels
def gaussian(x):
  """Gaussian kernel: exp(-x^2 / 2), evaluated elementwise on tensor `x`."""
  squared = torch.square(x)
  return torch.exp(-squared / 2)

def boxcar(x):
  """Boxcar kernel: boolean mask that is True where |x| is strictly below 1."""
  magnitude = x.abs()
  return magnitude.lt(1.0)

def constant(x):
  """Constant kernel: a tensor of ones with the same shape as `x`.

  The `0 * x` term only serves to carry over the shape of the argument.
  It must use the parameter `x`; referring to the notebook-level tensor `X`
  would return a result shaped like that unrelated tensor instead.
  """
  return 1.0 + 0 * x

def epanechikov(x):
  """Kernel computing max(1 - |x|, 0) elementwise on tensor `x`."""
  floor = torch.zeros_like(x)
  return torch.maximum(1 - x.abs(), floor)

In [21]:
# Available similarity kernels, selected by position (index 0 is the Gaussian).
kernels = (
    gaussian,
    boxcar,
    constant,
    epanechikov,
)

In [22]:
def f(x):
  """Noise-free target function: 2*sin(x) + x, evaluated elementwise."""
  sine_part = 2 * torch.sin(x)
  return sine_part + x

# Noisy training set: n inputs in [0, 5), sorted ascending, whose targets are
# f(x) plus standard-normal noise.
n = 40
x_train, _ = (torch.rand(n) * 5).sort()
y_train = f(x_train) + torch.randn(n)
# Validation grid with step 0.1 starting at 0; its targets are the noise-free f.
x_val = torch.arange(0, 5, 0.1)
y_val = f(x_val)

x_train.shape, x_val.shape

(torch.Size([40]), torch.Size([50]))

* `torch.Tensor.type(dtype=None)`: Returns the type if *dtype* is not provided; otherwise casts this tensor to the specified type.

In [24]:
# Nadaraya-Watson attention weights
# Pairwise differences via broadcasting: rows follow x_train, columns follow x_val.
dists = x_train.reshape(-1, 1) - x_val.reshape(1, -1)
# Kernel score a(q, k) for every pair, cast to float32.
k = kernels[0](dists).type(torch.float32)
# Normalise by the sum over dim 0 so each column (one query) is divided by its total.
attention_w = k / k.sum(dim=0)
k.shape

torch.Size([40, 50])

In [8]:
x = torch.tensor([[1, 0.2],
                  [3, 0.4]])

# Elementwise |x| < 1 gives a boolean mask; cast to a number, False is 0 and True is 1.
Y = x.abs().lt(1)
Y

tensor([[False,  True],
        [False,  True]])