<a href="https://colab.research.google.com/github/AyoubMDL/transformers_from_scratch/blob/main/attention_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import math

Each word (token) in a sequence is represented by three vectors: a query, a key and a value.

In [2]:
# Seed PyTorch's global RNG so the random q/k/v below (and everything derived
# from them) are identical on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# seq_length rows (one per word in the sequence), hid_dim features per row.
seq_length, hid_dim = 4, 8
q = torch.randn(seq_length, hid_dim)  # queries
k = torch.randn(seq_length, hid_dim)  # keys
v = torch.randn(seq_length, hid_dim)  # values

## Mask

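The mask is used in the decoder: it prevents each position from attending to later positions, so a word can only look at itself and the words before it.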

In [8]:
# Binary causal pattern: entry (i, j) is 1 when j <= i and 0 above the diagonal.
mask = torch.ones(seq_length, seq_length).tril()
mask

tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 1.]])

In [9]:
# Turn the causal pattern into an additive mask: 0 on and below the diagonal
# (allowed positions), -inf strictly above it (future positions), so adding it
# to the attention scores drives the masked weights to 0 after the softmax.
#
# The mask is rebuilt from its shape in a single step instead of rewriting the
# 0/1 entries in place: the two-step in-place version is not idempotent
# (re-running the cell turns the freshly written zeros into -inf as well).
# torch.triu keeps the entries above the diagonal and sets the rest to 0.
mask = torch.triu(torch.full_like(mask, -torch.inf), diagonal=1)

In [10]:
# Show the additive mask: 0 where attention is allowed, -inf where it is blocked.
mask

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

In [11]:
def softmax(x):
    """Softmax over the last dimension of ``x``.

    Args:
        x: Tensor of scores with at least one dimension. Entries equal to
            ``-inf`` receive a weight of exactly 0, provided the same row
            also contains at least one finite entry.

    Returns:
        Tensor with the same shape as ``x`` whose slices along the last
        dimension are non-negative and sum to 1.
    """
    # Subtracting the per-row maximum leaves the result mathematically
    # unchanged but keeps exp() from overflowing to inf (and the ratio from
    # becoming NaN) when the scores are large.
    shifted = x - x.max(dim=-1, keepdim=True).values
    exp_scores = torch.exp(shifted)
    # keepdim=True lets the row sums broadcast against any number of leading
    # dimensions, so no transpose trick is needed.
    return exp_scores / exp_scores.sum(dim=-1, keepdim=True)

In [19]:
def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute softmax(q @ k^T / sqrt(d_k) + mask) @ v.

    Args:
        q: Queries, shape ``(..., len_q, d_k)``.
        k: Keys, shape ``(..., len_k, d_k)``.
        v: Values, shape ``(..., len_k, d_v)``.
        mask: Optional additive mask broadcastable to ``(..., len_q, len_k)``;
            use 0 for allowed positions and ``-inf`` for blocked ones.

    Returns:
        Tuple ``(out, attention)``: ``out`` is the attention-weighted
        combination of the value vectors, ``attention`` holds the weights
        produced by ``softmax``.

    Note:
        Normalization is delegated to the notebook's ``softmax`` helper.
        Inputs with leading batch dimensions therefore also require that
        helper to normalize over the last axis of an N-D tensor.
    """
    d_k = q.shape[-1]
    # Swap only the last two axes: unlike ``k.T`` this also works when k has
    # leading batch dimensions (``.T`` is only meant for 2-D tensors).
    scores = torch.matmul(q, k.transpose(-2, -1))
    # Scale by sqrt(d_k) so the score magnitude does not grow with d_k.
    scaled = scores / math.sqrt(d_k)

    if mask is not None:
        # Out-of-place add: an in-place ``+=`` fails when the mask broadcasts
        # the scores to a larger shape (e.g. a mask with an extra batch axis).
        scaled = scaled + mask

    attention = softmax(scaled)

    # Weight the value vectors by the attention each query pays to each key.
    out = torch.matmul(attention, v)

    return out, attention

In [21]:
# No mask is passed here, so every query position attends to every key position.
context_vectors, attention = scaled_dot_product_attention(q=q, k=k, v=v)

In [22]:
# One row per query position: the attention-weighted combination of the rows of v.
context_vectors

tensor([[-0.0629, -0.7108,  1.2975, -0.1208, -0.2076, -1.7786,  0.9569, -0.1328],
        [-0.0214, -0.6624,  1.1605,  0.2065, -0.3132, -1.8920,  0.7031, -0.0079],
        [ 0.3608, -0.0316,  0.8481,  0.0792, -0.2344, -0.4652, -0.5787,  0.6163],
        [ 0.5420,  0.4159,  0.6575,  0.1071, -0.1018,  0.4453, -1.4695,  0.9779]])

In [23]:
# Attention weights: row i holds the weights query i assigns to each key (each row sums to 1).
attention

tensor([[0.1490, 0.6234, 0.1139, 0.1136],
        [0.0710, 0.6071, 0.2469, 0.0750],
        [0.2467, 0.2406, 0.3402, 0.1725],
        [0.3837, 0.0058, 0.4339, 0.1765]])