### Using PyTorch to Implement Self-Attention

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import torch.nn.functional as F


In [2]:
# Build a toy vocabulary from a single example sentence.
# NOTE: `word` holds the whole sentence, not one word; the name is kept
# because later cells read it.
word = "I will be an AI/ML engineer insha Allah"

# Map index -> token. `dict.fromkeys` drops repeated tokens while keeping
# first-seen order, so every distinct token gets exactly one contiguous id.
# Enumerating the raw token list instead would give a repeated word two ids;
# after the mapping is reversed the vocabulary size would then be smaller than
# the largest id and the embedding lookup would index out of range.
word_vocab = dict(enumerate(dict.fromkeys(word.split())))

In [3]:
# Display the index -> token mapping
word_vocab

{0: 'I',
 1: 'will',
 2: 'be',
 3: 'an',
 4: 'AI/ML',
 5: 'engineer',
 6: 'insha',
 7: 'Allah'}

In [4]:
# Reverse the dictionary 
def reverse_dict(input_dict):
    """Return a new dict with the keys and values of ``input_dict`` swapped.

    Every value of ``input_dict`` becomes a key of the result, mapped to the
    key it was stored under. When several keys share the same value, the key
    that comes last in iteration order is the one that is kept.
    """
    swapped = {}
    for original_key, original_value in input_dict.items():
        swapped[original_value] = original_key
    return swapped

In [5]:
# Build the token -> index lookup by reversing the index -> token mapping
vocab = reverse_dict(word_vocab)

In [6]:
# Display the token -> index vocabulary
vocab

{'I': 0,
 'will': 1,
 'be': 2,
 'an': 3,
 'AI/ML': 4,
 'engineer': 5,
 'insha': 6,
 'Allah': 7}

In [7]:
# Width of each token's embedding vector
embedding_dim = 4

# Number of distinct tokens, i.e. how many rows the embedding table needs
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

Vocabulary size: 8


In [8]:
# Fix the RNG seed so the randomly initialised embedding table (and any random
# tensors drawn after it in a top-to-bottom run) come out the same on every
# Restart & Run All. Without it the printed numbers change on each execution.
torch.manual_seed(42)

# Embedding table with one row of size `embedding_dim` per vocabulary entry
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
print(embedding_layer)

Embedding(8, 4)


In [9]:
# Look up the vocabulary id of every token in the sentence, in order.
# The loop variable is called `token` so it does not shadow the sentence
# variable `word` it iterates over.
token_indices = [vocab[token] for token in word.split()]

# Wrap the ids in a tensor so they can be fed to the embedding layer
input_indices = torch.tensor(token_indices)

In [10]:
# Look up the embedding vector of every token in the sentence
X = embedding_layer(input_indices)

# Display X (one row per token)
print(X)

tensor([[-0.0590,  0.5473,  0.6190,  0.0151],
        [ 0.3312, -1.0329, -0.6412,  0.7122],
        [ 0.3374,  0.1075, -0.0401,  0.6973],
        [-0.3271,  0.8767,  0.6305,  0.6581],
        [ 1.4292,  1.6924,  0.3518, -2.2700],
        [-2.2555,  1.1181,  0.0195,  0.1689],
        [ 0.9794,  2.2261,  1.6699, -1.5310],
        [ 0.6123,  0.6297,  0.1830, -0.0592]], grad_fn=<EmbeddingBackward0>)


In [11]:
# Check the shape of X: (number of tokens, embedding_dim)
X.shape

torch.Size([8, 4])

In [12]:
# Show the token embeddings again before projecting them to Q, K and V
print(X)

tensor([[-0.0590,  0.5473,  0.6190,  0.0151],
        [ 0.3312, -1.0329, -0.6412,  0.7122],
        [ 0.3374,  0.1075, -0.0401,  0.6973],
        [-0.3271,  0.8767,  0.6305,  0.6581],
        [ 1.4292,  1.6924,  0.3518, -2.2700],
        [-2.2555,  1.1181,  0.0195,  0.1689],
        [ 0.9794,  2.2261,  1.6699, -1.5310],
        [ 0.6123,  0.6297,  0.1830, -0.0592]], grad_fn=<EmbeddingBackward0>)


In [13]:
# Projection matrices for queries, keys and values,
# each of shape (embedding_dim, embedding_dim).
# `requires_grad=True` makes autograd track them so they are actually
# trainable; a plain `torch.randn(...)` tensor would stay a fixed constant.
W_Q = torch.randn(embedding_dim, embedding_dim, requires_grad=True)
W_K = torch.randn(embedding_dim, embedding_dim, requires_grad=True)
W_V = torch.randn(embedding_dim, embedding_dim, requires_grad=True)

In [14]:
# Inspect the query projection matrix
print(W_Q)

tensor([[-1.2928, -0.7106, -0.7896,  1.2144],
        [ 0.9387, -1.5529,  1.2481, -0.3445],
        [-0.3503, -1.9532,  0.5707, -0.5729],
        [-1.9309, -0.3712, -0.7140,  0.2926]])


In [15]:
# Project the token embeddings into query, key and value spaces.
# Shapes for each projection: (8, 4) @ (4, 4) -> (8, 4)
Q = torch.matmul(X, W_Q)
K = torch.matmul(X, W_K)
V = torch.matmul(X, W_V)

In [16]:
# Display the query, key and value projections, each under its own heading
print("Query (Q):\n", Q)
print("\nKey (K):\n", K)
print("\nValue (V):\n", V)

Query (Q):
 tensor([[ 0.3440, -2.0226,  1.0721, -0.6104],
        [-2.5485,  2.3566, -2.4252,  1.3337],
        [-1.6678, -0.5872, -0.6531,  0.5997],
        [-0.2459, -2.6048,  1.2423, -0.8678],
        [ 4.0012, -3.4882,  2.8055,  0.2868],
        [ 3.6324, -0.2343,  3.0671, -3.0859],
        [ 3.1949, -6.8461,  4.0512, -0.9822],
        [-0.1503, -1.7485,  0.4492,  0.4044]], grad_fn=<MmBackward0>)

Key (K):
 tensor([[-2.0337e-01,  1.7661e-01,  2.6429e-01,  2.1793e-01],
        [ 8.6337e-01,  3.2638e-01, -1.6291e+00,  1.6517e-04],
        [ 1.1916e+00,  4.6152e-01, -7.4304e-01,  2.6999e-01],
        [ 5.1300e-01, -3.0364e-01,  3.7816e-01, -1.1556e-01],
        [-1.3676e-01,  2.0521e+00,  2.0122e+00,  1.8141e+00],
        [-9.5498e-01, -3.6832e+00,  3.3265e+00, -2.3656e+00],
        [-6.1319e-01,  2.0655e+00,  1.6451e+00,  1.8918e+00],
        [ 8.2418e-01,  9.1279e-01,  1.6272e-02,  7.1570e-01]],
       grad_fn=<MmBackward0>)

Value (V):
 tensor([[ 0.1780,  1.0484, -0.0150,  0.3694],

In [18]:
# Raw attention scores: dot product of every query with every key (Q @ K^T).
# Entry [i, j] says how strongly token i attends to token j before scaling.
attention_scores = torch.matmul(Q, K.T)
print(attention_scores)

tensor([[ -0.2768,  -2.1098,  -1.4849,   1.2666,  -3.1476,  12.1314,  -3.7797,
          -1.9821],
        [  0.5842,   2.5198,   0.2129,  -3.0942,   2.7240, -17.4684,   4.9638,
           0.9658],
        [  0.1936,  -0.5675,  -1.6111,  -0.9936,  -1.2031,   0.1642,  -0.1300,
          -1.4919],
        [ -0.2708,  -3.0864,  -2.6526,   1.2349,  -4.3862,  16.0143,  -4.8275,
          -3.1812],
        [ -0.6258,  -2.2542,   1.1509,   4.1396,  -1.5398,  17.6807,  -4.5006,
           0.3646],
        [ -0.6420,  -1.9373,   1.1082,   3.4511,  -0.4042,  14.8967,  -3.5037,
           0.6212],
        [ -1.0022,  -6.0759,  -2.6279,   5.3632,  -8.1157,  37.9643, -11.2934,
          -4.2529],
        [ -0.0714,  -1.4321,  -1.2106,   0.5770,  -1.9301,   7.1212,  -2.0154,
          -1.4231]], grad_fn=<MmBackward0>)


In [20]:
# Scale the scores by sqrt(d_k), where d_k is the width of the key vectors.
# Reading d_k from K keeps the scaling correct even if the key projection is
# ever given a width different from `embedding_dim`.
d_k = K.shape[-1]
scaled_scores = attention_scores / (d_k ** 0.5)
scaled_scores

tensor([[-0.1384, -1.0549, -0.7425,  0.6333, -1.5738,  6.0657, -1.8898, -0.9910],
        [ 0.2921,  1.2599,  0.1065, -1.5471,  1.3620, -8.7342,  2.4819,  0.4829],
        [ 0.0968, -0.2838, -0.8056, -0.4968, -0.6015,  0.0821, -0.0650, -0.7460],
        [-0.1354, -1.5432, -1.3263,  0.6174, -2.1931,  8.0072, -2.4138, -1.5906],
        [-0.3129, -1.1271,  0.5754,  2.0698, -0.7699,  8.8404, -2.2503,  0.1823],
        [-0.3210, -0.9686,  0.5541,  1.7255, -0.2021,  7.4483, -1.7519,  0.3106],
        [-0.5011, -3.0379, -1.3140,  2.6816, -4.0579, 18.9821, -5.6467, -2.1265],
        [-0.0357, -0.7161, -0.6053,  0.2885, -0.9650,  3.5606, -1.0077, -0.7116]],
       grad_fn=<DivBackward0>)

In [22]:
# Normalise each row of scaled scores into attention weights that sum to 1
# (softmax over the last dimension, i.e. across the keys)
attention_weights = F.softmax(input=scaled_scores, dim=-1)
print(attention_weights)

tensor([[2.0011e-03, 8.0029e-04, 1.0938e-03, 4.3293e-03, 4.7632e-04, 9.9010e-01,
         3.4725e-04, 8.5306e-04],
        [5.6559e-02, 1.4888e-01, 4.6977e-02, 8.9900e-03, 1.6488e-01, 6.7989e-06,
         5.0527e-01, 6.8448e-02],
        [1.8500e-01, 1.2645e-01, 7.5039e-02, 1.0219e-01, 9.2021e-02, 1.8230e-01,
         1.5736e-01, 7.9648e-02],
        [2.9054e-04, 7.1089e-05, 8.8311e-05, 6.1682e-04, 3.7117e-05, 9.9880e-01,
         2.9767e-05, 6.7800e-05],
        [1.0568e-04, 4.6816e-05, 2.5692e-04, 1.1450e-03, 6.6915e-05, 9.9819e-01,
         1.5226e-05, 1.7341e-04],
        [4.1985e-04, 2.1970e-04, 1.0073e-03, 3.2501e-03, 4.7285e-04, 9.9374e-01,
         1.0039e-04, 7.8958e-04],
        [3.4558e-09, 2.7340e-10, 1.5329e-09, 8.3326e-08, 9.8596e-11, 1.0000e+00,
         2.0130e-11, 6.8023e-10],
        [2.4272e-02, 1.2292e-02, 1.3732e-02, 3.3565e-02, 9.5829e-03, 8.8503e-01,
         9.1824e-03, 1.2347e-02]], grad_fn=<SoftmaxBackward0>)


In [24]:
# Final output: for each token, the attention-weighted mix of the value vectors
output = torch.matmul(attention_weights, V)
print(output)

tensor([[-1.9398,  1.8761, -3.6498,  2.2542],
        [ 1.9508,  0.8007,  2.5593,  0.5141],
        [ 0.4631,  0.9066,  0.3508,  0.8758],
        [-1.9602,  1.8832, -3.6841,  2.2657],
        [-1.9588,  1.8830, -3.6819,  2.2654],
        [-1.9478,  1.8793, -3.6634,  2.2601],
        [-1.9628,  1.8840, -3.6885,  2.2672],
        [-1.6650,  1.7673, -3.1884,  2.0972]], grad_fn=<MmBackward0>)
