**A simple self-attention mechanism without trainable weights**

In [1]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [2]:
# Use the second row of `inputs` (the token labelled "journey", x^2) as the example query.
input_query = inputs[1]
input_query

tensor([0.5500, 0.8700, 0.6600])

In [3]:
# First row of `inputs` (the token labelled "Your", x^1), used as the token to compare against.
input_1 = inputs[0]
input_1

tensor([0.4300, 0.1500, 0.8900])

In [None]:
# The dot product of the query with another token vector serves as an
# (unnormalized) similarity score between the two tokens.
torch.dot(input_query, input_1)

tensor(0.9544)

In [5]:
# The second input token (x^2) serves as the example query.
query = inputs[1]

# One score slot per input token. torch.empty leaves the memory uninitialized,
# which is fine here because the loop below overwrites every slot.
attn_scores_2 = torch.empty(len(inputs))

# Score each token (the query itself included) by its dot product with the query.
for idx, token in enumerate(inputs):
    attn_scores_2[idx] = torch.dot(query, token)

In [8]:
# Attention scores of the query token (x^2): entry i is the dot product of the
# query with input token i. Here the largest entry (index 1) is the query's score
# with itself. Note that a raw dot product also grows with vector magnitude, so a
# token is not guaranteed to score highest against itself in general.
attn_scores_2

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

In [9]:
# Normalize the raw scores into attention weights: softmax over the only
# axis (dim=0) gives positive values that sum to 1.
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)
attn_weights_2

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])

In [14]:
# Context vector of the query token: the sum of all input token vectors,
# each scaled by its attention weight.
query = inputs[1]

# Accumulator starts at zero and has the same shape as a single token vector.
context_vec_2 = torch.zeros(query.shape)

# Walk the weights and the token vectors in lockstep and accumulate the weighted sum.
for weight, token in zip(attn_weights_2, inputs):
    context_vec_2 += weight * token

In [15]:
# Context vector of the query token (x^2): the attention-weighted sum of all input rows.
context_vec_2

tensor([0.4419, 0.6515, 0.5683])

In [21]:
# Pairwise attention scores: entry [i, j] is the dot product of token i with token j.
# The matrix size is derived from `inputs` instead of being hard-coded, so the cell
# stays correct if tokens are added to or removed from `inputs`.
num_tokens = inputs.shape[0]
attn_scores = torch.zeros(num_tokens, num_tokens)

for i, x_i in enumerate(inputs):  # row index: the token acting as the query
    for j, x_j in enumerate(inputs):  # column index: the token it is compared with
        attn_scores[i, j] = torch.dot(x_i, x_j)

In [23]:
# Row i holds the attention scores of token i against every token (itself included),
# so attn_scores_2 is the second row of this matrix. The matrix is symmetric because
# dot(x_i, x_j) == dot(x_j, x_i). The diagonal holds each token's score with itself;
# it is NOT always the largest entry in its row (in the output below it is the
# largest only in the first two rows), because unnormalized dot products also depend
# on vector magnitude.
attn_scores

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [24]:
# Same pairwise scores as the nested-loop version, computed as one matrix product
# (each row of `inputs` dotted with every row) instead of Python loops.
attn_scores = torch.matmul(inputs, inputs.T)
attn_scores

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [26]:
# Turn scores into attention weights. The softmax runs along the last axis
# (dim=-1, which is dim=1 for this 2-D matrix), so every row sums to 1.
attn_weights = attn_scores.softmax(dim=-1)
attn_weights

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

In [None]:
# Context vectors for all tokens at once: row i is the sum of the input rows
# weighted by row i of attn_weights, i.e. the context vector of token i.
context_vec = torch.matmul(attn_weights, inputs)
context_vec

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])

**Implementing self-attention with trainable weights**