In [33]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5) 
   [0.05, 0.80, 0.55]] # step     (x^6)
)

## Calculating Attention scores

In [34]:
query=inputs[1]

attn_scores2=torch.empty(inputs.shape[0])

for id,val in enumerate(inputs):
    attn_scores2[id]=torch.dot(query,val)

print(attn_scores2)


tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


## Very basic normalization

In [35]:
attn_scores2_nor=attn_scores2/attn_scores2.sum()

In [36]:
attn_scores2_nor

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])

In [37]:
attn_scores2_nor.sum()

tensor(1.0000)

## Naive Softmax

In [38]:
def softmax_naive(x):
    return torch.exp(x)/torch.exp(x).sum(dim=0)

attn_weights_2_naive = softmax_naive(attn_scores2)

print("Attention weights:", attn_weights_2_naive)
print("Sum:", attn_weights_2_naive.sum())

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)


## PyTorch Softmax

In [39]:
attn_weights_2_pt=torch.softmax(attn_scores2,dim=0)

In [40]:
attn_weights_2_pt

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])

In [41]:
attn_weights_2_pt.sum()

tensor(1.)

## Calculating Context Vector 

In [42]:
query=inputs[1]

context_vector2=torch.empty(query.shape)

for id,val in enumerate(inputs):
    print(f"id : {id} val : {val} attention score : {attn_weights_2_pt[id]}")
    context_vector2+=attn_weights_2_pt[id]*val
    print(f"context_vector: {context_vector2}")

print(f"Final Context vector for second vector : {context_vector2}")

id : 0 val : tensor([0.4300, 0.1500, 0.8900]) attention score : 0.13854756951332092
context_vector: tensor([0.0596, 0.0208, 0.4844])
id : 1 val : tensor([0.5500, 0.8700, 0.6600]) attention score : 0.2378913015127182
context_vector: tensor([0.1904, 0.2277, 0.6414])
id : 2 val : tensor([0.5700, 0.8500, 0.6400]) attention score : 0.23327402770519257
context_vector: tensor([0.3234, 0.4260, 0.7907])
id : 3 val : tensor([0.2200, 0.5800, 0.3300]) attention score : 0.12399158626794815
context_vector: tensor([0.3507, 0.4979, 0.8316])
id : 4 val : tensor([0.7700, 0.2500, 0.1000]) attention score : 0.10818186402320862
context_vector: tensor([0.4340, 0.5250, 0.8424])
id : 5 val : tensor([0.0500, 0.8000, 0.5500]) attention score : 0.15811361372470856
context_vector: tensor([0.4419, 0.6515, 0.9294])
Final Context vector for second vector : tensor([0.4419, 0.6515, 0.9294])


## Calculating context vector for all inputs

In [43]:
attn_score_matrix=torch.zeros(inputs.shape[0],inputs.shape[0])

for i,val in enumerate(inputs):
    for j,jval in enumerate(inputs):
        attn_score_matrix[i,j]+=torch.dot(val,jval)


print(attn_score_matrix)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


## attention score using optimized approach

In [44]:
attn_score=inputs @ inputs.T

In [45]:
attn_score

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [57]:
attn_weight=torch.softmax(attn_score,dim=-1)

In [58]:
final_context_vector_matrix=torch.zeros(inputs.shape)

for i,val in enumerate(inputs):
    for j,jval in enumerate(inputs):
        final_context_vector_matrix[i]+=attn_weight[i,j]*jval

In [59]:
final_context_vector_matrix

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])

## optimized way to find context vector

In [60]:
op_context_vec=attn_weight @ inputs

In [61]:
op_context_vec

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])