####  Exploring Attention Mechanisms

In [None]:
import torch
import numpy as np

# Toy embedding matrix: one row per token in the context, one column per
# embedding dimension, i.e. shape (n_tokens, embedding_dim) = (6, 3).
inputs = torch.tensor([
  [0.46, 0.86, 0.83],
  [0.36, 0.74, 0.78],
  [0.37, 0.94, 0.63],
  [0.64, 0.52, 0.94],
  [0.96, 0.84, 0.31],
  [0.53, 0.74, 0.18],
])

# The query token is the second token in the context (row index 1).
query_token = inputs[1]

## This vector holds the dot product results between each embedding vector
## for each token in our context and the embedding vector of our query token,
## our query token being the current token we're using to try and predict what word
## should come next.
##
## Entry i equals torch.dot(query_token, inputs[i]). The matrix-vector product
## below computes all n dot products in one call instead of filling an
## uninitialized buffer row by row in a Python loop.
attention_score_vector = inputs @ query_token

print(torch.round(attention_score_vector, decimals=4))

tensor([1.4494, 1.2856, 1.3202, 1.3484, 1.2090, 0.8788])


#### Now, we wish to normalize the attention score vector
##### Normalizing rescales the scores so they sum to 1, which keeps the values small and helps with optimization.

In [4]:
# Plain sum-normalization: divide every score by the total of all scores,
# so the resulting weights add up to 1.
attention_score_vector_normalized = attention_score_vector / attention_score_vector.sum()
print(attention_score_vector)
print(attention_score_vector_normalized)

# Softmax-based normalization, using torch's built-in implementation as the
# more stable alternative to hand-rolling the exponentials.
# Softmax formula: softmax(x)_i = e^{x_i} / sum_j e^{x_j}
# dim=0 normalizes across the entries of this 1-D score vector.
torch_normalized_attention_score = attention_score_vector.softmax(dim=0)
print("Torch softmax'd", torch_normalized_attention_score)

tensor([1.4494, 1.2856, 1.3202, 1.3484, 1.2090, 0.8788])
tensor([0.1935, 0.1716, 0.1762, 0.1800, 0.1614, 0.1173])
Torch softmax'd tensor([0.2007, 0.1704, 0.1764, 0.1814, 0.1578, 0.1134])


#### Now we want to go through each value of our normalized attention score vector, multiply it by the corresponding embedding vector in our inputs array, and add each product to a vector that starts as all zeros. The result is the context vector for the second token, our query token.

##### attention_score_vector_unnormalized: the dot product of query_token [of shape (d,)] with each input token embedding vector [the array of embedding vectors is of shape (n, d)], which creates a new vector of shape (n,)

##### attention_score_vector: the normalized version of attention_score_vector_unnormalized, obtained by applying torch.softmax(vector, dim=0) to the unnormalized vector

##### context vector: take each element in your normalized attention score vector [of shape (n,)] for the current query token, multiply it by the corresponding input token embedding vector [the array of embedding vectors is of shape (n, d)], and add it to a vector initially filled with all zeros [of shape (d)]

In [5]:
# Display the query token's embedding: the second row of `inputs` (index 1).
query_token

tensor([0.3600, 0.7400, 0.7800])