In [5]:
import torch

In [6]:
# We first implement a simplified self-attention mechanism and then, step by step,
# build up to the actual self-attention mechanism.
# Each row below is the 3-dimensional embedding of one token.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^1: your
    [0.55, 0.87, 0.66],  # x^2: journey
    [0.57, 0.85, 0.64],  # x^3: starts
    [0.22, 0.58, 0.33],  # x^4: with
    [0.77, 0.25, 0.10],  # x^5: one
    [0.05, 0.80, 0.55],  # x^6: step
])

In [7]:
# How are the attention weights computed when building the context vector for "journey"?
#   1. Take the embedding of "journey" as the query.
#   2. Take the dot product of the query with the embedding of every input token;
#      each dot product is that token's attention score with respect to the query.
query = inputs[1]  # embedding of "journey" (second row of inputs)

In [8]:
# Display the query vector (row 1 of inputs, the embedding of "journey").
query

tensor([0.5500, 0.8700, 0.6600])

In [9]:
# Uninitialised buffer with one slot per input token; every slot is overwritten
# when the scores are computed.
attention_scores = torch.empty(len(inputs))

In [10]:
# Score of token i = dot product of the query with that token's embedding.
# torch.dot sums the element-wise products of the two vectors.
for token_idx, token_embedding in enumerate(inputs):
    attention_scores[token_idx] = torch.dot(query, token_embedding)

In [11]:
# Display the raw (un-normalised) attention scores of every token against the query.
attention_scores 

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

In [12]:
# The raw scores are normalised into weights that sum to 1: this helps training
# stability and makes the values easier to interpret.
# Simplest normalisation: divide each score by the sum of all scores.
scores_sum = attention_scores.sum()
attention_weights = attention_scores / scores_sum

In [13]:
# The normalized attention scores are what we call the attention weights.

In [14]:
# Display the sum-normalised attention weights.
attention_weights

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])

In [15]:
# Sanity check: the normalised weights should add up to 1.
print(attention_weights.sum())

tensor(1.0000)


In [16]:
# The higher the dot product, the greater the similarity between the two vectors.

In [17]:
# In practice it is preferred to normalise with softmax rather than a plain sum.
def softmax_naive(x):
    """Textbook softmax along dimension 0: exp(x) / sum(exp(x)).

    Args:
        x: tensor of scores.

    Returns:
        Tensor of the same shape as ``x`` whose entries along dim 0 are
        positive and sum to 1.

    Note:
        This is the direct formula with no numerical safeguards; ``exp`` can
        overflow for large scores, so prefer ``torch.softmax`` in real code.
    """
    exponentials = torch.exp(x)
    return exponentials / exponentials.sum(dim=0)


In [18]:
# Re-compute the attention weights, this time normalising the scores with softmax_naive.
attention_weights = softmax_naive(attention_scores)

In [19]:
# Display the weights produced by softmax_naive.
attention_weights

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])

In [20]:
# Sanity check: softmax output should add up to 1.
print(attention_weights.sum())

tensor(1.)


In [22]:
# The naive formula can still run into overflow/underflow problems, so it is best
# to use PyTorch's built-in softmax. dim=0 normalises over the 1-D score vector.
attention_weights = attention_scores.softmax(dim=0)

In [23]:
# Print the weights computed with PyTorch's softmax.
print(attention_weights)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [24]:
# Compute the context vector for the second input token (row 1 of inputs).
query_2 = inputs[1]


In [29]:
# Display the embedding used as the query for the second token.
query_2

tensor([0.5500, 0.8700, 0.6600])

In [41]:
# Zero-filled accumulator with the same shape as the query embedding.
context_vector_z2 = torch.zeros(query_2.size())

In [42]:
# Display the accumulator for the context vector.
context_vector_z2

tensor([0., 0., 0.])

In [43]:
# Context vector of the second token = sum over all tokens of
# (attention weight of that token) * (embedding of that token).
# The accumulator is reset here so that re-running this cell does not add a
# second weighted sum on top of a previous result (the cell stays idempotent).
context_vector_z2 = torch.zeros(query_2.shape)
for weight, input_embedding in zip(attention_weights, inputs):
    context_vector_z2 += weight * input_embedding

In [45]:
# Report the context vector computed for the second token.
print("context vector of the 2nd token of inputs called 'journey' is ")
print(context_vector_z2)

context vector of the 2nd token of inputs called 'journey' is 
tensor([0.4419, 0.6515, 0.5683])


In [53]:
# Number of rows in inputs, i.e. the number of tokens.
inputs.shape[0]

6

In [62]:
# Now let's generalize and compute the context vectors for all input tokens simultaneously.
# Start with a square (tokens x tokens) matrix of zeros to hold every pairwise score.
attention_score_for_all_tokens = torch.zeros(len(inputs), len(inputs))

In [63]:
# Display the score matrix.
attention_score_for_all_tokens

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [64]:
# Entry (i, j) is the dot product of the embeddings of token i and token j.
for i, x_i in enumerate(inputs):        # rows
    for j, x_j in enumerate(inputs):    # columns
        attention_score_for_all_tokens[i, j] = torch.dot(x_i, x_j)

In [65]:
# Display the pairwise attention scores.
attention_score_for_all_tokens

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [67]:
# The nested for-loops can be replaced by a single, more efficient matrix
# multiplication: inputs times its own transpose (.T) yields every pairwise dot product.
attention_score_for_all_tokens = torch.matmul(inputs, inputs.T)

In [68]:
# Display the scores computed via matrix multiplication.
attention_score_for_all_tokens

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [71]:
# Softmax over the last dimension (dim=-1): each row of scores is normalised
# independently into a row of attention weights.
attention_weight_for_all_tokens = attention_score_for_all_tokens.softmax(dim=-1)

In [72]:
# Display the attention weights; row i holds the weights used for token i.
attention_weight_for_all_tokens

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

In [73]:
# Context vectors for all tokens at once: each output row is the
# attention-weighted sum of the input embeddings.
context_vector = torch.matmul(attention_weight_for_all_tokens, inputs)

In [75]:
# Display the context vectors, one row per input token.
context_vector
# Finally, we have built our simple self-attention mechanism.

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])