## Attention Mechanism

In [4]:
import numpy as np
from numpy import array
from numpy import random
from numpy import dot
from scipy.special import softmax

In [5]:
# toy 3-dimensional embeddings for a four-word sequence;
# note that word_3 equals word_1 + word_2 element-wise
word_1, word_2, word_3, word_4 = (
    np.array(components)
    for components in ([1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1])
)

In [6]:
# build the (4, 3) input matrix: one row per word, in sequence order
words = np.stack((word_1, word_2, word_3, word_4), axis=0)

In [7]:
# display the stacked embedding matrix (one row per word vector)
words

array([[1, 0, 0],
       [0, 1, 0],
       [1, 1, 0],
       [0, 0, 1]])

In [8]:
# fix the global NumPy seed so the three projection matrices are reproducible,
# then draw them in the order query -> key -> value; every entry is an
# integer sampled from {0, 1, 2}
random.seed(42)
W_Q, W_K, W_V = (random.randint(0, 3, size=(3, 3)) for _ in range(3))

In [14]:
# project every word vector through each weight matrix to obtain its
# query, key and value representation (one row per word in each result)
Q, K, V = (np.matmul(words, weight) for weight in (W_Q, W_K, W_V))

In [15]:
# raw attention scores: entry [i, j] is the dot product of query i with key j
scores = np.matmul(Q, K.T)

In [16]:
# display the raw (unscaled) score matrix: row i holds query i scored against every key
scores

array([[ 8,  2, 10,  2],
       [ 4,  0,  4,  0],
       [12,  2, 14,  2],
       [10,  4, 14,  3]])

In [17]:
# divide the scores by the square root of the key dimension, then apply a
# row-wise softmax so the weights belonging to each query sum to one
d_k = K.shape[1]
weights = softmax(scores / d_k ** 0.5, axis=1)

In [18]:
# display the attention weights; softmax was taken along axis=1, so each row sums to 1
weights

array([[2.36089863e-01, 7.38987555e-03, 7.49130386e-01, 7.38987555e-03],
       [4.54826323e-01, 4.51736775e-02, 4.54826323e-01, 4.51736775e-02],
       [2.39275049e-01, 7.43870015e-04, 7.59237211e-01, 7.43870015e-04],
       [8.99501754e-02, 2.81554063e-03, 9.05653685e-01, 1.58059922e-03]])

In [19]:
# attention output: each row is the weight-averaged combination of the
# value vectors for the corresponding query
attention = np.matmul(weights, V)

In [20]:
# print the final attention output (one row per input word)
print(attention)

[[0.98522025 1.74174051 0.75652026]
 [0.90965265 1.40965265 0.5       ]
 [0.99851226 1.75849334 0.75998108]
 [0.99560386 1.90407309 0.90846923]]
