## Self Attention Explained

In [37]:
import numpy as np
import math

L,dim_v,dim_k = 4,8,8 # L = 4 Since we are using only 4 words for illustration. dim_v and dim_k are set to 8(Just for illustration).count

#Setting q,v,k
q = np.random.randn(L,dim_k)
k = np.random.randn(L,dim_k)
v = np.random.randn(L,dim_v)

### Here each row of the vectors(Query,Key,Value) corresponds to each word in the sentence.

In [38]:
print("===================== Query Vector q =========================")
print(q)
print("===================== Key Vector k ===========================")
print(k)
print("===================== Value Vector v =========================")
print(v)

[[ 0.27710041  0.03188016  1.44596882 -1.34205436  0.14321269  0.27645839
   0.2120455  -0.03424386]
 [-1.68562268  1.35760768  0.30797969 -2.00162779 -0.88145719 -1.16567825
  -0.42736467 -0.86933783]
 [ 0.97788076 -0.08125846 -1.12267725  1.27057746 -0.03749489  0.37929365
  -1.45820779  0.2642624 ]
 [ 0.74134556 -1.02667713 -0.48092183  1.50016204 -0.18407928  0.49035576
   0.43729467 -0.60433436]]
[[-1.09750997  0.60600674 -1.34702431 -0.00707761 -0.32464676  0.87536513
   0.93560748  0.20497881]
 [-1.20536592  0.7792677   0.29531355 -0.80187422 -1.30155823 -0.62707862
   0.63349846 -0.30552488]
 [-0.61109375 -1.48863168  1.2227034   0.50665623 -2.35235382 -0.86774457
   0.35195043 -0.33173589]
 [ 0.30374713 -1.57928372 -0.63721208  1.94223431  0.32633268 -1.55898264
  -1.68295535  0.11461672]]
[[-0.4981263  -0.11508445  0.8109274   0.10882082 -0.52270394  1.20622013
   0.66105516 -0.21629269]
 [ 0.5561423  -2.27664162 -0.22577993  0.28111198  0.53082793 -1.00733069
  -2.64668928 -

### Self Attention 

$$
\text{self attention} = softmax\bigg(\frac{Q.K^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention}.V
$$ 
 

In [39]:
np.matmul(q,k.T)

array([[-1.83617689,  0.97904083,  0.38044587, -4.23919907],
       [ 0.95974443,  6.65884549,  1.59451584, -4.59071041],
       [-0.58514505, -3.78596764, -2.04737915,  5.48934073],
       [-0.02435007, -2.64485003,  1.60925565,  3.43697   ]])

In [40]:
# why we need sqrt(dim_k) in denominator
q.var() , k.var() , np.matmul(q,k.T).var()


(0.8353093225875383, 0.9690186755983412, 10.10051644599982)

In [41]:
# So inorder to reduce variance we use sqrt in denominator 
scaled = (np.matmul(q,k.T))/np.sqrt(dim_k)
q.var() , k.var() , scaled.var()

(0.8353093225875383, 0.9690186755983412, 1.2625645557499774)

In [42]:
scaled

array([[-0.64918657,  0.3461432 ,  0.13450793, -1.49878321],
       [ 0.3393209 ,  2.3542574 ,  0.56374648, -1.62306123],
       [-0.20688001, -1.3385417 , -0.72385784,  1.94077503],
       [-0.00860905, -0.9350957 ,  0.56895779,  1.2151524 ]])

### Masking

- Masking is done in decoders so that  words don't get context from words generated in the future.

In [43]:
mask = np.tril(np.ones((L,L)))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [44]:
mask[mask == 0] = -np.infty
mask[mask == 1] = 0
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [45]:
scaled + mask


array([[-0.64918657,        -inf,        -inf,        -inf],
       [ 0.3393209 ,  2.3542574 ,        -inf,        -inf],
       [-0.20688001, -1.3385417 , -0.72385784,        -inf],
       [-0.00860905, -0.9350957 ,  0.56895779,  1.2151524 ]])

### Softmax

$$
\text{softmax} = \frac{e^{x_i}}{\sum_j e^x_j}
$$

In [46]:
def softmax(x):
    #Compute softmax values for each sets of scores in x.
    return np.exp(x) / np.sum(np.exp(x), axis=0)

In [48]:
attention = softmax(scaled+mask)
attention

array([[0.14003465, 0.        , 0.        , 0.        ],
       [0.376304  , 0.94145988, 0.        , 0.        ],
       [0.21793491, 0.02344442, 0.21537662, 0.        ],
       [0.26572645, 0.0350957 , 0.78462338, 1.        ]])

In [51]:
new_v = np.matmul(attention,v)
new_v

array([[-0.06975494, -0.01611581,  0.11355793,  0.01523868, -0.07319666,
         0.16891261,  0.09257063, -0.03028847],
       [ 0.33613874, -2.18667348,  0.09259248,  0.30560536,  0.30305761,
        -0.49445597, -2.24299406, -0.18262909],
       [-0.37732892, -0.4753015 ,  0.13222328, -0.16876532,  0.12560082,
         0.31480883,  0.23405905,  0.14777957],
       [-0.8589682 , -1.26718809,  0.28576481, -1.09263352,  2.55954429,
         0.91356181,  0.87476643,  1.25265134]])

### Function

In [54]:
def softmax(x):
  return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T

def scaled_dot_product_attention(q, k, v, mask=None):
  d_k = q.shape[-1]
  scaled = np.matmul(q, k.T) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [53]:
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("New V\n", values)
print("Attention\n", attention)

Q
 [[ 0.27710041  0.03188016  1.44596882 -1.34205436  0.14321269  0.27645839
   0.2120455  -0.03424386]
 [-1.68562268  1.35760768  0.30797969 -2.00162779 -0.88145719 -1.16567825
  -0.42736467 -0.86933783]
 [ 0.97788076 -0.08125846 -1.12267725  1.27057746 -0.03749489  0.37929365
  -1.45820779  0.2642624 ]
 [ 0.74134556 -1.02667713 -0.48092183  1.50016204 -0.18407928  0.49035576
   0.43729467 -0.60433436]]
K
 [[-1.09750997  0.60600674 -1.34702431 -0.00707761 -0.32464676  0.87536513
   0.93560748  0.20497881]
 [-1.20536592  0.7792677   0.29531355 -0.80187422 -1.30155823 -0.62707862
   0.63349846 -0.30552488]
 [-0.61109375 -1.48863168  1.2227034   0.50665623 -2.35235382 -0.86774457
   0.35195043 -0.33173589]
 [ 0.30374713 -1.57928372 -0.63721208  1.94223431  0.32633268 -1.55898264
  -1.68295535  0.11461672]]
V
 [[-0.4981263  -0.11508445  0.8109274   0.10882082 -0.52270394  1.20622013
   0.66105516 -0.21629269]
 [ 0.5561423  -2.27664162 -0.22577993  0.28111198  0.53082793 -1.00733069
  -2.6