# Self Attention in Transformers

## Generate Data

In [1]:
import numpy as np
import math

L, d_k, d_v = 4, 8, 8
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

In [2]:
print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q
 [[ 0.88888321 -0.82574469  0.46908683 -0.45678327 -1.38265852  0.1694321
  -1.59119032 -1.00098324]
 [ 0.28561517  0.74317068 -0.12289067  0.80702098  0.40739247 -1.09360065
  -1.434       0.29304955]
 [-0.38901001 -0.25273029  0.68103981  0.79027909  0.41875152  0.30114817
  -2.40908679  0.54390039]
 [-0.824473   -1.08835662 -0.09119343 -0.33749289  0.64509967 -0.98941865
  -0.51312064 -0.81792968]]
K
 [[-0.81240097  1.5552482   0.35304198 -0.43006368 -0.11817176 -0.40379548
  -1.51102001 -0.58216536]
 [-0.66054204  0.2284632   0.37370332  0.52223403  0.88503708  0.02266103
  -0.24734445 -0.74583914]
 [-0.09362249  0.3931177  -1.09009438 -0.27058443  0.51030466 -0.76357283
   0.28079781  0.21337443]
 [ 1.61403764 -0.89593099 -0.47805923 -0.55238847 -0.71730832 -0.10996484
  -0.54357201 -0.07042023]]
V
 [[ 0.25071438  0.48167573 -0.83957978  1.01643075 -1.23111387 -0.18240207
  -0.20065327  0.95013036]
 [ 0.82162369  0.10292045 -0.05746185  0.52864418 -0.66952928 -1.78059581
   0.77

## Self Attention

$$
\text{self attention} = softmax\bigg(\frac{Q.K^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention}.V
$$ 

In [3]:
np.matmul(q, k.T)

array([[ 1.43771917, -0.91876546, -2.29092261,  4.11114863],
       [ 2.92297295,  0.82855493,  0.88381188, -0.00499881],
       [ 2.9759906 ,  1.43408237, -1.59583712, -0.22584542],
       [ 0.66488242,  1.37109747,  0.60614886, -0.14303226]])

In [4]:
# Why we need sqrt(d_k) in denominator
q.var(), k.var(), np.matmul(q, k.T).var()

(0.6897827402551442, 0.4723119359876851, 2.6379631225244347)

In [5]:
scaled = np.matmul(q, k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var()

(0.6897827402551442, 0.4723119359876851, 0.3297453903155543)

Notice the reduction in variance of the product

In [6]:
scaled

array([[ 0.50831049, -0.32483264, -0.80996346,  1.45351054],
       [ 1.033427  ,  0.2929384 ,  0.31247469, -0.00176735],
       [ 1.05217157,  0.50702468, -0.56421362, -0.07984841],
       [ 0.23507143,  0.48475616,  0.21430599, -0.05056954]])

## Masking

- This is to ensure words don't get context from words generated in the future. 
- Not required in the encoders, but required int he decoders

In [7]:
mask = np.tril(np.ones( (L, L) ))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [8]:
mask[mask == 0] = -np.infty
mask[mask == 1] = 0

In [9]:
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [10]:
scaled + mask

array([[ 0.50831049,        -inf,        -inf,        -inf],
       [ 1.033427  ,  0.2929384 ,        -inf,        -inf],
       [ 1.05217157,  0.50702468, -0.56421362,        -inf],
       [ 0.23507143,  0.48475616,  0.21430599, -0.05056954]])

## Softmax

$$
\text{softmax} = \frac{e^{x_i}}{\sum_j e^x_j}
$$

In [11]:
def softmax(x):
  return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T

In [12]:
attention = softmax(scaled + mask)

In [13]:
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.67710269, 0.32289731, 0.        , 0.        ],
       [0.56231205, 0.32600414, 0.11168381, 0.        ],
       [0.24909071, 0.31973798, 0.24397157, 0.18719974]])

In [14]:
new_v = np.matmul(attention, v)
new_v

array([[ 0.25071438,  0.48167573, -0.83957978,  1.01643075, -1.23111387,
        -0.18240207, -0.20065327,  0.95013036],
       [ 0.43505946,  0.35937667, -0.587036  ,  0.85892578, -1.04977971,
        -0.69845453,  0.11539511,  0.51382558],
       [ 0.33415551,  0.14521868, -0.47928027,  0.70021034, -1.00312728,
        -0.53053884,  0.25303442,  0.41540167],
       [ 0.12311842, -0.4273589 , -0.24232009,  0.42247033, -0.91872176,
        -0.11387245,  0.31648533,  0.04656133]])

In [15]:
v

array([[ 0.25071438,  0.48167573, -0.83957978,  1.01643075, -1.23111387,
        -0.18240207, -0.20065327,  0.95013036],
       [ 0.82162369,  0.10292045, -0.05746185,  0.52864418, -0.66952928,
        -1.78059581,  0.77813588, -0.40108804],
       [-0.66864595, -1.42532636,  0.10349183, -0.39111405, -0.82901721,
         1.36554832,  1.00452033,  0.1064458 ],
       [-0.20783097, -1.24203338, -0.21402217,  0.51111155, -1.04557928,
         0.89600215, -0.68059817, -0.46919933]])

# Function

In [16]:
def softmax(x):
  return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T

def scaled_dot_product_attention(q, k, v, mask=None):
  d_k = q.shape[-1]
  scaled = np.matmul(q, k.T) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [17]:
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("New V\n", values)
print("Attention\n", attention)

Q
 [[ 0.88888321 -0.82574469  0.46908683 -0.45678327 -1.38265852  0.1694321
  -1.59119032 -1.00098324]
 [ 0.28561517  0.74317068 -0.12289067  0.80702098  0.40739247 -1.09360065
  -1.434       0.29304955]
 [-0.38901001 -0.25273029  0.68103981  0.79027909  0.41875152  0.30114817
  -2.40908679  0.54390039]
 [-0.824473   -1.08835662 -0.09119343 -0.33749289  0.64509967 -0.98941865
  -0.51312064 -0.81792968]]
K
 [[-0.81240097  1.5552482   0.35304198 -0.43006368 -0.11817176 -0.40379548
  -1.51102001 -0.58216536]
 [-0.66054204  0.2284632   0.37370332  0.52223403  0.88503708  0.02266103
  -0.24734445 -0.74583914]
 [-0.09362249  0.3931177  -1.09009438 -0.27058443  0.51030466 -0.76357283
   0.28079781  0.21337443]
 [ 1.61403764 -0.89593099 -0.47805923 -0.55238847 -0.71730832 -0.10996484
  -0.54357201 -0.07042023]]
V
 [[ 0.25071438  0.48167573 -0.83957978  1.01643075 -1.23111387 -0.18240207
  -0.20065327  0.95013036]
 [ 0.82162369  0.10292045 -0.05746185  0.52864418 -0.66952928 -1.78059581
   0.77