In [3]:
import numpy as np

# L   : sequence length (number of tokens)
# d_k : width of each query / key vector
# d_v : width of each value vector
L, d_k, d_v = 4, 8, 8

# np.random.randn draws samples from the standard normal distribution
# (mean 0, standard deviation 1), so the values are centred on 0 and can be
# positive or negative.

q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
# The value matrix uses its own width d_v (previously d_k was used here and
# d_v was never read; the two happen to be equal in this example).
v = np.random.randn(L, d_v)


In [7]:
# Show the three randomly generated input matrices, one labelled block each.
for label, matrix in (("Query\n", q), ("Key\n", k), ("Value\n", v)):
  print(label, matrix)

Query
 [[ 1.87604879  1.0894955   0.92131853 -0.69520508 -0.15763455 -1.08497782
   0.4037962  -0.74904617]
 [-0.46314594  0.23445128 -0.3592481  -0.88368339  0.1536767   0.28977602
   0.95884298  0.83218026]
 [-1.68177452 -0.33330035  0.38734617 -1.834288    1.17236142 -0.11415349
  -1.77938838 -1.73784345]
 [-0.22117961 -0.40622376 -1.08984183 -0.14067862  0.5211167  -0.53582774
  -3.31447763 -0.88837207]]
Key
 [[ 0.92951036 -1.36709148 -0.59088346  0.82162489 -0.28907528 -0.05373544
   1.1508031  -0.44655926]
 [-0.04507338  0.66035027 -0.03346834 -0.97592705 -0.7210328  -1.01253913
  -0.64381646  0.64713899]
 [-1.14807747 -0.78885182 -0.29219078  0.14101149 -1.5610208  -0.45963644
   0.53587505  1.31900619]
 [ 0.50306034  1.69757601  1.65403951 -0.94702131 -0.7028133   1.1977334
  -0.65861741 -0.31729829]]
Value
 [[-0.77058434 -0.18086082  0.34717439 -0.44675901  0.34852945 -1.29106083
   0.30556096  0.8858823 ]
 [-0.33032634  1.12268768  0.73350555  0.03797777 -0.45899746 -1.929759

Self-Attention Mechanism

In [8]:
# Raw attention scores: dot product of every query row with every key row.
q @ k.T

array([[ 0.04183054,  1.7500578 , -3.40737859,  3.75853578],
       [-0.5929715 ,  0.56713138,  1.56552638, -0.2488267 ],
       [-4.44799011,  0.92412284, -3.20147021,  1.72863325],
       [-2.66130864,  1.64130598, -2.64211608, -1.01345311]])

In [9]:
# Variance of the inputs next to the variance of the unscaled dot products.
np.var(q), np.var(k), np.var(q @ k.T)

(1.1171506359230339, 0.7808630761759632, 5.075968774921185)

In [18]:
import math

# Divide the raw scores by sqrt(d_k) so their variance comes back down to
# the same order of magnitude as that of q and k.
scaled = (q @ k.T) / math.sqrt(d_k)
scaled

array([[ 0.01478933,  0.61873887, -1.20469025,  1.32884307],
       [-0.20964708,  0.20051122,  0.55349716, -0.08797352],
       [-1.57260198,  0.32672676, -1.13189065,  0.61116414],
       [-0.94091469,  0.58028929, -0.9341291 , -0.35830978]])

In [19]:
# Same comparison as before, now using the scaled scores.
np.var(q), np.var(k), np.var(scaled)

(1.1171506359230339, 0.7808630761759632, 0.6344960968651479)

**Masking**
1. Used in the decoder so that a position cannot look at future words while predicting.
2. Not essential in the encoder.

In [26]:
# L x L lower-triangular matrix of ones: row i has ones in columns 0..i
# and zeros to the right of the diagonal.
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [27]:
# Build the additive causal mask: 0 where attention is allowed (on and below
# the diagonal) and -inf where it is blocked (future positions).
#
# The mask is rebuilt from L instead of being rewritten in place, so running
# this cell more than once always gives the same result (the in-place version
# turned every entry into -inf on a second run). np.inf is used because the
# np.infty alias was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [30]:
# Adding the mask leaves the allowed scores unchanged and turns the score of
# every future position into -inf, so future words cannot be seen.
np.add(scaled, mask)

array([[ 0.01478933,        -inf,        -inf,        -inf],
       [-0.20964708,  0.20051122,        -inf,        -inf],
       [-1.57260198,  0.32672676, -1.13189065,        -inf],
       [-0.94091469,  0.58028929, -0.9341291 , -0.35830978]])

**Softmax**
- Converts a vector of scores into a probability distribution (non-negative values that sum to 1).

In [31]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: array-like of scores. For a 2-D input each row is normalised
      independently; inputs with more dimensions are normalised over
      their last axis.

  Returns:
    Array of the same shape as ``x`` whose entries along the last axis are
    non-negative and sum to 1. Entries equal to ``-inf`` receive weight 0.
    A row made up entirely of ``-inf`` yields ``nan`` (0 / 0).
  """
  x = np.asarray(x)
  # Shift by the row maximum before exponentiating: the result is
  # mathematically unchanged, but exp() can no longer overflow to inf and
  # produce inf / inf = nan for large scores.
  row_max = np.max(x, axis=-1, keepdims=True)
  # If the maximum is not finite (e.g. a fully masked row), skip the shift
  # so we never evaluate (-inf) - (-inf).
  row_max = np.where(np.isfinite(row_max), row_max, 0.0)
  exp_x = np.exp(x - row_max)
  # keepdims lets the row sums broadcast against exp_x for any number of
  # leading dimensions, replacing the 2-D-only double transpose.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


In [33]:
# Normalise the masked scores row by row; masked (-inf) entries get weight 0.
attention = softmax(np.add(scaled, mask))
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.39887416, 0.60112584, 0.        , 0.        ],
       [0.10828112, 0.72347036, 0.16824852, 0.        ],
       [0.11939958, 0.54657943, 0.12021253, 0.21380846]])

In [35]:
# Each output row is the attention-weighted average of the rows of v.
new_v = attention @ v
new_v

array([[-0.77058434, -0.18086082,  0.34717439, -0.44675901,  0.34852945,
        -1.29106083,  0.30556096,  0.8858823 ],
       [-0.50593388,  0.60273586,  0.57940803, -0.1553712 , -0.13689584,
        -1.674999  ,  0.89447286,  0.15722183],
       [-0.48473845,  0.72910927,  0.70264443, -0.04773369, -0.55274605,
        -1.85728125,  1.01094781, -0.1081702 ],
       [-0.65254383,  0.11826537,  0.65652914,  0.10464037, -0.34560109,
        -1.51266715,  0.83387896,  0.10833803]])

To sum up, the steps above can be wrapped in a single function:

In [39]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Args:
    x: array-like of scores. For a 2-D input each row is normalised
      independently; inputs with more dimensions are normalised over
      their last axis.

  Returns:
    Array of the same shape as ``x`` whose entries along the last axis are
    non-negative and sum to 1. Entries equal to ``-inf`` receive weight 0.
    A row made up entirely of ``-inf`` yields ``nan`` (0 / 0).
  """
  x = np.asarray(x)
  # Shift by the row maximum before exponentiating: the result is
  # mathematically unchanged, but exp() can no longer overflow to inf and
  # produce inf / inf = nan for large scores.
  row_max = np.max(x, axis=-1, keepdims=True)
  # If the maximum is not finite (e.g. a fully masked row), skip the shift
  # so we never evaluate (-inf) - (-inf).
  row_max = np.where(np.isfinite(row_max), row_max, 0.0)
  exp_x = np.exp(x - row_max)
  # keepdims lets the row sums broadcast against exp_x for any number of
  # leading dimensions, replacing the 2-D-only double transpose.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(q,k,v,mask = None):
  """Compute softmax(q k^T / sqrt(d_k) + mask) v.

  Args:
    q: query array whose last axis has length d_k.
    k: key array whose last axis has length d_k.
    v: value array with one row per key.
    mask: optional array added to the scaled scores before the softmax
      (0 to keep a position, -inf to block it). ``None`` applies no mask.

  Returns:
    A tuple ``(out, attention)``: ``attention`` holds the softmax weights
    and ``out`` is ``attention`` multiplied by ``v``.
  """
  d_k = q.shape[-1]
  # Transpose only the last two axes of k. For 2-D input this is exactly
  # k.T; unlike k.T it does not reverse leading (batch / head) axes.
  k_t = np.swapaxes(k, -1, -2)
  scaled = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [41]:
# Run the full attention computation with the causal mask, then print the
# inputs followed by the two results.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for label, matrix in (
    ("Query\n", q),
    ("Key\n", k),
    ("Value\n", v),
    ("values\n", values),
    ("attention\n", attention),
):
  print(label, matrix)

Query
 [[ 1.87604879  1.0894955   0.92131853 -0.69520508 -0.15763455 -1.08497782
   0.4037962  -0.74904617]
 [-0.46314594  0.23445128 -0.3592481  -0.88368339  0.1536767   0.28977602
   0.95884298  0.83218026]
 [-1.68177452 -0.33330035  0.38734617 -1.834288    1.17236142 -0.11415349
  -1.77938838 -1.73784345]
 [-0.22117961 -0.40622376 -1.08984183 -0.14067862  0.5211167  -0.53582774
  -3.31447763 -0.88837207]]
Key
 [[ 0.92951036 -1.36709148 -0.59088346  0.82162489 -0.28907528 -0.05373544
   1.1508031  -0.44655926]
 [-0.04507338  0.66035027 -0.03346834 -0.97592705 -0.7210328  -1.01253913
  -0.64381646  0.64713899]
 [-1.14807747 -0.78885182 -0.29219078  0.14101149 -1.5610208  -0.45963644
   0.53587505  1.31900619]
 [ 0.50306034  1.69757601  1.65403951 -0.94702131 -0.7028133   1.1977334
  -0.65861741 -0.31729829]]
Value
 [[-0.77058434 -0.18086082  0.34717439 -0.44675901  0.34852945 -1.29106083
   0.30556096  0.8858823 ]
 [-0.33032634  1.12268768  0.73350555  0.03797777 -0.45899746 -1.929759