**For a 4-word sentence (e.g. MY NAME IS ABIR), how will a Transformer encode and decode it?** (SINGLE-HEAD ATTENTION)

In [None]:
import numpy as np
import math

# Problem dimensions: L is the sequence length (4 words in the example
# sentence), d_k is the width of each query/key vector and d_v the width of
# each value vector.
L, d_k, d_v = 4, 8, 8

# Seed the global NumPy RNG so that Restart & Run All always regenerates the
# same q, k and v. NOTE: outputs saved before the seed was added came from an
# unseeded run and will differ until the notebook is re-executed.
np.random.seed(42)
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

In [None]:
# Show the randomly generated query, key and value matrices, one after another.
for label, matrix in (("q: ", q), ("k: ", k), ("v: ", v)):
  print(label, matrix)

q:  [[-1.79932719 -0.25184335  0.54347279 -0.95999358  0.25094355 -0.88107778
  -1.30744487 -1.18706659]
 [ 0.92948492  0.44894292  0.96960645  0.25047715  0.74166373  0.33778092
  -0.79207137 -1.79199775]
 [-0.22163177 -0.23482594 -0.45257996  0.48404388  2.51667063 -0.3860275
  -0.05443547  0.52163572]
 [ 1.98255471  2.36259526  0.07288554  1.21674816 -0.45803133  0.59609164
  -0.67089895 -0.06279828]]
k:  [[ 0.39624406  0.27260912 -0.14608955  0.88020554 -0.03726402 -1.80458518
  -0.61451359 -1.37704053]
 [-0.03324017  1.3226862   2.37472845 -1.48966041  0.81140608  0.92788884
   0.16833299 -0.79401723]
 [ 1.84084197  0.66160859 -0.15261633 -0.98547081 -0.32006183 -1.73026909
  -0.35642909 -0.65857833]
 [-0.3290325  -0.98616798  0.08136521 -1.06982481  0.48279425 -0.53656397
  -0.91236698  0.84537273]]
v:  [[-0.1386253  -0.02008414  1.25762313 -0.76065198 -1.39874583  0.3908417
  -1.10553442 -0.24095311]
 [-0.26801562 -0.99225188 -0.77689387 -0.65760615 -0.00371309  1.27382244
  -0.

In [None]:
# Raw attention table: entry (i, j) is the dot product of query i with key j.
q @ k.T

array([[ 2.3126953 ,  2.55590504,  0.07617601,  2.69490619],
       [ 2.88671115,  4.69709943,  2.25390168, -1.55305466],
       [ 0.2583154 , -0.83854925, -1.43298628,  1.66263976],
       [ 1.93010345,  1.53799936,  3.3981748 , -4.25997583]])

In [None]:
# Variance of q, of k, and of the unscaled attention table, side by side.
(q.var(), k.var(), (q @ k.T).var())

(1.066726941348061, 0.9622066881026449, 4.85960643856248)

In [None]:
# Scale the attention table by 1/sqrt(d_k), which shrinks its variance by a
# factor of d_k, then compare the variances again.
scaled = (q @ k.T) / math.sqrt(d_k)
(q.var(), k.var(), scaled.var())

(1.066726941348061, 0.9622066881026449, 0.60745080482031)

In [None]:
# Display the scaled attention table (q @ k.T divided by sqrt(d_k)).
scaled

array([[ 0.81766126,  0.90364889,  0.02693228,  0.95279322],
       [ 1.02060652,  1.66067543,  0.79687458, -0.54908774],
       [ 0.09132829, -0.29647193, -0.50663716,  0.58783192],
       [ 0.68239462,  0.54376489,  1.20143622, -1.5061289 ]])

In [None]:
# Sequential word selection: an L x L lower-triangular matrix of ones, so
# row i has ones in columns 0..i and zeros after that.
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [None]:
# Transform the 0/1 lower-triangular pattern into an additive mask:
# ones become 0 (leave the score untouched) and zeros become -inf.
# The pattern is rebuilt from L first so this cell is idempotent; editing the
# existing mask in place would turn every entry into -inf on a second run.
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
mask = np.tril(np.ones((L, L)))
mask[mask == 0] = -np.inf
mask[mask == 1] = 0

In [None]:
# Display the additive mask: 0 on and below the diagonal, -inf above it.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [None]:
# Soft Max
def softmax(x):
  """Normalise ``x`` into probabilities along its last axis.

  Each slice along the last axis is exponentiated and divided by its own
  sum, so every slice of the result is non-negative and sums to 1.
  ``-inf`` entries map to exactly 0.

  The per-slice maximum is subtracted before exponentiating. This does not
  change the mathematical result, but it stops ``np.exp`` from overflowing
  to ``inf`` (and the ratio from becoming ``nan``) when scores are large.
  A slice made up entirely of ``-inf`` still yields ``nan``.

  Args:
    x: array-like of scores with at least one dimension.

  Returns:
    ndarray with the same shape as ``x``.
  """
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims retains the reduced axis, so the division broadcasts correctly
  # for 1-D, 2-D and batched (3-D and higher) inputs alike.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [None]:
# Add the mask to the scaled scores, then normalise each row with softmax.
# The 0 values in the result come from MASKING: a word is not allowed to
# attend to (i.e. predict from) the words that come after it.
attention = softmax(mask + scaled)
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.34523096, 0.65476904, 0.        , 0.        ],
       [0.44873691, 0.30448949, 0.2467736 , 0.        ],
       [0.27299664, 0.23765731, 0.45874802, 0.03059804]])

In [None]:
# Attention (value) vectors comparison: weight the value vectors by the
# attention matrix, then print the result next to the original values.
new_v = attention @ v
for label, matrix in (("new_v:", new_v), ("v:", v)):
  print(label, matrix)

new_v: [[-0.1386253  -0.02008414  1.25762313 -0.76065198 -1.39874583  0.3908417
  -1.10553442 -0.24095311]
 [-0.22334608 -0.65662948 -0.07451561 -0.69318076 -0.48532159  0.96899015
  -0.65948706  0.85829481]
 [-0.01952047 -0.40218255  0.26895135 -0.86942419 -0.31913154  0.5442779
  -0.68829609 -0.09134931]
 [ 0.12487887 -0.47192682  0.08600354 -1.01171891  0.25966959  0.39040574
  -0.49199415 -0.52287967]]
v: [[-0.1386253  -0.02008414  1.25762313 -0.76065198 -1.39874583  0.3908417
  -1.10553442 -0.24095311]
 [-0.26801562 -0.99225188 -0.77689387 -0.65760615 -0.00371309  1.27382244
  -0.42430588  1.43787997]
 [ 0.50367526 -0.36892031 -0.23841512 -1.3285757   1.25486654 -0.07688316
  -0.25531624 -1.7061958 ]
 [-0.15167785 -2.00622477  1.19888805 -1.25161948  2.18110049  0.5308857
   0.90784391 -0.52647211]]
