<a href="https://colab.research.google.com/github/Carlos1729/Transformers_Code/blob/main/Self_Attention_for_Transformer_Neural_Networks_Madhukar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Self Attention in Transformers

## Generate Data

In [1]:
import numpy as np
import math

# Seed the global NumPy generator so a fresh "Restart & Run All" always
# produces the same Q, K and V. Outputs recorded before the seed was added
# will differ until the notebook is re-run.
SEED = 42
np.random.seed(SEED)

# L   : sequence length (4 tokens, e.g. the sentence "My name is Ajay")
# d_k : length of each query vector and each key vector
# d_v : length of each value vector
L, d_k, d_v = 4, 8, 8
q = np.random.randn(L, d_k)  # one query vector per token
k = np.random.randn(L, d_k)  # one key vector per token
v = np.random.randn(L, d_v)  # one value vector per token

In [3]:
# Row i of each matrix belongs to token i, so row 0 is the vector for the
# first word ("My" in the example sentence).
print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q
 [[-2.6879324   0.07854285 -0.64721039  0.36714661  0.32883658  0.62780827
  -3.39731454  0.94502965]
 [-0.70253061  1.22888519 -0.29549868 -1.0314771  -0.91978412  0.27206952
   0.89239788 -0.12553903]
 [-0.47566418  0.979592   -0.88340703  0.82766499  1.09016252 -0.23835494
   0.21535181  0.57839964]
 [-0.81785089 -0.51255382 -1.22857287 -0.3379141  -1.28545635 -0.25243038
  -0.12786602  0.11140824]]
K
 [[ 0.08154103 -0.95601322  1.12336335 -0.47340832 -0.68075519  0.48105207
  -0.0127978  -0.45762074]
 [-0.97462519 -2.40703059 -0.0684793  -0.22713621 -1.07703894  1.86346115
  -0.47046248  0.85431961]
 [ 0.07276018  2.00754453 -0.51775134 -0.43810541  2.01571525  0.27616779
   1.65457744  0.19981189]
 [-1.98762627  0.47380283  0.68431491 -0.65272383  1.20062299  0.08587655
   0.51920804 -0.15466817]]
V
 [[-0.49189828 -0.11500711 -0.4024976  -1.48276893 -0.12739775  0.11687734
   0.50024686 -0.70605724]
 [-1.37109215 -0.02025637  0.07030794  0.07045627 -0.14907388 -0.08567621
  -0.5

In [4]:
q.shape, k.shape

((4, 8), (4, 8))

## Self Attention

$$
\text{self attention} = softmax\bigg(\frac{Q.K^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention}.V
$$

In [6]:
# Raw attention scores: entry [i, j] is the dot product of query i with key j,
# i.e. how strongly token i "looks at" token j before scaling and softmax.
# q and k are random draws here, so which token scores highest is illustrative
# only and carries no linguistic meaning.
np.matmul(q, k.T)

array([[-1.50596321,  5.61299273, -4.45972156,  3.23592155],
       [-0.27270216, -1.04819476,  2.6933872 ,  1.85148057],
       [-3.48373827, -3.24730067,  4.63026447,  1.57556829],
       [-0.09253997,  3.26114188, -3.15447501, -0.88608037]])

In [7]:
# Why divide by sqrt(d_k)? Each score is a sum of d_k products, so for
# unit-variance q and k the variance of q @ k.T is roughly d_k times larger
# than the variance of q or k. Compare the three variances:
q.var(), k.var(), np.matmul(q, k.T).var()

(1.0145062333184773, 1.055915255518609, 8.911587506672388)

In [8]:
# Dividing the scores by sqrt(d_k) divides their variance by d_k, bringing it
# back to the same order as the variance of q and k.
scaled = np.matmul(q, k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var()

(1.0145062333184773, 1.055915255518609, 1.1139484383340483)

Notice the reduction in variance of the product

In [9]:
scaled

array([[-0.5324384 ,  1.98449261, -1.57674968,  1.14407104],
       [-0.09641477, -0.37059281,  0.95225618,  0.65459723],
       [-1.23168748, -1.14809416,  1.6370457 ,  0.55704751],
       [-0.03271782,  1.15298777, -1.11527533, -0.31327672]])

## Masking

- This is to ensure words don't get context from words generated in the future.
- Not required in the encoder, but required in the decoder.

In [10]:
# Lower-triangular matrix of ones: row i has 1s in columns 0..i, marking the
# positions token i may attend to (itself and earlier tokens).
mask = np.tril(np.ones( (L, L) ))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [11]:
# Turn the 0/1 lower-triangular matrix into an additive mask:
# 0 where attention is allowed, -inf where a token would look ahead.
# The mask is rebuilt from its shape alone, so re-running this cell gives the
# same result (the previous in-place rewrite turned every entry into -inf on
# a second run). np.inf replaces np.infty, which was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones_like(mask)) == 1, 0.0, -np.inf)

In [12]:
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [13]:
# Adding the mask leaves allowed scores unchanged (+0) and turns every future
# position into -inf, which exp() inside softmax maps to a weight of exactly 0.
scaled + mask

array([[-0.5324384 ,        -inf,        -inf,        -inf],
       [-0.09641477, -0.37059281,        -inf,        -inf],
       [-1.23168748, -1.14809416,  1.6370457 ,        -inf],
       [-0.03271782,  1.15298777, -1.11527533, -0.31327672]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [22]:
def softmax(x):
  """Normalise each row of ``x`` into weights that sum to 1.

  Prints the element-wise exponentials and their per-row sums (each followed
  by its shape) so both intermediate steps can be inspected, then returns the
  exponentials divided, row by row, by those sums.
  """
  numerators = np.exp(x)
  row_totals = np.sum(numerators, axis=-1)
  print(numerators, numerators.shape)
  print(row_totals, row_totals.shape)
  # row_totals holds one entry per row. Transposing puts the rows on the last
  # axis so broadcasting pairs each row with its own total; the outer
  # transpose restores the original layout.
  return np.transpose(np.transpose(numerators) / row_totals)

In [20]:
def softmax(x):
  """Row-wise softmax of ``x`` along its last axis.

  Prints the element-wise exponentials and their row sums (each followed by
  its shape) for inspection, then returns the normalised weights.

  The row sums hold one entry per row, so they have to be lined up with the
  row axis before dividing. A plain ``f / s`` broadcasts them across the
  columns instead, dividing entry [i, j] by the sum of row j, and the rows of
  the result no longer sum to 1.
  """
  f = np.exp(x)
  s = np.sum(f, axis=-1)
  print(f, f.shape)
  print(s, s.shape)
  # Append a length-1 axis so every row is divided by its own sum.
  return f / np.expand_dims(s, axis=-1)

In [25]:
# np.exp(x): This computes the element-wise exponential of the input matrix x. Each element of the matrix is exponentiated.
# np.sum(np.exp(x), axis=-1): This computes the sum of the exponentiated elements along the last axis (axis=-1) for each row. In this context, it computes the sum for each row independently.
# (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T: This is element-wise division, where the exponentiated values from step 1 are divided by the sum computed in step 2 for each row.
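# Why the transposes: the row sums have shape (L,), and NumPy broadcasting aligns that with the LAST axis (the columns). Transposing first puts the rows on the last axis so each row is divided by its own sum; the outer .T then restores the original layout.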

In [23]:
# Softmax over each row of the masked scores gives the attention weights.
attention = softmax(scaled + mask)

[[0.58717146 0.         0.         0.        ]
 [0.90808729 0.69032498 0.         0.        ]
 [0.29179976 0.3172408  5.13996208 0.        ]
 [0.96781162 3.16764297 0.327825   0.73104759]] (4, 4)
[0.58717146 1.59841226 5.74900264 5.19432719] (4,)


In [24]:
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.56811832, 0.43188168, 0.        , 0.        ],
       [0.05075659, 0.05518189, 0.89406153, 0.        ],
       [0.18632088, 0.60982739, 0.06311212, 0.14073961]])

In [27]:
attention.shape, v.shape

((4, 4), (4, 8))

In [28]:
# Each output row is a weighted sum of the rows of v, with the weights taken
# from the matching row of the attention matrix.
new_v = np.matmul(attention, v)
new_v

array([[-0.49189828, -0.11500711, -0.4024976 , -1.48276893, -0.12739775,
         0.11687734,  0.50024686, -0.70605724],
       [-0.87160601, -0.074086  , -0.19830155, -0.81195942, -0.13675927,
         0.02939817,  0.06147253,  0.07474254],
       [-1.19979205, -0.02951362, -0.50825024, -0.20060723, -1.18242406,
         0.11303526, -0.91747644, -1.15437745],
       [-1.02927893, -0.23884111, -0.05537951, -0.16563054, -0.29047686,
        -0.21558463, -0.13387551,  0.68752867]])

In [29]:
v

array([[-0.49189828, -0.11500711, -0.4024976 , -1.48276893, -0.12739775,
         0.11687734,  0.50024686, -0.70605724],
       [-1.37109215, -0.02025637,  0.07030794,  0.07045627, -0.14907388,
        -0.08567621, -0.51571271,  1.10184482],
       [-1.22940703, -0.02523145, -0.54996278, -0.14454805, -1.30609759,
         0.12508171, -1.02275876, -1.31908401],
       [-0.16987388, -1.44570233,  0.08134034,  0.54566615, -0.66363745,
        -1.37138219,  1.07974057,  1.63704388]])

# Function

In [30]:
def softmax(x):
  """Numerically stable softmax along the last axis of ``x``.

  Args:
    x: array-like of scores with any number of leading (batch) axes.

  Returns:
    Array with the same shape as ``x`` whose entries along the last axis are
    non-negative and sum to 1. Entries equal to -inf get a weight of 0.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; np.max would reject an empty last axis.
    return np.exp(x)
  # Subtracting the per-row maximum does not change the result but keeps
  # exp() from overflowing on large scores.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exp_x = np.exp(shifted)
  # keepdims lets the sums broadcast against the rows for any number of
  # leading axes, which the transpose trick could only do for 1-D/2-D input.
  return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute softmax(q k^T / sqrt(d_k) + mask) v.

  Args:
    q: queries, shape (..., L_q, d_k).
    k: keys, shape (..., L_k, d_k).
    v: values, shape (..., L_k, d_v).
    mask: optional additive mask broadcastable to (..., L_q, L_k); use 0 for
      allowed positions and -inf for positions that must be ignored.

  Returns:
    Tuple ``(out, attention)``: ``out`` has shape (..., L_q, d_v) and
    ``attention`` has shape (..., L_q, L_k) with rows that sum to 1.
  """
  d_k = q.shape[-1]
  # Swap only the last two axes so leading batch/head axes stay in place;
  # for 2-D input this is the same as k.T.
  scaled = np.matmul(q, np.swapaxes(k, -1, -2)) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [31]:
# Run the whole pipeline in one call with the additive mask, then print the
# inputs next to the resulting values and attention weights.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("New V\n", values)
print("Attention\n", attention)

Q
 [[-2.6879324   0.07854285 -0.64721039  0.36714661  0.32883658  0.62780827
  -3.39731454  0.94502965]
 [-0.70253061  1.22888519 -0.29549868 -1.0314771  -0.91978412  0.27206952
   0.89239788 -0.12553903]
 [-0.47566418  0.979592   -0.88340703  0.82766499  1.09016252 -0.23835494
   0.21535181  0.57839964]
 [-0.81785089 -0.51255382 -1.22857287 -0.3379141  -1.28545635 -0.25243038
  -0.12786602  0.11140824]]
K
 [[ 0.08154103 -0.95601322  1.12336335 -0.47340832 -0.68075519  0.48105207
  -0.0127978  -0.45762074]
 [-0.97462519 -2.40703059 -0.0684793  -0.22713621 -1.07703894  1.86346115
  -0.47046248  0.85431961]
 [ 0.07276018  2.00754453 -0.51775134 -0.43810541  2.01571525  0.27616779
   1.65457744  0.19981189]
 [-1.98762627  0.47380283  0.68431491 -0.65272383  1.20062299  0.08587655
   0.51920804 -0.15466817]]
V
 [[-0.49189828 -0.11500711 -0.4024976  -1.48276893 -0.12739775  0.11687734
   0.50024686 -0.70605724]
 [-1.37109215 -0.02025637  0.07030794  0.07045627 -0.14907388 -0.08567621
  -0.5