<a href="https://colab.research.google.com/github/DOLLARDEV05/AI-LAB/blob/main/Attention_is_All_You_Need/Transformers/single_head_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [166]:
import numpy as np

sentence = "My cat is lazy"

tokens = sentence.lower().split()
print("Tokens:", tokens)

vocab = {word: idx for idx, word in enumerate(set(tokens))}
print("Vocab:", vocab)

token_ids = np.array([vocab[word] for word in tokens])
print("Token IDs:", token_ids)


Tokens: ['my', 'cat', 'is', 'lazy']
Vocab: {'my': 0, 'cat': 1, 'lazy': 2, 'is': 3}
Token IDs: [0 1 3 2]


In [167]:
# let's create some embeddings!

d_model = 4

# this will create a random matrix of small float values with dimension being
# rows = length of the sentence/token list, columns being the dimension we set
# that is the d_model = 4 where this is a hyperparameter, and for single head
# attention this is d_model = Q = K = V
embedding = np.random.rand(len(tokens),d_model)

print(embedding)

[[0.48285766 0.29490128 0.39238593 0.51395501]
 [0.83773367 0.25914538 0.46919931 0.26134735]
 [0.19616678 0.77443857 0.53184078 0.44215419]
 [0.18016639 0.04269236 0.4988755  0.29426143]]


In [168]:
embedding_token = np.array([embedding[vocab[token]] for token in tokens])
print(embedding_token)

[[0.48285766 0.29490128 0.39238593 0.51395501]
 [0.83773367 0.25914538 0.46919931 0.26134735]
 [0.18016639 0.04269236 0.4988755  0.29426143]
 [0.19616678 0.77443857 0.53184078 0.44215419]]


In [169]:
# let's convert the maths of positional encoding into code successful

import math

for embedding in range(len(embedding_token)):
  for value in range(d_model):
    if value % 2 == 0:
      temp = math.sin(embedding/10000**((2*(value//2))/d_model))
      embedding_token[embedding,value]=embedding_token[embedding,value]+temp
    elif value % 2 == 1:
      temp = math.cos(embedding/10000**((2*(value//2))/d_model))
      embedding_token[embedding,value]=embedding_token[embedding,value]+temp
    else:
      print("error")
print(embedding_token)

[[ 0.48285766  1.29490128  0.39238593  1.51395501]
 [ 1.67920465  0.79944769  0.47919914  1.26129735]
 [ 1.08946382 -0.37345448  0.51887416  1.29406144]
 [ 0.33728679 -0.21555392  0.56183628  1.44170422]]


In [170]:
x = 0 // 2
y = 1 // 2
z = 2 // 2
p = 3 // 2
q = 4 // 2

# how positional encoding uses floor division to have similar values given to the
# sin and cos fuction to have pi/2 shifted sin and cos waves that also dipicts a
# circle idk if this is necessary to mention here

print(x,y,z,p,q)

# was just showcasing where i was stuck in positional encoding and why it matter

0 0 1 1 2


In [171]:
# let's work on making the weight matrix that will extract features from the
# embedding matrix ! W_Q,W_K,W_V


W_Q = np.random.rand(d_model,d_model)
W_K = np.random.rand(d_model,d_model)
W_V = np.random.rand(d_model,d_model)

print(W_Q)
print(W_K)
print(W_V)

[[0.92779748 0.46298449 0.57184952 0.98870354]
 [0.81959304 0.63445143 0.83566886 0.57654681]
 [0.70924652 0.16211957 0.34625257 0.48766587]
 [0.65388826 0.66949956 0.4178588  0.39708335]]
[[0.53056113 0.95352246 0.76062017 0.50652536]
 [0.56784912 0.5532496  0.00905934 0.24741354]
 [0.51644729 0.5815917  0.05461971 0.6075039 ]
 [0.51707516 0.71346211 0.0821721  0.71180803]]
[[0.6487974  0.38440954 0.33164392 0.33597358]
 [0.70734113 0.90493853 0.83437099 0.77436826]
 [0.39469849 0.42910212 0.8181538  0.27323931]
 [0.21832438 0.57199189 0.56816157 0.70284576]]


In [172]:
# multiplying the weighted matrix with the embedded_token to get Q K V matrices


Q = embedding_token @ W_Q
K = embedding_token @ W_K
V = embedding_token @ W_V

print(Q,"\n\n",K,"\n\n\n",V,"\n\n\n\n")

# i used the ** 1/2 or **0.5 which is fine for scalers and one time operations
# but its better to use np.sqrt() which is faster for sqrting whole arrays!
# print((Q @ K.T)/d_model**(1/2))
pre_softmax = (Q @ K.T)/np.sqrt(d_model)

print(pre_softmax)

[[2.77754196 2.12231323 2.12671465 2.01649383]
 [3.37780146 2.20678201 2.32129404 2.85568384]
 [1.91890237 1.21795933 1.03132156 1.62873022]
 [1.47746122 1.07570505 0.80971164 1.05566484]] 

 [[1.97696925 2.48517717 0.52483909 1.88097707]
 [2.24455301 3.22203958 1.41429657 2.23727085]
 [1.30305954 2.05725139 0.95996141 1.69578475]
 [1.09217742 1.55771605 0.40374956 1.48504772]] 


 [[1.71462115 2.39176478 2.42176797 2.33624999]
 [2.11945699 2.29603054 2.33261325 2.20066885]
 [0.9300056  1.0436896  1.20946931 1.12814218]
 [0.60287575 1.00032147 1.21079659 1.11321279]] 




[[ 7.83730052 10.29588275  6.72326893  5.09638983]
 [ 9.37591778 12.18196493  8.0061798   6.15237323]
 [ 5.21266888  6.66694823  4.37904867  3.41407718]
 [ 4.00243134  5.14459241  3.35284395  2.59196309]]


In [173]:
# let's recreate softmax from scratch using only numpy and convert these vectors
# into attention scores that our model can use!

# done with row max subtraction for each element (was checking row max each
# iterations which caused all values other than 1st getting zeroed)

# we dont sum all row values and then exponent rather we sum the exponent of
# individual values then divide


for vector in pre_softmax:
  row_max = np.max(vector,axis=0)
  for element in range(len(vector)):
    vector[element] = np.exp(vector[element] - row_max)
  vector /= np.sum(vector,axis=0)


softmax = pre_softmax.copy()
print(softmax)


[[0.07644691 0.89352898 0.0250924  0.00493171]
 [0.05605891 0.92746049 0.01424869 0.00223191]
 [0.17002748 0.72795512 0.07387244 0.02814496]
 [0.204089   0.63951959 0.10658777 0.04980363]]


In [174]:
# final attention mechanism done
attention = softmax @ V

print(attention)

[[2.05118301 2.26553483 2.30571401 2.17875832]
 [2.07642933 2.28066117 2.31910404 2.19056017]
 [1.92007198 2.18332683 2.23322913 2.11388479]
 [1.83452222 2.1175536  2.1752248  2.05986198]]
