<a href="https://colab.research.google.com/github/DOLLARDEV05/AI-LAB/blob/main/Attention_is_All_You_Need/Transformers/single_head_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import numpy as np

sentence = "My cat is lazy"

tokens = sentence.lower().split()
print("Tokens:", tokens)

vocab = {word: idx for idx, word in enumerate(set(tokens))}
print("Vocab:", vocab)

token_ids = np.array([vocab[word] for word in tokens])
print("Token IDs:", token_ids)


Tokens: ['my', 'cat', 'is', 'lazy']
Vocab: {'lazy': 0, 'cat': 1, 'is': 2, 'my': 3}
Token IDs: [3 1 2 0]


In [15]:
# let's create some embeddings!

d_model = 4

# this will create a random matrix of small float values with dimension being
# rows = length of the sentence/token list, columns being the dimension we set
# that is the d_model = 4 where this is a hyperparameter, and for single head
# attention this is d_model = Q = K = V
embedding = np.random.rand(len(tokens),d_model)

print(embedding)

[[0.79093974 0.02522849 0.64786399 0.95843736]
 [0.70727585 0.82519675 0.53106664 0.97615182]
 [0.74865837 0.70274123 0.24756134 0.11911246]
 [0.85118147 0.94963237 0.08009344 0.68833727]]


In [16]:
embedding_token = np.array([embedding[vocab[token]] for token in tokens])
print(embedding_token)

[[0.85118147 0.94963237 0.08009344 0.68833727]
 [0.70727585 0.82519675 0.53106664 0.97615182]
 [0.74865837 0.70274123 0.24756134 0.11911246]
 [0.79093974 0.02522849 0.64786399 0.95843736]]


In [17]:
# let's convert the maths of positional encoding into code successful

import math

for embedding in range(len(embedding_token)):
  for value in range(d_model):
    if value % 2 == 0:
      temp = math.sin(embedding/10000**((2*(value//2))/d_model))
      embedding_token[embedding,value]=embedding_token[embedding,value]+temp
    elif value % 2 == 1:
      temp = math.cos(embedding/10000**((2*(value//2))/d_model))
      embedding_token[embedding,value]=embedding_token[embedding,value]+temp
    else:
      print("error")
print(embedding_token)

[[ 0.85118147  1.94963237  0.08009344  1.68833727]
 [ 1.54874683  1.36549906  0.54106647  1.97610182]
 [ 1.65795579  0.28659439  0.26756001  1.11891246]
 [ 0.93205975 -0.96476401  0.67785949  1.9579874 ]]


In [18]:
x = 0 // 2
y = 1 // 2
z = 2 // 2
p = 3 // 2
q = 4 // 2

# how positional encoding uses floor division to have similar values given to the
# sin and cos fuction to have pi/2 shifted sin and cos waves that also dipicts a
# circle idk if this is necessary to mention here

print(x,y,z,p,q)

# was just showcasing where i was stuck in positional encoding and why it matter

0 0 1 1 2


In [19]:
# let's work on making the weight matrix that will extract features from the
# embedding matrix ! W_Q,W_K,W_V


W_Q = np.random.rand(d_model,d_model)
W_K = np.random.rand(d_model,d_model)
W_V = np.random.rand(d_model,d_model)

print(W_Q)
print(W_K)
print(W_V)

[[0.81696593 0.56279898 0.66859006 0.38688357]
 [0.21019105 0.51028052 0.41529454 0.63794461]
 [0.93625725 0.28291135 0.43364866 0.48877707]
 [0.98563605 0.37907947 0.83791204 0.05933695]]
[[0.25643168 0.50920873 0.90321555 0.8123113 ]
 [0.31801313 0.96442879 0.22835597 0.06321805]
 [0.57571892 0.33734735 0.83224125 0.8761615 ]
 [0.13091818 0.6655994  0.13136361 0.10669835]]
[[0.78572169 0.48145457 0.98621154 0.35160185]
 [0.88870846 0.43048519 0.62888936 0.05948812]
 [0.59566161 0.2607628  0.84319599 0.92404155]
 [0.15342217 0.46801284 0.89791307 0.90106413]]


In [20]:
# multiplying the weighted matrix with the embedded_token to get Q K V matrices


Q = embedding_token @ W_Q
K = embedding_token @ W_K
V = embedding_token @ W_V

print(Q,"\n\n",K,"\n\n\n",V)

# i used the ** 1/2 or **0.5 which is fine for scalers and one time operations
# but its better to use np.sqrt() which is faster for sqrting whole arrays!
# print((Q @ K.T)/d_model**(1/2))
pre_softmax = (Q @ K.T)/np.sqrt(d_model)

print(pre_softmax)

[[2.84425568 2.1365768  2.82817369 1.7123942 ]
 [4.00658368 2.47059417 3.4929933  1.85201421]
 [2.76807843 1.57919186 2.28109111 1.02143725]
 [3.12319012 0.96626896 2.15707872 0.19263579]] 

 [[1.10542393 3.4644862  1.50245368 1.06499387]
 [1.40160425 3.60338154 2.42055779 2.02929716]
 [0.81681852 1.95565357 1.93259585 1.71870606]
 [0.57879471 1.07607611 1.44289215 1.49896059]] 


 [[2.70818348 2.06014199 3.64906261 2.01056677]
 [3.0558849  2.39940941 4.61673256 2.90633561]
 [1.88843189 1.51504014 3.04562295 1.85543774]
 [0.57911917 1.1265516  2.64214913 2.66096448]]
[[ 8.30956922 11.0030684   7.45523189  5.29646023]
 [10.10437865 13.36570871  9.01894522  6.39682288]
 [ 6.52302334  8.58224692  5.75667376  4.06197122]
 [ 5.12307068  6.73578061  4.47030627  3.12432514]]


In [20]:
# let's recreate softmax from scratch using only numpy and convert these vectors
# into attention scores that our model can use!

