<a href="https://colab.research.google.com/github/DOLLARDEV05/AI-LAB/blob/main/Attention_is_All_You_Need/Transformers/single_head_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [172]:
import numpy as np

sentence = "My cat is lazy"

tokens = sentence.lower().split()
print("Tokens:", tokens)

vocab = {word: idx for idx, word in enumerate(set(tokens))}
print("Vocab:", vocab)

token_ids = np.array([vocab[word] for word in tokens])
print("Token IDs:", token_ids)


Tokens: ['my', 'cat', 'is', 'lazy']
Vocab: {'is': 0, 'cat': 1, 'my': 2, 'lazy': 3}
Token IDs: [2 1 0 3]


In [173]:
# let's create some embeddings!

d_model = 4

# this will create a random matrix of small float values with dimension being
# rows = length of the sentence/token list, columns being the dimension we set
# that is the d_model = 4 where this is a hyperparameter, and for single head
# attention this is d_model = Q = K = V
embedding = np.random.rand(len(tokens),d_model)

print(embedding)

[[0.14586421 0.70119547 0.17144063 0.76415822]
 [0.25865658 0.47998634 0.72607315 0.80242705]
 [0.34788698 0.75669908 0.1984486  0.85131991]
 [0.962738   0.5953954  0.87845373 0.17610556]]


In [174]:
embedding_token = np.array([embedding[vocab[token]] for token in tokens])
print(embedding_token)

[[0.34788698 0.75669908 0.1984486  0.85131991]
 [0.25865658 0.47998634 0.72607315 0.80242705]
 [0.14586421 0.70119547 0.17144063 0.76415822]
 [0.962738   0.5953954  0.87845373 0.17610556]]


In [175]:
# let's convert the maths of positional encoding into code successful

import math

for embedding in range(len(embedding_token)):
  for value in range(d_model):
    if value % 2 == 0:
      temp = math.sin(embedding/10000**((2*(value//2))/d_model))
      embedding_token[embedding,value]=embedding_token[embedding,value]+temp
    elif value % 2 == 1:
      temp = math.cos(embedding/10000**((2*(value//2))/d_model))
      embedding_token[embedding,value]=embedding_token[embedding,value]+temp
    else:
      print("error")
print(embedding_token)

[[ 0.34788698  1.75669908  0.1984486   1.85131991]
 [ 1.10012756  1.02028864  0.73607298  1.80237705]
 [ 1.05516163  0.28504864  0.1914393   1.76395823]
 [ 1.103858   -0.3945971   0.90844923  1.17565559]]


In [176]:
x = 0 // 2
y = 1 // 2
z = 2 // 2
p = 3 // 2
q = 4 // 2

# how positional encoding uses floor division to have similar values given to the
# sin and cos fuction to have pi/2 shifted sin and cos waves that also dipicts a
# circle idk if this is necessary to mention here

print(x,y,z,p,q)

# was just showcasing where i was stuck in positional encoding and why it matter

0 0 1 1 2


In [177]:
# let's work on making the weight matrix that will extract features from the
# embedding matrix ! W_Q,W_K,W_V


W_Q = np.random.rand(d_model,d_model)
W_K = np.random.rand(d_model,d_model)
W_V = np.random.rand(d_model,d_model)

print(W_Q)
print(W_K)
print(W_V)

[[0.63014091 0.02741707 0.19828858 0.53166054]
 [0.80905515 0.03125995 0.95912015 0.42628215]
 [0.60602925 0.44329526 0.98692131 0.15422808]
 [0.72422329 0.10867545 0.63136481 0.41348703]]
[[0.245192   0.76355019 0.93882039 0.66777096]
 [0.15809499 0.20076901 0.79161411 0.57583391]
 [0.31457606 0.80364227 0.56777226 0.90171603]
 [0.94634113 0.81764359 0.46842689 0.35226859]]
[[0.11120649 0.21421011 0.09893766 0.40249899]
 [0.04748347 0.26406346 0.52640719 0.63207535]
 [0.04948195 0.8520003  0.51485093 0.70153184]
 [0.97824882 0.87217952 0.24891423 0.9430594 ]]


In [178]:
# multiplying the weighted matrix with the embedded_token to get Q K V matrices


Q = embedding_token @ W_Q
K = embedding_token @ W_K
V = embedding_token @ W_V

print(Q,"\n\n",K,"\n\n\n",V,"\n\n\n\n")

# i used the ** 1/2 or **0.5 which is fine for scalers and one time operations
# but its better to use np.sqrt() which is faster for sqrting whole arrays!
# print((Q @ K.T)/d_model**(1/2))
pre_softmax = (Q @ K.T)/np.sqrt(d_model)

print(pre_softmax)

[[3.1015189  0.35361672 3.1185789  1.72991037]
 [3.27011036 0.58422825 3.06112569 1.87860791]
 [2.28903802 0.31440314 1.78525907 1.44139811]
 [1.77831924 0.54840561 1.47925188 1.04489476]] 

 [[2.17743179 2.29152145 2.6971128  2.07498187]
 [2.36825948 3.11008636 3.10270077 2.62079968]
 [2.0333104  2.45903565 2.15123517 1.66275794]
 [1.60662095 2.45496359 1.79045674 1.74321171]] 


 [[1.9429726  2.32216252 1.52214949 3.135513  ]
 [1.97038371 2.70421015 1.4735365  3.30382678]
 [1.86593876 2.00289179 0.79208389 2.40269186]
 [1.29905486 1.93164062 0.66184773 1.94090575]] 




[[ 9.78217895 11.32737289  8.38056279  7.22518102]
 [10.30674623 11.9913411   8.89732948  7.72185566]
 [ 6.75530287  7.85779865  5.8323308   5.07927795]
 [ 5.64334947  6.622623    4.94202484  4.33670501]]


In [179]:
# let's recreate softmax from scratch using only numpy and convert these vectors
# into attention scores that our model can use!

# done with row max subtraction for each element (was checking row max each
# iterations which caused all values other than 1st getting zeroed)

# we dont sum all row values and then exponent rather we sum the exponent of
# individual values then divide




for vector in pre_softmax:
  row_max = np.max(vector,axis=0)
  for element in range(len(vector)):
    vector[element] = np.exp(vector[element] - row_max)

  for number in vector:
     number = number / np.sum(vector,axis=0)
print(pre_softmax)

[[0.21327051 1.         0.05250693 0.01653639]
 [0.18551958 1.         0.04531978 0.01398898]
 [0.33204135 1.         0.1319321  0.06213035]
 [0.37558385 1.         0.18626253 0.10168068]]
