In [1]:
import numpy as np

In [2]:
## Whitespace tokenization keeps the demo simple; a real pipeline would use a subword scheme such as BPE or WordPiece
sentence = "The quick brown fox jumps over the lazy dog".split()
n = len(sentence)  # sequence length (number of tokens)

print(f"Tokenized sentence: {sentence}")
print(f"Number of tokens: {n}")

Tokenized sentence: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Number of tokens: 9


In [3]:
## Toy 4-dimensional word embeddings, one row per token in the same order as `sentence`.
## In a real model these would be learned during language modelling or loaded from a pre-trained source such as GloVe or Word2Vec.

embeddings = np.array([
    [1.0, 0.5, 0.2, 0.8],  # The
    [0.3, 1.0, 0.7, 0.1],  # quick
    [0.6, 0.2, 1.0, 0.4],  # brown
    [0.9, 0.8, 0.3, 1.0],  # fox
    [0.4, 0.6, 0.8, 0.2],  # jumps
    [0.7, 0.3, 0.5, 0.9],  # over
    [1.0, 0.5, 0.2, 0.8],  # the (same vector as "The")
    [0.2, 0.9, 0.4, 0.6],  # lazy
    [0.8, 0.4, 0.9, 0.3],  # dog
])

print("Word embeddings (4-dimensional):")
for row_idx, token in enumerate(sentence):
    vector = embeddings[row_idx]
    print(f"  {token:4}: {vector}")
print()

Word embeddings (4-dimensional):
  The : [1.  0.5 0.2 0.8]
  quick: [0.3 1.  0.7 0.1]
  brown: [0.6 0.2 1.  0.4]
  fox : [0.9 0.8 0.3 1. ]
  jumps: [0.4 0.6 0.8 0.2]
  over: [0.7 0.3 0.5 0.9]
  the : [1.  0.5 0.2 0.8]
  lazy: [0.2 0.9 0.4 0.6]
  dog : [0.8 0.4 0.9 0.3]



In [4]:
## Toy positional encodings, one row per position and as wide as the word embeddings.
## Real models typically generate these with fixed mathematical functions, learn them during training, or use RoPE.

positional_embeddings = np.array([
    [0.0, 1.0, 0.0, 1.0],  # position 0
    [0.1, 0.9, 0.1, 0.9],  # position 1
    [0.2, 0.8, 0.2, 0.8],  # position 2
    [0.3, 0.7, 0.3, 0.7],  # position 3
    [0.4, 0.6, 0.4, 0.6],  # position 4
    [0.5, 0.5, 0.5, 0.5],  # position 5
    [0.6, 0.4, 0.6, 0.4],  # position 6
    [0.7, 0.3, 0.7, 0.3],  # position 7
    [0.8, 0.2, 0.8, 0.2],  # position 8
])

print("Positional embeddings (Same dimesnions as word embeddings):")
for position, token in enumerate(sentence):
    encoding = positional_embeddings[position]
    print(f"  Pos {position} ({token}): {encoding}")
print()

Positional embeddings (Same dimesnions as word embeddings):
  Pos 0 (The): [0. 1. 0. 1.]
  Pos 1 (quick): [0.1 0.9 0.1 0.9]
  Pos 2 (brown): [0.2 0.8 0.2 0.8]
  Pos 3 (fox): [0.3 0.7 0.3 0.7]
  Pos 4 (jumps): [0.4 0.6 0.4 0.6]
  Pos 5 (over): [0.5 0.5 0.5 0.5]
  Pos 6 (the): [0.6 0.4 0.6 0.4]
  Pos 7 (lazy): [0.7 0.3 0.7 0.3]
  Pos 8 (dog): [0.8 0.2 0.8 0.2]



In [5]:
## The attention block consumes the element-wise sum of each token's word embedding and its positional encoding

input_embeddings = np.add(embeddings, positional_embeddings)

print("Input embeddings (word + positional):")
for row_idx, token in enumerate(sentence):
    combined = input_embeddings[row_idx]
    print(f"  {token:4}: {combined}")
print()

Input embeddings (word + positional):
  The : [1.  1.5 0.2 1.8]
  quick: [0.4 1.9 0.8 1. ]
  brown: [0.8 1.  1.2 1.2]
  fox : [1.2 1.5 0.6 1.7]
  jumps: [0.8 1.2 1.2 0.8]
  over: [1.2 0.8 1.  1.4]
  the : [1.6 0.9 0.8 1.2]
  lazy: [0.9 1.2 1.1 0.9]
  dog : [1.6 0.6 1.7 0.5]



In [6]:
d_model = 4      # embedding dimension of the tokens
num_heads = 2    # number of attention heads

# The heads must split d_model evenly. With a remainder, `//` would silently truncate and the
# concatenated head outputs (num_heads * d_k wide) would no longer match the (d_model, d_model)
# output projection.
if d_model % num_heads != 0:
    raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")

d_k = d_model // num_heads  # dimension of the per-head Q, K and V projections

In [None]:
np.random.seed(42)

## Q, K and V projections are stored per head with shape (num_heads, d_model, d_k); the output projection is (d_model, d_model)
per_head_shape = (num_heads, d_model, d_k)
W_q = 0.3 * np.random.randn(*per_head_shape)  # Query weights
W_k = 0.3 * np.random.randn(*per_head_shape)  # Key weights
W_v = 0.3 * np.random.randn(*per_head_shape)  # Value weights
## The output projection is applied after the heads are concatenated, so that information from different heads can be mixed.
W_o = 0.3 * np.random.randn(d_model, d_model)  # Output projection weights

print("Weight matrices:")
print(f"W_q (Query weights) shape: {W_q.shape}", W_q, sep="\n")
print(f"\nW_k (Key weights) shape: {W_k.shape}", W_k, sep="\n")
print(f"\nW_v (Value weights) shape: {W_v.shape}", W_v, sep="\n")
print()

print(f"W_o (Output projection weights) shape: {W_o.shape}", W_o, sep="\n")

Weight matrices:
W_q (Query weights) shape: (2, 4, 2)
[[[ 0.14901425 -0.04147929]
  [ 0.19430656  0.45690896]
  [-0.07024601 -0.07024109]
  [ 0.47376384  0.23023042]]

 [[-0.14084232  0.16276801]
  [-0.13902531 -0.13971893]
  [ 0.07258868 -0.57398407]
  [-0.51747535 -0.16868626]]]

W_k (Key weights) shape: (2, 4, 2)
[[[-0.30384934  0.0942742 ]
  [-0.27240722 -0.42369111]
  [ 0.43969463 -0.06773289]
  [ 0.02025846 -0.42742446]]

 [[-0.16331482  0.03327678]
  [-0.34529807  0.11270941]
  [-0.18019161 -0.08750812]
  [-0.18051198  0.55568346]]]

W_v (Value weights) shape: (2, 4, 2)
[[[-0.00404917 -0.31731328]
  [ 0.24676347 -0.36625309]
  [ 0.06265908 -0.58790104]
  [-0.39845581  0.05905837]]

 [[ 0.22153997  0.05141048]
  [-0.03469448 -0.09033111]
  [-0.4435566  -0.21595326]
  [-0.13819163  0.31713667]]]

W_o (Output projection weights) shape: (4, 4)
[[ 0.10308549 -0.52891205  0.09722519 -0.11552468]
 [-0.2030766   0.18350289  0.30929986  0.27938404]
 [-0.25176526 -0.09276371  0.09937903  

In [11]:
def softmax(x, axis=1):
    """Numerically stable softmax of `x` along `axis`.

    Args:
        x: Array-like of scores. With the default `axis=1` this is a 2-D
            (rows, columns) matrix and every row is normalised independently.
        axis: Axis along which the values are turned into a probability
            distribution. Defaults to 1 (each row sums to 1).

    Returns:
        np.ndarray with the same shape as `x`; entries are non-negative and
        sum to 1 along `axis`.
    """
    x = np.asarray(x)
    # Shift by the maximum *along the normalised axis*, not the global maximum.
    # Softmax is invariant to a per-row shift, and this guarantees every row
    # has at least one exp(0) = 1 term. With a global shift, a row whose values
    # are all far below the global max underflows to zeros and yields 0/0 = NaN.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    # Normalise so that, for each token (row), we get a distribution over all tokens.
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

In [9]:
## Accumulator for the per-head self-attention outputs so they can be concatenated later.
## Each entry is (n, d_k), so once stacked the result has shape (num_heads, n, d_k).
head_outputs = list()

In [10]:
for head in range(num_heads):
    print(f"=== HEAD {head + 1} ===")

    # Project the inputs with this head's own slice of each weight tensor; every result is (n, d_k)
    Q_h, K_h, V_h = (
        np.matmul(input_embeddings, weights[head]) for weights in (W_q, W_k, W_v)
    )

    print(f"Head {head + 1} - Computing Q, K, V using weight matrices:")
    print(f"  Q_h = input_embeddings @ W_q[{head}]  ->  shape: {Q_h.shape}")
    print(f"  K_h = input_embeddings @ W_k[{head}]  ->  shape: {K_h.shape}")
    print(f"  V_h = input_embeddings @ W_v[{head}]  ->  shape: {V_h.shape}")
    print()

=== HEAD 1 ===
Head 1 - Computing Q, K, V using weight matrices:
  Q_h = input_embeddings @ W_q[0]  ->  shape: (9, 2)
  K_h = input_embeddings @ W_k[0]  ->  shape: (9, 2)
  V_h = input_embeddings @ W_v[0]  ->  shape: (9, 2)

=== HEAD 2 ===
Head 2 - Computing Q, K, V using weight matrices:
  Q_h = input_embeddings @ W_q[1]  ->  shape: (9, 2)
  K_h = input_embeddings @ W_k[1]  ->  shape: (9, 2)
  V_h = input_embeddings @ W_v[1]  ->  shape: (9, 2)



In [12]:
## Reset the accumulator every time this cell runs so the cell is idempotent.
## Without this, re-running the cell appends the heads a second time, and re-running it
## after `head_outputs` has been rebound to an ndarray fails because ndarrays have no `.append`.
head_outputs = []

for head in range(num_heads):
    print(f"=== HEAD {head + 1} ===")

    # Per-head projections of the input, each of shape (n, d_k)
    Q_h = input_embeddings @ W_q[head]
    K_h = input_embeddings @ W_k[head]
    V_h = input_embeddings @ W_v[head]

    # Scaled dot-product scores: entry (i, j) compares query i with key j -> (n, n)
    attention_scores_h = (Q_h @ K_h.T) / np.sqrt(d_k)
    print(f"Head {head + 1} - Attention scores (Q_h @ K_h.T) scaled by sqrt(d_k): {attention_scores_h.shape}")

    # Row-wise softmax: each token gets a probability distribution over all tokens -> (n, n)
    attention_weights_h = softmax(attention_scores_h)
    print(f"Head {head + 1} - Attention weights after softmax: {attention_weights_h.shape}")

    # Each output row is the attention-weighted average of the value vectors -> (n, d_k)
    output_h = attention_weights_h @ V_h
    print(f"Head {head + 1} - Output (attention_weights_h @ V_h): {output_h.shape}")

    head_outputs.append(output_h)
    print()


=== HEAD 1 ===
Head 1 - Attention scores (Q_h @ K_h.T) scaled by sqrt(d_k): (9, 9)
Head 1 - Attention weights after softmax: (9, 9)
Head 1 - Output (attention_weights_h @ V_h): (9, 2)

=== HEAD 2 ===
Head 2 - Attention scores (Q_h @ K_h.T) scaled by sqrt(d_k): (9, 9)
Head 2 - Attention weights after softmax: (9, 9)
Head 2 - Output (attention_weights_h @ V_h): (9, 2)



In [13]:
## Turn the list of per-head (n, d_k) outputs into a single array of shape (num_heads, n, d_k)
head_outputs = np.asarray(head_outputs)
print(f"Stacked head outputs shape: {head_outputs.shape}")

Stacked head outputs shape: (2, 9, 2)


In [14]:
## Join the heads side by side along the last (feature) axis: each token's row grows from d_k to num_heads * d_k = d_model columns

per_head_blocks = list(head_outputs)  # one (n, d_k) block per head
concatenated_output = np.concatenate(per_head_blocks, axis=-1)  # (n, d_model)
print(f"Concatenated output from all heads: {concatenated_output.shape}")

Concatenated output from all heads: (9, 4)


In [15]:
## Project the concatenated head outputs with W_o so information from the different heads is mixed
final_output = np.matmul(concatenated_output, W_o)  # (n, d_model)
print(f"Final output after output projection (concatenated_output @ W_o): {final_output.shape}")

Final output after output projection (concatenated_output @ W_o): (9, 4)


In [16]:
## Show the result of the output projection: one row per token, d_model values per row
print(final_output)

[[ 0.34494302 -0.16594008 -0.49273973 -0.51357179]
 [ 0.34647487 -0.15909348 -0.4852738  -0.50635228]
 [ 0.34336687 -0.15293441 -0.48028235 -0.50077979]
 [ 0.34598795 -0.16394967 -0.48996047 -0.51109125]
 [ 0.34236345 -0.15115023 -0.47936929 -0.49953431]
 [ 0.34188169 -0.15456373 -0.48264006 -0.50284664]
 [ 0.33998404 -0.15521293 -0.4845477  -0.50426322]
 [ 0.34235727 -0.15255953 -0.480753   -0.50093792]
 [ 0.33840776 -0.14368761 -0.47391596 -0.49318787]]
