In [None]:
import numpy as np

# Example token embeddings: 3 tokens (rows), each of dimension 4 (columns)
X = np.array([
    [1, 0, 1, 0],   # token 1
    [0, 2, 0, 2],   # token 2
    [1, 1, 1, 1],   # token 3
], dtype=float)

d_model = X.shape[1]

# Seeded generator so the random projection weights -- and therefore the
# attention weights and output computed from them -- are the same on every run.
rng = np.random.default_rng(seed=42)

# Weight matrices for Query, Key, Value, drawn uniformly from [0, 1)
W_Q = rng.random((d_model, d_model))
W_K = rng.random((d_model, d_model))
W_V = rng.random((d_model, d_model))

# Step 1: Create Q, K, V by projecting every token embedding
Q = X @ W_Q
K = X @ W_K
V = X @ W_V

# Step 2: Scaled dot-product attention
# scores[i, j] is the dot product of query i with key j, divided by
# sqrt(d_model) so the values fed to softmax do not grow with the dimension.
scores = Q @ K.T / np.sqrt(d_model)

def softmax(x, axis=-1):
    """Return the softmax of *x* along *axis*.

    Args:
        x: Array-like of scores. Any number of dimensions >= 1 is accepted.
        axis: Axis along which the values are normalised. Defaults to the
            last axis, which for a 2-D score matrix is each row.

    Returns:
        An array with the same shape as *x* whose entries are non-negative
        and sum to 1 along *axis*.
    """
    x = np.asarray(x)
    # Subtract the per-slice maximum before exponentiating: the result is
    # mathematically unchanged, but np.exp can no longer overflow.
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

# Turn each row of the score matrix into a set of attention weights.
attention_weights = softmax(scores)

# Step 3: Final output -- combine the value vectors using those weights.
output = np.matmul(attention_weights, V)

# Report the input and the results, one labelled matrix at a time.
for heading, matrix in (
    ("Input tokens:\n", X),
    ("\nAttention weights:\n", attention_weights),
    ("\nOutput of single-head attention:\n", output),
):
    print(heading, matrix)


Input tokens:
 [[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]]

Attention weights:
 [[0.07478316 0.23613828 0.68907856]
 [0.04191368 0.16370204 0.79438427]
 [0.0211171  0.13177896 0.84710393]]

Output of single-head attention:
 [[1.52505401 3.19576466 2.57757996 1.3192114 ]
 [1.54470518 3.23312057 2.63780485 1.38672871]
 [1.55863388 3.26078451 2.67211186 1.41891149]]
