In [2]:
import numpy as np

def attention(Q, K, V, d_k):
    """Scaled dot-product attention: softmax(Q @ K.T / sqrt(d_k)) @ V.

    Args:
        Q: Query array. Combined with ``K.T`` via ``np.dot``, so for 2-D
            inputs its last dimension must match K's last dimension.
        K: Key array; one row per key in the 2-D case.
        V: Value array; multiplied by the attention weights via ``np.dot``,
            so for 2-D inputs it needs one row per key.
        d_k: Scaling dimension; the raw scores are divided by ``sqrt(d_k)``.

    Returns:
        A tuple ``(output, attention_weights)`` where ``attention_weights``
        is the softmax of the scaled scores along the last axis (each row
        sums to 1) and ``output`` is ``np.dot(attention_weights, V)``.
    """
    # Step 1: similarity between queries and keys, scaled by sqrt(d_k).
    scores = np.dot(Q, K.T) / np.sqrt(d_k)

    # Step 2: softmax over the last axis. Subtracting the per-row maximum
    # leaves the result mathematically unchanged but keeps every exponent
    # <= 0, so np.exp cannot overflow to inf (which would yield nan weights).
    shifted = scores - np.max(scores, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    # Step 3: weight the values by the attention distribution.
    output = np.dot(attention_weights, V)

    return output, attention_weights

In [3]:
# Toy inputs: two queries and two keys (identity matrices), with two value rows.
Q = np.array([
    [1, 0],
    [0, 1],
])  # queries, 2x2
K = np.array([
    [1, 0],
    [0, 1],
])  # keys, 2x2
V = np.array([
    [1, 2],
    [3, 4],
])  # values, 2x2

# Key dimensionality (number of columns of K), used to scale the scores.
d_k = K.shape[-1]

In [4]:
# Run attention on the toy Q/K/V matrices and keep both results.
output, attention_weights = attention(Q, K, V, d_k)

# Report the softmax weights first, then the weighted combination of V.
for label, values in (
    ("Attention Weights:\n", attention_weights),
    ("Output after applying attention:\n", output),
):
    print(label, values)

Attention Weights:
 [[0.66976155 0.33023845]
 [0.33023845 0.66976155]]
Output after applying attention:
 [[1.6604769 2.6604769]
 [2.3395231 3.3395231]]
