In [3]:
import torch
import torch.nn.functional as F
import math
# Single-head scaled dot-product self-attention, worked through on a tiny
# hand-written input so every intermediate tensor can be inspected.

# Fix the RNG so the random projection matrices below are reproducible.
torch.manual_seed(42)

# Toy input: one sequence of three tokens, each a 4-dimensional embedding.
# The leading batch axis is added with unsqueeze, giving shape (1, 3, 4).
X = torch.tensor(
    [
        [0.5, 1.0, 0.0, 2.0],
        [1.0, 0.0, 1.5, 0.5],
        [0.0, 2.0, 1.0, 1.0],
    ]
).unsqueeze(0)

# Square projection matrices, drawn uniformly from [0, 1).
# Draw order (Q, then K, then V) matters for reproducibility under the seed.
d_model = X.shape[-1]
W_Q, W_K, W_V = (torch.rand(d_model, d_model) for _ in range(3))

# Project the input into query, key and value spaces.
Q = X @ W_Q
K = X @ W_K
V = X @ W_V

# Pairwise query/key similarity, scaled by sqrt(d_model).
scores = (Q @ K.transpose(-2, -1)) / math.sqrt(d_model)
# Normalize over the key axis so each query's weights sum to 1.
attention_weights = torch.softmax(scores, dim=-1)
# Each output row is the attention-weighted average of the value rows.
context_vector = attention_weights @ V

# Report the input shape, then every intermediate tensor in pipeline order.
print("Input X shape:", X.shape)
for _heading, _tensor in (
    ("\nQuery (Q):\n", Q),
    ("\nKey (K):\n", K),
    ("\nValue (V):\n", V),
    ("\nAttention Scores:\n", scores),
    ("\nAttention Weights:\n", attention_weights),
    ("\nContext Vector (Final Output):\n", context_vector),
):
    print(_heading, _tensor)

Input X shape: torch.Size([1, 3, 4])

Query (Q):
 tensor([[[2.5704, 2.1938, 1.9302, 2.1321],
         [2.7281, 1.3986, 2.1553, 2.0644],
         [2.5911, 1.9027, 2.1888, 2.6103]]])

Key (K):
 tensor([[[1.8067, 0.7406, 2.3333, 1.2959],
         [1.3170, 0.9812, 1.2806, 0.9641],
         [1.1918, 1.1584, 1.9042, 1.9380]]])

Value (V):
 tensor([[[2.5990, 1.7584, 2.3007, 1.9257],
         [2.2320, 1.2705, 1.7976, 2.5233],
         [2.5792, 2.4538, 2.6084, 2.2205]]])

Attention Scores:
 tensor([[[6.7678, 5.0326, 6.7061],
         [6.8346, 4.8579, 6.4882],
         [7.2903, 5.2995, 7.2594]]])

Attention Weights:
 tensor([[[0.4725, 0.0833, 0.4442],
         [0.5418, 0.0750, 0.3832],
         [0.4748, 0.0649, 0.4603]]])

Context Vector (Final Output):
 tensor([[[2.5596, 2.0266, 2.3955, 2.1065],
         [2.5639, 1.9882, 2.3809, 2.0835],
         [2.5661, 2.0469, 2.4097, 2.1002]]])
