In [6]:
  import torch
import torch.nn.functional as F
import math
# Pin the RNG state so the randomly drawn projection weights are repeatable.
torch.manual_seed(42)

# Toy input of shape (batch=1, seq_len=3, embedding_dim=4); each inner row
# stands in for the embedding of one token.
X = torch.tensor([[
    [1.0, 0.5, 1.5, 0.0],  # token 1
    [0.0, 1.5, 0.5, 1.0],  # token 2
    [1.0, 1.0, 1.0, 1.0],  # token 3
]])

# Square (d_model x d_model) projection weights. They are drawn strictly in
# the order W_Q, W_K, W_V, which fixes which seeded values each one receives.
d_model = X.shape[-1]
W_Q, W_K, W_V = (torch.rand(d_model, d_model) for _ in range(3))

# Project the input into query, key and value spaces.
Q = X @ W_Q
K = X @ W_K
V = X @ W_V

# Scaled dot-product attention:
#   1. similarity of every query with every key, divided by sqrt(d_model)
#   2. softmax over the last (key) axis, so each query's weights sum to 1
#   3. weighted sum of the value vectors
scores = (Q @ K.transpose(-2, -1)) / math.sqrt(d_model)
attention_weights = F.softmax(scores, dim=-1)
context_vector = attention_weights @ V

# Report every intermediate result, in computation order.
for _label, _result in (
    ("Input X shape:", X.shape),
    ("\nQuery (Q):\n", Q),
    ("\nKey (K):\n", K),
    ("\nValue (V):\n", V),
    ("\nAttention Scores:\n", scores),
    ("\nAttention Weights:\n", attention_weights),
    ("\nContext Vector (Final Output):\n", context_vector),
):
    print(_label, _result)

Input X shape: torch.Size([1, 3, 4])

Query (Q):
 tensor([[[2.4887, 1.4152, 1.9130, 2.2465],
         [1.9255, 1.5357, 1.5933, 1.9167],
         [3.0829, 2.2168, 2.3151, 2.7759]]])

Key (K):
 tensor([[[1.1782, 1.1988, 0.9533, 1.3423],
         [1.0043, 0.8030, 1.5763, 1.4225],
         [1.8076, 1.2909, 1.8739, 1.7338]]])

Value (V):
 tensor([[[2.1265, 1.5818, 1.6806, 2.3997],
         [1.9731, 1.8196, 1.9758, 1.5761],
         [2.8873, 2.1330, 2.3914, 2.6872]]])

Attention Scores:
 tensor([[[4.7340, 4.9235, 6.9025],
         [4.1006, 4.2023, 5.8857],
         [6.1115, 6.2371, 8.7927]]])

Attention Weights:
 tensor([[[0.0913, 0.1103, 0.7984],
         [0.1240, 0.1372, 0.7388],
         [0.0598, 0.0677, 0.8725]]])

Context Vector (Final Output):
 tensor([[[2.7170, 2.0481, 2.2806, 2.5383],
         [2.6675, 2.0217, 2.2462, 2.4991],
         [2.7799, 2.0788, 2.3207, 2.5947]]])
