In [1]:
import torch

inputs = torch.tensor(
    [[0.43, 0.15, 0.89], # Your      (x^1)
     [0.55, 0.87, 0.66], # journey   (x^2)
     [0.57, 0.85, 0.64], # starts    (x^3)
     [0.22, 0.58, 0.33], # with      (x^4)
     [0.77, 0.25, 0.10], # one       (x^5)
     [0.05, 0.80, 0.55]] # step      (x^6)
)

print("Input embeddings shape:", inputs.shape)
print("Input embeddings:")
print(inputs)


Input embeddings shape: torch.Size([6, 3])
Input embeddings:
tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])


In [6]:
# Use the second token ("journey", x^2) as the query.
query = inputs[1]
print("Query token (journey):", query, end="\n\n")

# One dot product per input row: how strongly each token matches the query.
attention_scores = query @ inputs.T
print("Raw attention scores:", attention_scores, end="\n\n")

# Softmax turns the raw scores into positive weights that add up to 1.
attention_weights = attention_scores.softmax(dim=-1)
print("Attention weights:", attention_weights)
print("Sum of weights:", attention_weights.sum(), end="\n\n")

# The context vector is the attention-weighted sum of all input rows.
context_vector = attention_weights @ inputs
print("Context vector for 'journey':", context_vector)


Query token (journey): tensor([0.5500, 0.8700, 0.6600])

Raw attention scores: tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum of weights: tensor(1.)

Context vector for 'journey': tensor([0.4419, 0.6515, 0.5683])


In [7]:
# Score every token against every token in a single matrix product.
# Row i of the resulting 6x6 matrix holds the attention scores for query i.
attention_scores_all = inputs @ inputs.T
print("Attention scores matrix (6x6):", attention_scores_all, sep="\n", end="\n\n")

# Normalise along the last dimension so every row becomes a weight distribution.
attention_weights_all = attention_scores_all.softmax(dim=-1)
print("Attention weights matrix (each row sums to 1):", attention_weights_all, sep="\n", end="\n\n")

# Sanity check: summing each row of weights should give 1.
print("Row sums (should all be 1.0):", attention_weights_all.sum(dim=-1), sep="\n", end="\n\n")

# Each output row is the weighted combination of the input rows for one query.
context_vectors_all = attention_weights_all @ inputs
print("Context vectors for all words:", context_vectors_all, sep="\n", end="\n\n")
print("Shape:", context_vectors_all.shape)


Attention scores matrix (6x6):
tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

Attention weights matrix (each row sums to 1):
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

Row sums (should all be 1.0):
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

Context vectors for all words:
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
 

In [None]:
# Cross-check: the context vector computed on its own for "journey" should
# equal row 1 of the batched result.
# NOTE(review): this relies on `context_vector` still holding the single-query
# result; a later cell rebinds that name, so run this cell in notebook order.
print("Single context vector for 'journey' (computed earlier):", context_vector, sep="\n", end="\n\n")
print("Context vector for 'journey' from matrix (row 1):", context_vectors_all[1], sep="\n", end="\n\n")
print("Are they the same?", torch.allclose(context_vector, context_vectors_all[1]))


In [10]:
# Seed the global RNG so the random weight matrices are reproducible.
torch.manual_seed(42)

# Draw the three 3x3 matrices in a fixed order (query, key, value) from a
# standard normal distribution.
W_q, W_k, W_v = (torch.randn(3, 3) for _ in range(3))

print(W_q, W_k, W_v, sep="\n")

# NOTE(review): no query/key/value vectors are computed in this cell; the
# matrices are only initialised and printed here.







tensor([[ 0.3367,  0.1288,  0.2345],
        [ 0.2303, -1.1229, -0.1863],
        [ 2.2082, -0.6380,  0.4617]])
tensor([[ 0.2674,  0.5349,  0.8094],
        [ 1.1103, -1.6898, -0.9890],
        [ 0.9580,  1.3221,  0.8172]])
tensor([[-0.7658, -0.7506,  1.3525],
        [ 0.6863, -0.3278,  0.7950],
        [ 0.2815,  0.0562,  0.5227]])


In [13]:
# First, project the inputs into query/key/value space for a single query word.
# Only the query is specific to "journey"; keys and values are needed for
# every word in the sentence, because the query is compared against all of them.

x_2 = inputs[1]  # embedding of the query word "journey"
d_in = inputs.shape[1]  # dimension of the input vectors
d_out = 2  # dimension of the projected query/key/value vectors

# Projection matrices of shape (d_in, d_out), drawn uniformly from [0, 1).
# torch.rand uses the global RNG, so the values depend on the seed set earlier
# and on the cells being run in notebook order.
W_q = torch.rand(d_in, d_out)
W_k = torch.rand(d_in, d_out)
W_v = torch.rand(d_in, d_out)

print(W_q)
print(W_k)
print(W_v)

q = x_2 @ W_q  # query vector for the word "journey" (length d_out)
k = inputs @ W_k  # key vectors for ALL words: one row of length d_out per input row
v = inputs @ W_v  # value vectors for ALL words: one row of length d_out per input row

print("query vector for the word journey")
print(q)
# k and v hold one row per input word, so they are labelled as such.
print("key vectors for all words in the sentence")
print(k)
print("value vectors for all words in the sentence")
print(v)

tensor([[0.9192, 0.4008],
        [0.9302, 0.6558],
        [0.0766, 0.8460]])
tensor([[0.3624, 0.3083],
        [0.0850, 0.0029],
        [0.6431, 0.3908]])
tensor([[0.6947, 0.0897],
        [0.8712, 0.1330],
        [0.4137, 0.6044]])
query vector for the word journey
tensor([1.3654, 1.3493])
key vector for the word journey
tensor([[0.7409, 0.4808],
        [0.6977, 0.4300],
        [0.6904, 0.4283],
        [0.3412, 0.1985],
        [0.3646, 0.2772],
        [0.4398, 0.2327]])
value vector for the word journey
tensor([[0.7975, 0.5965],
        [1.4130, 0.5639],
        [1.4012, 0.5510],
        [0.7946, 0.2963],
        [0.7941, 0.1627],
        [0.9592, 0.4433]])


In [15]:
# Context vectors for every word in the sentence, computed in one pass.
# NOTE(review): this cell rebinds `attention_weights` and `context_vector`,
# which earlier held the results for the single "journey" query; from here on
# they hold the results for all words.

print("Original Inputs", inputs, sep="\n")

print("Transpose of Inputs", inputs.T, sep="\n")

print("Attention Weights")
# Row-wise softmax over the pairwise dot products of the inputs.
attention_weights = (inputs @ inputs.T).softmax(dim=-1)
print(attention_weights)

print("Context Vector")
# Weighted combination of the input rows, one output row per word.
context_vector = attention_weights @ inputs

print("attention weights for all words in the sentence", attention_weights, sep="\n")

print("context vector for all words in the sentence", context_vector, sep="\n")





Original Inputs
tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])
Transpose of Inputs
tensor([[0.4300, 0.5500, 0.5700, 0.2200, 0.7700, 0.0500],
        [0.1500, 0.8700, 0.8500, 0.5800, 0.2500, 0.8000],
        [0.8900, 0.6600, 0.6400, 0.3300, 0.1000, 0.5500]])
Attention Weights
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
Context Vector
attention weights for all words in the sentence
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.124