In [1]:
import torch

inputs = torch.tensor(
    [[0.43, 0.15, 0.89], # Your      (x^1)
     [0.55, 0.87, 0.66], # journey   (x^2)
     [0.57, 0.85, 0.64], # starts    (x^3)
     [0.22, 0.58, 0.33], # with      (x^4)
     [0.77, 0.25, 0.10], # one       (x^5)
     [0.05, 0.80, 0.55]] # step      (x^6)
)

print("Input embeddings shape:", inputs.shape)
print("Input embeddings:")
print(inputs)


Input embeddings shape: torch.Size([6, 3])
Input embeddings:
tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])


In [6]:
query = inputs[1]  # Select "journey" (x^2) as our query
print("Query token (journey):", query)
print()

# Compute attention scores by taking dot product of query with all inputs
attention_scores = torch.matmul(query, inputs.T)
print("Raw attention scores:", attention_scores)
print()

# Apply softmax to get attention weights (they sum to 1)
attention_weights = torch.softmax(attention_scores, dim=-1)
print("Attention weights:", attention_weights)
print("Sum of weights:", attention_weights.sum())
print()

# Create context vector as weighted combination of all input vectors
context_vector = torch.matmul(attention_weights, inputs)
print("Context vector for 'journey':", context_vector)


Query token (journey): tensor([0.5500, 0.8700, 0.6600])

Raw attention scores: tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum of weights: tensor(1.)

Context vector for 'journey': tensor([0.4419, 0.6515, 0.5683])


In [7]:
# Compute attention scores for all queries at once
# inputs @ inputs.T gives us a 6x6 matrix where each row is attention scores for one query
attention_scores_all = torch.matmul(inputs, inputs.T)
print("Attention scores matrix (6x6):")
print(attention_scores_all)
print()

# Apply softmax to each row to get attention weights
attention_weights_all = torch.softmax(attention_scores_all, dim=-1)
print("Attention weights matrix (each row sums to 1):")
print(attention_weights_all)
print()

# Verify that each row sums to 1
print("Row sums (should all be 1.0):")
print(attention_weights_all.sum(dim=-1))
print()

# Compute context vectors for all words
context_vectors_all = torch.matmul(attention_weights_all, inputs)
print("Context vectors for all words:")
print(context_vectors_all)
print()
print("Shape:", context_vectors_all.shape)


Attention scores matrix (6x6):
tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

Attention weights matrix (each row sums to 1):
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

Row sums (should all be 1.0):
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

Context vectors for all words:
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
 

In [None]:
# Verify that our single context vector for "journey" matches the matrix result
print("Single context vector for 'journey' (computed earlier):")
print(context_vector)
print()
print("Context vector for 'journey' from matrix (row 1):")
print(context_vectors_all[1])
print()
print("Are they the same?", torch.allclose(context_vector, context_vectors_all[1]))


In [10]:
# Randomly initializing the weight matrices
torch.manual_seed(42)

W_q = torch.randn(3, 3)
W_k = torch.randn(3, 3)
W_v = torch.randn(3, 3)

print (W_q)
print (W_k)
print (W_v)

# Computing the query, key, and value vectors







tensor([[ 0.3367,  0.1288,  0.2345],
        [ 0.2303, -1.1229, -0.1863],
        [ 2.2082, -0.6380,  0.4617]])
tensor([[ 0.2674,  0.5349,  0.8094],
        [ 1.1103, -1.6898, -0.9890],
        [ 0.9580,  1.3221,  0.8172]])
tensor([[-0.7658, -0.7506,  1.3525],
        [ 0.6863, -0.3278,  0.7950],
        [ 0.2815,  0.0562,  0.5227]])


In [16]:
# Set dimensions like in the book
x_2 = inputs[1]  # Focus on "journey" token first
d_in = inputs.shape[1]  # Input dimension = 3
d_out = 2  # Output dimension (smaller for illustration)

print(f"Input token 'journey': {x_2}")
print(f"Input dimension (d_in): {d_in}")
print(f"Output dimension (d_out): {d_out}")
print()


Input token 'journey': tensor([0.5500, 0.8700, 0.6600])
Input dimension (d_in): 3
Output dimension (d_out): 2



In [None]:
# ### Comparison: Simple vs Standard Attention

# **Key Differences:**

# 1. **Simple Self-Attention**: Direct dot product of embeddings
#    - `attention = softmax(inputs @ inputs.T)`
#    - Uses original 3D embeddings
#    - Context vector is 3D

# 2. **Standard QKV Attention**: Learned transformations first
#    - `attention = softmax(Q @ K.T / √d_k)`
#    - Uses transformed 2D vectors (Q, K, V)
#    - Context vector is 2D
#    - Has learnable parameters (W_q, W_k, W_v)
#    - Includes scaling factor


In [17]:
# Initialize the three weight matrices W_q, W_k, and W_v
torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

print("W_query matrix:")
print(W_query)
print("\nW_key matrix:")
print(W_key)
print("\nW_value matrix:")
print(W_value)
print()


W_query matrix:
Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])

W_key matrix:
Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])

W_value matrix:
Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])



In [51]:
# Compute query, key, and value vectors for token "journey"
query_2 = x_2 @ W_query
key_2 = x_2 @ W_key  
value_2 = x_2 @ W_value

print("Query vector for 'journey':", query_2)
print("Key vector for 'journey':", key_2)
print("Value vector for 'journey':", value_2)
print()

# Show the difference between simple and QKV attention
print("COMPARISON:")
print("Original embedding for 'journey':", x_2)
print("Query (what 'journey' is looking for):", query_2)
print("Key (what 'journey' advertises about itself):", key_2)
print("Value (information 'journey' provides):", value_2)


Query vector for 'journey': tensor([0.4306, 1.4551])
Key vector for 'journey': tensor([0.4433, 1.1419])
Value vector for 'journey': tensor([0.3951, 1.0037])

COMPARISON:
Original embedding for 'journey': tensor([0.5500, 0.8700, 0.6600])
Query (what 'journey' is looking for): tensor([0.4306, 1.4551])
Key (what 'journey' advertises about itself): tensor([0.4433, 1.1419])
Value (information 'journey' provides): tensor([0.3951, 1.0037])


In [52]:
# First lets see how the context vector is computed for a single word

x_2 = inputs[1] # word = journey
d_in = inputs.shape[1] # dimension of the input vectors's column
d_out = 3 # dimension of the output vectors

W_q = torch.rand(d_in, d_out)
W_k = torch.rand(d_in, d_out)
W_v = torch.rand(d_in, d_out)

print (W_q)
print (W_k)
print (W_v)

q = x_2 @ W_q # query vector for the word "journey"
k = x_2 @ W_k # key vector for the word "journey"
v = x_2 @ W_v # value vector for the word "journey"

#The following K and V are used to compute the attention weights for all words in the sentence
#needed  as we will now use it with journey word to compute the attention weights for all words 
#in the sentence in relation to journey word
Q = inputs @ W_q # query vector for all words in the sentence
K = inputs @ W_k # key vector for all words in the sentence
V = inputs @ W_v # value vector for all words in the sentence

print ("\n query vector for the word journey")
print (q)
print ("\n key vector for the word journey")
print (k)
print ("\n value vector for the word journey")
print (v)


attention_scores = torch.matmul(q, K.T) # calculating the attention scores for all words in the sentence in relation to journey word
#q contains the query vector for the word "journey"
#K contains the key vector for all words in the sentence
#the result of the dot product of q and K.T is the attention scores for all words in the sentence in relation to journey word   


attention_weights = torch.softmax(attention_scores, dim=-1) # this is the attention weights for all words in the sentence in relation to journey word
#attention_scores is the dot product of q and K.T
#softmax is used to convert the attention scores into attention weights
#the attention weights are then used to compute the context vector for the word "journey"   
print ("\n attention weights for the word journey")
print (attention_weights)

context_vector = torch.matmul(attention_weights, V) # this is the context vector for the word "journey"
#attention_weights is the attention weights for all words in the sentence in relation to journey word
#V contains the value vector for all words in the sentence
#the result of the dot product of attention_weights and V is the context vector for the word "journey"  
# context_vector contains the context vector for the word "journey"
# it is a new representation of the word "journey" that captures the context of the sentence
print ("\n context vector for the word journey")
print (context_vector)  






# print ("\n attention scores for the word journey ")
# print (attention_scores)
# print ("\n attention weights shape")
# print (attention_weights.shape)
# print ("\n value shape")
# print (v.shape)
    
# context_vector = torch.matmul(attention_weights, v)

# print ("\n context vector for the word journey")
# print (context_vector)

tensor([[0.8855, 0.9941, 0.3705],
        [0.5148, 0.2103, 0.9562],
        [0.6591, 0.4172, 0.6253]])
tensor([[0.9961, 0.7036, 0.7429],
        [0.9616, 0.5214, 0.5024],
        [0.6241, 0.0379, 0.6748]])
tensor([[0.4962, 0.4761, 0.5288],
        [0.9429, 0.6435, 0.0470],
        [0.9632, 0.8049, 0.7523]])

 query vector for the word journey
tensor([1.3699, 1.0051, 1.4483])

 key vector for the word journey
tensor([1.7964, 0.8656, 1.2910])

 value vector for the word journey
tensor([1.7290, 1.3529, 0.8283])

 attention weights for the word journey
tensor([0.0659, 0.3977, 0.3876, 0.0360, 0.0567, 0.0561])

 context vector for the word journey
tensor([1.5756, 1.2382, 0.7768])


In [60]:
# caclulating context vector for all words in the sentence
# making a loop to calculate the context vector for all words in the sentence
context_vectors = []

Q = inputs @ W_q # query vector for all words in the sentence
K = inputs @ W_k # key vector for all words in the sentence
V = inputs @ W_v # value vector for all words in the sentence


for i in range(inputs.shape[0]):
    q = Q[i]  # query vector for word 
    attention_scores = torch.matmul(q, K.T)  #calculating the attention scores for all words in the sentence in relation to the word i
    attention_weights = torch.softmax(attention_scores / (K.shape[-1] ** 0.5), dim=-1)  # scaled
    context_vector = torch.matmul(attention_weights, V) #calculating the context vector for the word i in relation to all words in the sentence

    context_vectors.append(context_vector)


print("\nContext vectors for all words:")
print(context_vectors) # this is the context vector for all words in the sentence in relation to all words in the sentence



Context vectors for all words:
[tensor([1.4076, 1.1098, 0.7100]), tensor([1.4618, 1.1514, 0.7327]), tensor([1.4605, 1.1505, 0.7322]), tensor([1.3737, 1.0839, 0.6971]), tensor([1.3835, 1.0912, 0.6995]), tensor([1.3940, 1.0997, 0.7063])]
