In [1]:
import torch

In [2]:
# input-embedding:
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [3]:
# omega = attention-scores
attention_scores = inputs @ inputs.T
attention_scores

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])

In [4]:
attention_weights = attention_scores.softmax(dim=1)
attention_weights

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])

In [5]:
context_vectors = attention_weights @ inputs
context_vectors

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])

In [6]:
inputs.shape, inputs.T.shape
# (6, 3) @ (3, 6)
# (6, 6)
# -----
# (6, 6) @ (6, 3)
# (6, 3)

(torch.Size([6, 3]), torch.Size([3, 6]))

In [7]:
context_vectors = (inputs @ inputs.T).softmax(dim=1) @ inputs
context_vectors

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])

In [8]:
torch.manual_seed(42)

# Self-Attention with [Trainable Weights]
x_2 = inputs[1] # [3]
d_in = x_2.shape[0] # 2
d_out = 2

# Weight Matrices
W_q = torch.nn.Parameter(data=torch.rand(d_in, d_out), requires_grad=False) # (3, 2)
W_k = torch.nn.Parameter(data=torch.rand(d_in, d_out), requires_grad=False) # (3, 2)
W_v = torch.nn.Parameter(data=torch.rand(d_in, d_out), requires_grad=False) # (3, 2)

In [9]:
query_2 = x_2 @ W_q # (3) @ (3, 2)
key_2 = x_2 @ W_k # (3) @ (3, 2)
value_2 = x_2 @ W_v # (3) @ (3, 2)

print(query_2)
print(key_2)
print(value_2)

tensor([1.0760, 1.7344])
tensor([1.5764, 0.9441])
tensor([1.7073, 1.0646])


In [10]:
# Projecting all `input` tokens into `keys` & `values`
keys = inputs @ W_k
values = inputs @ W_v

In [12]:
keys_2 = keys[1]
attention_score_22 = query_2.dot(key_2)
print(attention_score_22)

tensor(3.3338)


In [None]:
# (inputs @ W_q).dot(keys.T)
# (inputs @ W_q).shape # (6.2)
# keys.shape (6, 2)
# (inputs @ W_q) @ keys.T

tensor([[1.8033, 2.2486, 2.2247, 1.1941, 1.1672, 1.5103],
        [2.7084, 3.3338, 3.3013, 1.7563, 1.7869, 2.1966],
        [2.6993, 3.3251, 3.2925, 1.7525, 1.7789, 2.1933],
        [1.4408, 1.7618, 1.7454, 0.9243, 0.9596, 1.1492],
        [1.7754, 2.2317, 2.2067, 1.1910, 1.1353, 1.5164],
        [1.6295, 1.9707, 1.9539, 1.0266, 1.1023, 1.2637]])

In [25]:
query_2 = x_2 @ W_q # (3) @ (3, 2)
key_2 = x_2 @ W_k # (3) @ (3, 2)
value_2 = x_2 @ W_v # (3) @ (3, 2)

attention_score_22 = query_2.dot(keys_2)

In [None]:
attention_score_2 = query_2 @ keys.T
attention_score_2

tensor([2.7084, 3.3338, 3.3013, 1.7563, 1.7869, 2.1966])

In [None]:
d_in = 3
d_out = 2

W_q = torch.nn.Parameter(data=torch.rand(d_in, d_out), requires_grad=False)
W_k = torch.nn.Parameter(data=torch.rand(d_in, d_out), requires_grad=False)
W_v = torch.nn.Parameter(data=torch.rand(d_in, d_out), requires_grad=False)
queries = inputs @ W_q # (6, 3) @ (3, 2) = (6, 2)
keys = inputs @ W_k # (6, 3) @ (3, 2) = (6, 2)
values = inputs @ W_v # (6, 3) @ (3, 2) = (6, 2)
attention_scores = queries @ keys.T # (6, 2) @ (2, 6) = (6, 2)
attention_weights = torch.softmax(attention_scores / keys.shape[-1]**0.5, dim=-1) 
context_vectors = attention_weights @ inputs # (6, 6) @ (6, 3) = (6, 3)
context_vectors # (6, 3)

tensor([[0.4361, 0.6039, 0.5583],
        [0.4399, 0.6278, 0.5784],
        [0.4400, 0.6273, 0.5784],
        [0.4346, 0.6097, 0.5568],
        [0.4381, 0.6059, 0.5653],
        [0.4342, 0.6161, 0.5589]])

In [172]:
selfAttention_V2.W_k.weight.T

tensor([[-0.0512, -0.1135],
        [ 0.1528, -0.5516],
        [-0.1745, -0.3824]], grad_fn=<PermuteBackward0>)

In [166]:
from torch import nn
class SelfAttention_V1(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_q = selfAttention_V2.W_q.weight.T # nn.Parameter(data=torch.rand(d_in, d_out))
        self.W_k = selfAttention_V2.W_k.weight.T # nn.Parameter(data=torch.rand(d_in, d_out))
        self.W_v = selfAttention_V2.W_v.weight.T # nn.Parameter(data=torch.rand(d_in, d_out))

    def forward(self, inputs): 
        queries = inputs @ self.W_q          # (6, 3) @ (3, 2) = (6, 2)
        keys = inputs @ self.W_k             # (6, 3) @ (3, 2) = (6, 2)
        values = inputs @ self.W_v           # (6, 3) @ (3, 2) = (6, 2)
        attention_score = (queries @ keys.T) # (6, 2) @ (2, 6) = (6, 6)
        attention_weights = torch.softmax(attention_score / (keys.shape[-1]**0.5), dim=-1)
        context_vectors = (attention_weights) @ values  # (6, 6) @ (6, 2) = (6, 2)
        return context_vectors 

# input-embedding:
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

selfAttention_V1 = SelfAttention_V1(d_in = inputs.shape[-1], 
                                    d_out= 2)


context_vectors = selfAttention_V1(inputs)
context_vectors

tensor([[0.0921, 0.2688],
        [0.0944, 0.2730],
        [0.0944, 0.2731],
        [0.0943, 0.2702],
        [0.0948, 0.2737],
        [0.0940, 0.2692]], grad_fn=<MmBackward0>)

In [241]:
torch.manual_seed(0)

class SelfAttention_V2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias = False):
        super().__init__()
        self.W_q = nn.Linear(in_features = d_in, out_features = d_out, bias=qkv_bias)
        self.W_k = nn.Linear(in_features = d_in, out_features = d_out, bias=qkv_bias)
        self.W_v = nn.Linear(in_features = d_in, out_features = d_out, bias=qkv_bias) 

    def forward(self, inputs): 
        queries = self.W_q(inputs)                          # (6, 3) @ (3, 2) = (6, 2)
        keys = self.W_k(inputs)                             # (6, 3) @ (3, 2) = (6, 2)
        values = self.W_v(inputs)                           # (6, 3) @ (3, 2) = (6, 2)
        attention_score = (queries @ keys.T)                # (6, 2) @ (2, 6) = (6, 6)
        attention_weights = torch.softmax(attention_score / (keys.shape[-1]**0.5), dim=-1)
        context_vectors = (attention_weights) @ values      # (6, 6) @ (6, 2) = (6, 2)
        return context_vectors 

selfAttention_V2 = SelfAttention_V2(d_in = inputs.shape[-1], 
                                    d_out= 2)


context_vectors = selfAttention_V2(inputs)
context_vectors

tensor([[-0.5844,  0.3235],
        [-0.5871,  0.3269],
        [-0.5871,  0.3269],
        [-0.5873,  0.3265],
        [-0.5876,  0.3276],
        [-0.5870,  0.3259]], grad_fn=<MmBackward0>)

In [3]:
import torch
from torch import nn
from torch import tensor

# input-embedding:
inputs = tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [16]:
class SelfAttention(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias = False):
        super().__init__()
        self.W_q = nn.Linear(in_features=d_in, out_features=d_out, bias=qkv_bias)
        self.W_k = nn.Linear(in_features=d_in, out_features=d_out, bias=qkv_bias)
        self.W_v = nn.Linear(in_features=d_in, out_features=d_out, bias=qkv_bias)
    
    def forward(self, x):
        '''x: 2D matrix, with (n_tokens, d_in)'''
        Q = self.W_q(x) # x @ W_q = (n_tokens, d_in) @ (d_in, d_out) = (n_tokens, d_out)
        K = self.W_k(x)
        V = self.W_v(x)
        attention_score = torch.matmul(Q, K.T)
        attention_weights = torch.softmax(attention_score/ K.shape[-1]**0.5 , dim=-1)
        context_vectors = attention_weights @ V
        return context_vectors
selfAttention = SelfAttention(inputs.shape[-1], d_out=8)
selfAttention(inputs)

tensor([[-0.7239,  0.1377, -0.2757, -0.1217,  0.0906,  0.3756, -0.2243,  0.2451],
        [-0.7232,  0.1371, -0.2739, -0.1184,  0.0875,  0.3731, -0.2258,  0.2444],
        [-0.7235,  0.1372, -0.2740, -0.1185,  0.0876,  0.3733, -0.2258,  0.2445],
        [-0.7280,  0.1358, -0.2763, -0.1201,  0.0894,  0.3752, -0.2279,  0.2458],
        [-0.7314,  0.1375, -0.2777, -0.1211,  0.0900,  0.3776, -0.2282,  0.2472],
        [-0.7250,  0.1353, -0.2750, -0.1192,  0.0886,  0.3734, -0.2270,  0.2448]],
       grad_fn=<MmBackward0>)

In [42]:
class SelfAttention(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias = False):
        '''Let's Consider Batched inputs'''
        super().__init__()
        self.W_q = nn.Linear(in_features=d_in, out_features=d_out, bias=qkv_bias)
        self.W_k = nn.Linear(in_features=d_in, out_features=d_out, bias=qkv_bias)
        self.W_v = nn.Linear(in_features=d_in, out_features=d_out, bias=qkv_bias)
    
    def forward(self, x):
        '''x: 3D matrix, with (batch_size, n_tokens, d_in)'''
        Q = self.W_q(x) # (8, 6, 2)
        K = self.W_k(x)
        V = self.W_v(x)
        attention_score = torch.matmul(Q, K.transpose(-1, -2))  
        attention_weights = torch.softmax(attention_score/ K.shape[-1]**0.5 , dim=-1)
        context_vectors = attention_weights @ V
        return context_vectors

inputs = torch.rand(size=(8, 6, 3)) # (6 tokens in each batch with dimensions = 3)
selfAttention = SelfAttention(d_in= inputs.shape[-1], d_out= 2)
selfAttention(inputs).shape

torch.Size([8, 6, 2])

In [49]:
K = torch.tensor([[[1, 2],   # Token A
                   [3, 4],   # Token B
                   [5, 6]]]) # Token C
print(K.transpose(-1, -2))
print(K.reshape(1, 2, 3))


tensor([[[1, 3, 5],
         [2, 4, 6]]])
tensor([[[1, 2, 3],
         [4, 5, 6]]])
