## Attention With Trainable Weights

In [30]:
import torch

In [31]:
# Seed the global RNG before the first random draw so the embeddings (and every
# tensor derived from them further down) are identical on each Restart & Run All.
torch.manual_seed(123)
# Toy input: 4 tokens, each represented by an 8-dimensional embedding vector.
inputs = torch.nn.Embedding(4, 8)

In [32]:
# Keep only the raw embedding matrix as a plain tensor. `.detach()` is the
# supported way to get a gradient-free view of a parameter; `.data` does the
# same but hides in-place modifications from autograd's correctness checks.
# NOTE: this rebinds `inputs` from the Embedding module to its weight tensor,
# so the cell can only be run once per kernel session.
inputs = inputs.weight.detach()
inputs

tensor([[ 0.6998, -1.3441, -1.0840, -0.8677,  1.5601, -1.1193, -0.3794, -0.2593],
        [-0.2150,  1.4361, -0.9720,  0.2007,  0.2984,  0.9455,  0.4857,  1.1697],
        [ 0.2430, -2.7900,  0.0987,  0.1144,  0.0182, -0.7825, -1.6714,  1.3765],
        [-0.4029, -0.1445, -0.1091,  0.8046, -0.6051,  1.1130,  0.1990, -0.4346]])

In [33]:
# Embedding size going in, projection size coming out
d_in, d_out = 8, 6
# Query, key and value weight matrices, each (d_in, d_out) with uniform random
# entries in [0, 1). They are built in the order q, k, v and are frozen
# (requires_grad=False), so results computed from them carry no grad_fn.
W_q, W_k, W_v = (
    torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    for _ in range(3)
)

In [34]:
# Pick one input vector (row 2 here) and project it with W_q to get its query
query = torch.matmul(inputs[2], W_q)
query


tensor([-2.6857, -2.5957, -1.9745, -0.3506, -1.4866, -2.4151])

In [35]:
# Project every input row into key space and into value space
keys, values = inputs @ W_k, inputs @ W_v
print(keys, values, sep="\n")

tensor([[-2.2984e+00, -2.9225e+00, -2.0923e+00, -2.1051e+00, -1.9800e+00,
         -1.6977e-02],
        [ 2.3245e+00,  2.4342e+00,  2.4395e+00,  2.2793e+00,  1.0840e+00,
          2.2500e-01],
        [-2.1410e+00, -2.7600e+00, -2.0492e+00, -2.8903e+00, -2.5054e+00,
         -4.5085e-01],
        [-2.4465e-01,  3.3326e-01, -1.4882e-03, -3.3886e-02,  3.0863e-01,
         -2.4951e-02]])
tensor([[-2.2086, -0.4454, -2.0527, -1.3508, -1.7131, -1.3904],
        [ 1.7169,  0.7648,  3.0319,  2.3913,  1.9041,  1.7529],
        [-1.3820,  0.5128, -2.0619, -1.5754, -2.9080, -3.2197],
        [ 1.1661, -0.4348,  0.4523,  0.1352,  0.4739,  0.1568]])


In [36]:
# Dot product of the query with every key row -> one raw score per input token
attention_scores = torch.matmul(query, keys.transpose(0, 1))
attention_scores

tensor([ 21.6126, -20.3321,  22.7872,  -0.5917])

In [37]:
# Turn the raw scores into weights that sum to 1.
# Dividing by sqrt(d_k), where d_k is the key dimension, keeps large dot
# products from pushing softmax towards a near one-hot output, where the
# gradients become very small.
attention_weights = attention_scores.div(keys.size(-1) ** 0.5).softmax(dim=-1)
attention_weights

tensor([3.8234e-01, 1.3985e-08, 6.1762e-01, 4.4221e-05])

In [38]:
# Sanity check: the softmax output must add up to 1
torch.sum(attention_weights)

tensor(1.)

In [39]:
# Context vector = attention-weighted sum of the value rows
context_vector = torch.matmul(attention_weights, values)
context_vector

tensor([-1.6979,  0.1464, -2.0583, -1.4895, -2.4510, -2.5201])

## Making the Simple Attention Class Using Parameter 

In [40]:
import torch.nn as nn

In [None]:
# Simple attention class, v1: weight matrices stored as raw nn.Parameter tensors
class SimpleAttention(nn.Module):
    """Single-head scaled dot-product self-attention using raw weight matrices.

    Queries, keys and values are produced by multiplying the input with three
    ``(d_in, d_out)`` matrices whose entries are drawn from ``torch.rand``.

    Args:
        d_in: Size of each input embedding vector.
        d_out: Size of each query/key/value vector (and of each context vector).
        requires_grad: Whether autograd should track the weight matrices.
            Defaults to ``False`` (frozen weights, outputs carry no
            ``grad_fn``); pass ``True`` to make the weights trainable.
    """

    def __init__(self, d_in, d_out, requires_grad=False):
        super().__init__()
        # Created in the order query, key, value so that seeded runs draw the
        # same random numbers for the same matrix.
        self.W_q = nn.Parameter(torch.rand(d_in, d_out), requires_grad=requires_grad)
        self.W_k = nn.Parameter(torch.rand(d_in, d_out), requires_grad=requires_grad)
        self.W_v = nn.Parameter(torch.rand(d_in, d_out), requires_grad=requires_grad)

    def forward(self, x):
        """Compute one context vector per input embedding.

        Args:
            x: Embedding vectors of shape ``(num_tokens, d_in)`` or, batched,
                ``(..., num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        # Project the inputs into query, key and value space
        queries = x @ self.W_q
        keys = x @ self.W_k
        values = x @ self.W_v
        # Swap only the last two axes. `.T` reverses *all* axes, which is the
        # same thing for 2-D input but breaks (and is deprecated) for batches.
        scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_out) before softmax; each row of weights sums to 1
        weights = torch.softmax(scores / keys.shape[-1] ** 0.5, dim=-1)
        # Context vectors are the attention-weighted sums of the value rows
        context = weights @ values
        return context


In [45]:
# Example usage of the class:
# instantiate it with 8-dimensional inputs and 6-dimensional outputs
simple = SimpleAttention(d_in=8, d_out=6)


In [46]:
# The weight matrices assigned in __init__ are ordinary attributes of the
# instance, so the key matrix can be inspected directly.
simple.W_k

Parameter containing:
tensor([[0.5698, 0.6263, 0.1545, 0.7109, 0.7294, 0.6444],
        [0.0787, 0.1508, 0.1867, 0.4702, 0.6483, 0.2031],
        [0.2645, 0.1886, 0.3463, 0.8850, 0.2874, 0.0721],
        [0.7814, 0.6933, 0.3550, 0.8331, 0.1792, 0.4322],
        [0.9339, 0.4995, 0.7530, 0.8626, 0.4893, 0.0855],
        [0.4245, 0.7784, 0.4897, 0.9567, 0.5127, 0.7614],
        [0.5993, 0.0281, 0.0788, 0.2597, 0.8530, 0.6056],
        [0.6885, 0.6482, 0.7043, 0.4383, 0.2465, 0.1752]])

In [None]:
# Calling the module runs its forward pass on all rows of `inputs` at once and
# returns one context vector per input embedding.
context_vectors = simple(inputs)
context_vectors

tensor([[-1.3660, -1.4908, -1.7030, -1.0838, -1.0200, -0.7900],
        [ 0.7598,  2.2223,  1.3351,  2.5333,  1.2145,  0.3361],
        [-1.4457, -1.5578, -1.7237, -1.0475, -1.0540, -0.8276],
        [-0.2191,  0.1632, -0.1347,  0.4322, -0.0661, -0.1381]])

## Making the Simple Attention Class Again using nn.Linear

In [50]:
# Simple attention class, v2: the projections are nn.Linear layers.
# With bias=False an nn.Linear is still just a matrix multiplication, but the
# layer manages its own weight and uses PyTorch's default Linear
# initialisation instead of torch.rand.

class SimpleAttention(nn.Module):
    """Single-head scaled dot-product self-attention built from nn.Linear layers.

    Args:
        d_in: Size of each input embedding vector.
        d_out: Size of each query/key/value vector (and of each context vector).
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # Bias-free linear layers act as the query, key and value weight matrices
        self.W_q = nn.Linear(d_in, d_out, bias=False)
        self.W_k = nn.Linear(d_in, d_out, bias=False)
        self.W_v = nn.Linear(d_in, d_out, bias=False)

    def forward(self, x):
        """Compute one context vector per input embedding.

        Args:
            x: Embedding vectors of shape ``(num_tokens, d_in)`` or, batched,
                ``(..., num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(..., num_tokens, d_out)``.
        """
        # Project the inputs into query, key and value space.
        # Still just multiplying matrices: each layer computes x @ weight.T.
        queries = self.W_q(x)
        keys = self.W_k(x)
        values = self.W_v(x)
        # Swap only the last two axes. `.T` reverses *all* axes, which is the
        # same thing for 2-D input but breaks (and is deprecated) for batches.
        scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_out) before softmax; each row of weights sums to 1
        weights = torch.softmax(scores / keys.shape[-1] ** 0.5, dim=-1)
        # Context vectors are the attention-weighted sums of the value rows
        context = weights @ values
        return context


In [51]:
# Re-create the instance so that it uses the nn.Linear-based class defined above
simple = SimpleAttention(d_in=8, d_out=6)

In [52]:
# Same call as with v1, now through the nn.Linear-based class. The numbers
# differ because the weights were freshly (randomly) initialised, and the
# result carries a grad_fn because nn.Linear weights track gradients by default.
context_vectors = simple(inputs)
context_vectors

tensor([[-0.0766, -0.0609, -0.1546, -0.2535,  0.1187, -0.2948],
        [-0.3778,  0.0255, -0.4080, -0.2293,  0.4989, -0.7135],
        [-0.0322, -0.1192, -0.1979, -0.1672,  0.1092, -0.2373],
        [-0.3287, -0.0266, -0.4711, -0.1897,  0.5315, -0.6698]],
       grad_fn=<MmBackward0>)