In [12]:
import tiktoken
from torch.utils.data import Dataset, DataLoader
import torch

In [13]:
# Load tiktoken's "gpt2" encoding; it is used below to turn the raw text into token ids.
tokenizer = tiktoken.get_encoding("gpt2")

In [14]:
# Read the whole text file into memory; the file is closed as soon as the read is done.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Tokenize the full text and report how many tokens it contains.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [15]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once and cut into windows of ``max_length`` tokens
    whose start positions advance by ``stride``. Each target window is its
    input window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """
        :param txt: raw text to tokenize
        :param tokenizer: object exposing ``encode(text, allowed_special=...)``
        :param max_length: number of tokens in every input/target window
        :param stride: offset between the starts of consecutive windows
        """
        # Encode the full text up front; "<|endoftext|>" is accepted as a special token
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that every window still has a complete shifted-by-one target
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size, max_length, stride,
                         shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window (input, target) chunks of ``txt``.

    :param txt: raw text to tokenize with tiktoken's "gpt2" encoding
    :param batch_size: number of windows per batch
    :param max_length: number of tokens per window
    :param stride: offset between the starts of consecutive windows
    :param shuffle: forwarded to ``DataLoader``
    :param drop_last: forwarded to ``DataLoader``
    :param num_workers: forwarded to ``DataLoader``
    :return: a ``DataLoader`` wrapping a ``GPTDatasetV1``
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [29]:
# 4-token windows whose start advances by one token; shuffle=False keeps them in text order
# so consecutive batches can be compared by eye.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

In [30]:
# Pull the first two batches; each batch is an [inputs, targets] pair
data_iter = iter(dataloader)
first_batch, second_batch = next(data_iter), next(data_iter)
print(first_batch, second_batch, sep="\n")

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [33]:
# 8-token windows whose start advances by two tokens, so consecutive windows overlap by six tokens.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=8, stride=2, shuffle=False)

In [34]:
# Pull the first two batches to see the effect of the longer window and larger stride
data_iter = iter(dataloader)
first_batch, second_batch = next(data_iter), next(data_iter)
print(first_batch, second_batch, sep="\n")

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]


## Self-attention

The goal of self-attention is to compute a `context vector` for each token in a sequence. A context vector is an enriched embedding of a token: it combines information about the token itself with its relationship (relevance) to every other token in the sequence.

#### Implement self-attention with untrainable weights

In [35]:
import torch

# Toy input of shape (6, 3): one row per token of the sentence named in the trailing
# comments, each row being that token's 3-dimensional vector.
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [39]:
# Raw attention scores: dot product of every input row with every input row
attention_scores = torch.matmul(inputs, inputs.T)
# Softmax over the last dimension turns each row of scores into weights that sum to 1
attention_weights = attention_scores.softmax(dim=-1)
# Each context vector is the attention-weighted sum of all input rows
context_vector = torch.matmul(attention_weights, inputs)
print("Context vector:\n", context_vector)

Context vector:
 tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


#### Implement self-attention with trainable weights

In self-attention without `trainable weights`, each `context vector` is a weighted sum over the input vectors. With `trainable weights`, each `context vector` is instead a weighted sum over the value vectors.

##### Query, Key and Value analogy to database operation
The `query` represents the token in the input sequence that the model wants to get information about, the `key` is what each token is matched against to decide how relevant it is to that query, and the `value` is the information retrieved from the tokens whose keys match.

In [66]:
import torch.nn as nn

# Let's now implement self-attention with trainable weights
# We will be using nn.Parameter to initialize and create the weights
class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention with raw ``nn.Parameter`` weight matrices."""

    def __init__(self, d_in, d_out):
        """
        :param d_in: input dimension (size of each token vector)
        :param d_out: output dimension (size of the query/key/value projections)
        """
        super().__init__()
        # Creation order (q, k, v) is kept so that seeded runs reproduce the same weights
        self.W_q = nn.Parameter(torch.randn(d_in, d_out))
        self.W_k = nn.Parameter(torch.randn(d_in, d_out))
        self.W_v = nn.Parameter(torch.randn(d_in, d_out))
        self.d_out = d_out

    def forward(self, x):
        """
        :param x: input tensor of shape (num_tokens, d_in) or (batch_size, num_tokens, d_in)
        :return: context vectors of shape (num_tokens, d_out) or (batch_size, num_tokens, d_out)
        """
        # Compute queries, keys and values
        Q = x @ self.W_q
        K = x @ self.W_k
        V = x @ self.W_v

        # Compute attention scores. Only the last two dims are swapped so that batched
        # (3-D) input works too; for 2-D input this is exactly K.T.
        attention_scores = Q @ K.transpose(-2, -1)
        # Scale by sqrt(d_out) before the softmax, then normalize each row of scores
        attention_weights = torch.softmax(attention_scores / (self.d_out ** 0.5), dim=-1)

        # Compute context vector: attention-weighted sum of the value vectors
        context_vector = attention_weights @ V

        return context_vector

- Let's test self-attention v1

In [67]:
# Fix the RNG so the randomly initialised weights (and the output below) are reproducible
torch.manual_seed(123)
d_in = 3   # size of each input token vector (columns of `inputs`)
d_out = 2  # size of the projected query/key/value vectors
sa_v1 = SelfAttention_v1(d_in, d_out)
context_vector_v1 = sa_v1(inputs)  # one context vector per input row
context_vector_v1

tensor([[0.2845, 0.4071],
        [0.2854, 0.4081],
        [0.2854, 0.4075],
        [0.2864, 0.3974],
        [0.2863, 0.3910],
        [0.2860, 0.4039]], grad_fn=<MmBackward0>)

In [68]:
# Lets now use nn.Linear to implement self-attention

class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention using ``nn.Linear`` projections."""

    def __init__(self, d_in, d_out, qkv_bais=False, qkv_bias=None):
        """
        :param d_in: input dimension (size of each token vector)
        :param d_out: output dimension (size of the query/key/value projections)
        :param qkv_bais: whether the Q/K/V projections get a bias term; this is the
            original (misspelled) name, kept so existing callers keep working
        :param qkv_bias: correctly spelled alias; when not None it takes precedence
            over ``qkv_bais``
        """
        super().__init__()
        use_bias = qkv_bais if qkv_bias is None else qkv_bias
        # Creation order (q, k, v) is kept so that seeded runs reproduce the same weights
        self.W_q = nn.Linear(d_in, d_out, bias=use_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=use_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=use_bias)
        self.d_out = d_out

    def forward(self, x):
        """
        :param x: input tensor of shape (num_tokens, d_in) or (batch_size, num_tokens, d_in)
        :return: context vectors of shape (num_tokens, d_out) or (batch_size, num_tokens, d_out)
        """
        # Compute queries, keys and values
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        # Compute attention scores. Only the last two dims are swapped so that batched
        # (3-D) input works too; for 2-D input this is exactly K.T.
        attention_scores = Q @ K.transpose(-2, -1)
        # Scale by sqrt(d_out) before the softmax, then normalize each row of scores
        attention_weights = torch.softmax(attention_scores / (self.d_out ** 0.5), dim=-1)

        # Compute context vector: attention-weighted sum of the value vectors
        context_vector = attention_weights @ V

        return context_vector

- Let's now test self-attention v2

In [69]:
# Re-seed so the nn.Linear weight initialisation (and the output below) is reproducible
torch.manual_seed(123)
sa_v2 = SelfAttention_v2(d_in, d_out)
context_vector_v2 = sa_v2(inputs)  # one context vector per input row
context_vector_v2

tensor([[-0.5337, -0.1051],
        [-0.5323, -0.1080],
        [-0.5323, -0.1079],
        [-0.5297, -0.1076],
        [-0.5311, -0.1066],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)

In [70]:
# Let's test if the two context vectors are the same (element-wise, atol=1e-6).
# v1 draws its weights with torch.randn while v2 relies on nn.Linear's own initialisation,
# so the weights differ even with the same seed and a mismatch is expected at this point.
print(torch.allclose(context_vector_v1, context_vector_v2, atol=1e-6))

False


- Let's copy the weights from v2 into v1 (transposed, because `nn.Linear` stores its weight matrix as `(d_out, d_in)`) and then check that both versions produce the same context vectors.

In [71]:
# Copy the weights of self-attention v2 into self-attention v1.
# nn.Linear stores its weight as (d_out, d_in) while v1 computes x @ W with W of
# shape (d_in, d_out), hence the transpose.
torch.manual_seed(123)  # not needed by the copy itself; kept so the RNG state seen by later cells is unchanged
# In-place copy under no_grad instead of reassigning `.data`: the Parameter objects stay
# the same and a shape mismatch raises instead of silently reshaping the parameter.
with torch.no_grad():
    sa_v1.W_q.copy_(sa_v2.W_q.weight.T)
    sa_v1.W_k.copy_(sa_v2.W_k.weight.T)
    sa_v1.W_v.copy_(sa_v2.W_v.weight.T)

context_vector_v1 = sa_v1(inputs)
context_vector_v2 = sa_v2(inputs)

# Let's test if the two context vectors are the same
print(torch.allclose(context_vector_v1, context_vector_v2, atol=1e-6))



True


In [72]:
# Show v1's context vectors, recomputed after its weights were overwritten with v2's
context_vector_v1

tensor([[-0.5337, -0.1051],
        [-0.5323, -0.1080],
        [-0.5323, -0.1079],
        [-0.5297, -0.1076],
        [-0.5311, -0.1066],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)

In [73]:
# Show v2's context vectors for a side-by-side comparison with v1's output
context_vector_v2

tensor([[-0.5337, -0.1051],
        [-0.5323, -0.1080],
        [-0.5323, -0.1079],
        [-0.5297, -0.1076],
        [-0.5311, -0.1066],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)