# Handle Text

## Text Tokenization

### Download and Read the Verdict
Download the Verdict as a text file, then read the file content.

In [52]:
import urllib.request
from ftplib import ftpcp
from msilib import type_key

# url = ("https://raw.githubusercontent.com/rasbt/"
#        "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
#        "the-verdict.txt")
# file_path = "./the-verdict.txt"
# urllib.request.urlretrieve(url, file_path)

file_path = "./the-verdict.txt"
with open(file_path, "r") as file:
    raw_text = file.read()
print(f"Total characters: {len(raw_text)}")
print(raw_text[:100])

Total characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


### Split the Text into Words Using Regex

In [53]:
import re
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


### Convert the Words to Unique IDs


In [54]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

vocab = {token:integer for integer,token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

1130
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


### Simple Tokenizer
Implement a encode function to convert a text into a sequence of token IDs.

Implement a decode function to convert a sequence of token IDs back into a text.

In [55]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.vocab = vocab
        self.inverse_vocab = {integer:token for token, integer in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        return [self.vocab[token] for token in preprocessed]

    def decode(self, ids):
        text = ' '.join([self.inverse_vocab[id] for id in ids])
        text = re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)
        return text

tokenizerV1 = SimpleTokenizerV1(vocab)
text = """It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""""
ids = tokenizerV1.encode(text)
print(ids)
decoded_text = tokenizerV1.decode(ids)
print(decoded_text)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


### Tokenizer V2
We want to handle the unknown words in the text.
We will add a special token `<|unk|>` to represent the unknown words.

And add a token `<|endoftext|>` to represent the end of the text, thus we can handle multiple texts that from different sources.

In [56]:
# add the special tokens to the vocab
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(all_tokens)}

class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.vocab = vocab
        self.inverse_vocab = {integer:token for token, integer in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        return [self.vocab.get(token, self.vocab["<|unk|>"]) for token in preprocessed]

    def decode(self, ids):
        text = ' '.join([self.inverse_vocab[id] for id in ids])
        text = re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)
        return text

tokenizerV2 = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
ids = tokenizerV2.encode(text)
print(ids)
decoded_text = tokenizerV2.decode(ids)
print(decoded_text)


[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


### BPE Tokenizer
BPE could handle the unknown words in the text by split it into smaller tokens.

BPE merge the most frequent pairs of tokens into a new token to make the vocabulary.

In [57]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
text = "Aiwerkn oker"
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ids)
for id in ids:
    print(id, tokenizer.decode([id]))
print(tokenizer.decode(ids))

[32, 14246, 9587, 77, 267, 6122]
32 A
14246 iw
9587 erk
77 n
267  o
6122 ker
Aiwerkn oker


## Use Sliding Window to Create Input-Output Pairs
### Read and Encode the Verdict

In [58]:
file_path = "./the-verdict.txt"
with open(file_path, "r") as file:
    raw_text = file.read()
enc_text = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})
print(len(enc_text))

5145


### Create Input-Output Pairs
The input will be a sequence of tokens.

The output will be the next token in the sequence.

In [59]:
enc_sample = enc_text[:10]
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

for i in range(1, context_size + 1):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(context, "->", target)
    print(tokenizer.decode(context), "->", tokenizer.decode([target]))

[40] -> 367
I ->  H
[40, 367] -> 2885
I H -> AD
[40, 367, 2885] -> 1464
I HAD ->  always
[40, 367, 2885, 1464] -> 1807
I HAD always ->  thought


### DataSet and DataLoader
We use the sliding window to create the input-output pairs.

x is the input sequence of tokens, specifically `text[sample_start: sample_start + context_size]`.

y is the target sequence of tokens, specifically `text[sample_start + 1: sample_start + context_size + 1]`.

We could generate the input-output pairs from x and y as forementioned.

DataSet is to store the tokenized text. DataLoader is to load the data in batches.

In [60]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataSetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size = 4, max_length = 256, stride = 128, shuffle = True, drop_last = True, num_workers = 0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataSetV1(txt, tokenizer, max_length, stride)
    return DataLoader(dataset, batch_size = batch_size, shuffle = shuffle, drop_last = drop_last, num_workers = num_workers)

file_path = "./the-verdict.txt"
with open(file_path, "r") as file:
    raw_text = file.read()

dataLoader = create_dataloader_v1(raw_text, batch_size = 8, max_length = 4, stride = 4, shuffle = False, drop_last = True, num_workers = 0)
data_iter = iter(dataLoader)
first_batch = next(data_iter)
print(first_batch)


[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


## Embedding Layer

### A Simple Embedding Layer
As a simple example, we create an embedding layer.

The embedding layer has two parameters:
1. The vocabulary size, which is the number of unique tokens in the dataset.
2. The embedding dimension, which is the size of the vector representation for each token.

The embedding layer receives a tensor of token indices and returns a tensor of token embeddings.
For example, the output dim is 5 and the num of tokens is 32, then it will return a tensor of shape (5, 32).

In [61]:
vocab_size = tokenizer.n_vocab
output_dim = 256
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

print(embedding_layer(torch.tensor([3])))

tensor([[ 9.3597e-03,  3.0292e-01, -7.8245e-01,  3.1204e-01,  2.4184e-01,
          1.0521e+00,  9.5445e-01,  4.4050e-01,  5.6805e-02,  2.2422e-01,
         -1.0624e-01,  8.0174e-01, -2.0032e+00,  2.1283e-01, -7.8109e-02,
         -1.2559e+00,  6.9840e-01, -1.3709e+00, -8.4167e-01, -1.0686e+00,
          2.1349e+00, -1.3881e-01,  1.1930e+00,  1.1629e+00, -5.7803e-01,
          7.9315e-01,  1.2979e+00, -5.8399e-02,  7.5043e-01,  6.8632e-01,
          4.6644e-01,  1.7545e+00, -7.6113e-02,  5.9238e-01,  1.0268e+00,
         -6.7041e-01,  1.7584e-03,  8.9729e-01,  5.1414e-01,  1.3069e+00,
          9.8313e-01,  1.4669e+00, -8.8847e-01, -2.3954e+00, -1.3900e+00,
          1.7364e+00,  1.6687e-01, -1.0436e+00,  1.7365e+00,  4.9998e-02,
         -1.1001e-01, -5.2164e-01,  1.3541e+00, -6.4290e-01,  1.0547e+00,
         -3.1463e-01, -3.6410e-01, -1.7926e-02, -9.9834e-01,  4.5447e-01,
         -3.7145e-01, -4.1177e-01,  1.2277e+00, -3.2087e-01,  1.3257e+00,
          1.0080e+00,  4.1322e-01,  1.

### Positional Embedding
Fixed embedding cannot capture the position of tokens in a sequence.

Since the position of tokens does matter with the meaning of the sequence, we could use positional embedding to add the position information to the token embeddings.

There are two positional embedding methods:
1. Absolute positional embedding
2. Relative positional embedding

In [62]:
max_length = 4
dataLoader = create_dataloader_v1(raw_text, batch_size = 8, max_length = max_length, stride = max_length, shuffle = False, drop_last = True, num_workers = 0)
data_iter = iter(dataLoader)
inputs, targets = next(data_iter)
print("Input shape:", inputs.shape)

token_embeddings = embedding_layer(inputs)
print("Input token embeddings shape:", token_embeddings.shape)

context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print("Positional embeddings shape:", pos_embeddings.shape)

# add the positional embeddings to the token embeddings
input_embeddings = token_embeddings + pos_embeddings
print("Input embeddings shape with positional embeddings:", input_embeddings.shape)


Input shape: torch.Size([8, 4])
Input token embeddings shape: torch.Size([8, 4, 256])
Positional embeddings shape: torch.Size([4, 256])
Input embeddings shape with positional embeddings: torch.Size([8, 4, 256])


# Attention Mechanism
The attention mechanism means for each token in sequence, it will add a weighted sum of all tokens in the sequence, thus it could .

We will compute the attention score for each token in the sequence.

## Simple Attention Mechanism Without Trainable Parameters

In [63]:
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],
    [0.55, 0.82, 0.63],
    [0.22, 0.18, 0.05],
    [0.76, 0.59, 0.92]]
)

# we assume that the attention score is the dot product of the query and the key
# we will query the first token
# attention score for the first token is 0.43*0.43 + 0.15*0.15 + 0.89*0.89
# attention score for the second token is 0.55*0.43 + 0.82*0.15 + 0.63*0.89
# attention score for the third token is 0.22*0.43 + 0.18*0.15 + 0.05*0.89
# attention score for the fourth token is 0.76*0.43 + 0.59*0.15 + 0.92*0.89
query = inputs[0]
attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(query, x_i)

print("query.shape:", query.shape)
print("attn_scores_2.shape:", attn_scores_2.shape)
print(attn_scores_2)


# then we normalize the attention scores to make them sum up to 1
def softmax_naive(x):
    exp_x = torch.exp(x)
    return exp_x / torch.sum(exp_x)

attn_weights_naive = softmax_naive(attn_scores_2)
print("attn_weights_naive:", attn_weights_naive)
print("sum of attn_weights_naive:", torch.sum(attn_weights_naive))

attn_weights = torch.softmax(attn_scores_2, dim = 0)
print("attn_weights:", attn_weights)
print("sum of attn_weights:", torch.sum(attn_weights))

# after we get the attention weights, we can compute the weighted sum of the values
# the weighted sum of the values is the output of the attention mechanism
context_vec_first = torch.zeros_like(query)
for i, x_i in enumerate(inputs):
    context_vec_first += attn_weights[i] * x_i

print("context_vec of the first token:", context_vec_first)

# compute all context vectors
attn_scores = torch.empty(inputs.shape[0], inputs.shape[0])
for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, x_j)
print("attn_scores:", attn_scores)

attn_scores = inputs @ inputs.T
print("attn_scores:", attn_scores)

# then we normalize the attention scores, dim -1 means normalize along the last dimension
attn_weights = torch.softmax(attn_scores, dim = -1)
print("attn_weights:", attn_weights)
print("sum of attn_weights:", torch.sum(attn_weights, dim = -1))


query.shape: torch.Size([3])
attn_scores_2.shape: torch.Size([4])
tensor([0.9995, 0.9202, 0.1661, 1.2341])
attn_weights_naive: tensor([0.2760, 0.2550, 0.1200, 0.3490])
sum of attn_weights_naive: tensor(1.)
attn_weights: tensor([0.2760, 0.2550, 0.1200, 0.3490])
sum of attn_weights: tensor(1.0000)
context_vec of the first token: tensor([0.5506, 0.4780, 0.7334])
attn_scores: tensor([[0.9995, 0.9202, 0.1661, 1.2341],
        [0.9202, 1.3718, 0.3001, 1.4814],
        [0.1661, 0.3001, 0.0833, 0.3194],
        [1.2341, 1.4814, 0.3194, 1.7721]])
attn_scores: tensor([[0.9995, 0.9202, 0.1661, 1.2341],
        [0.9202, 1.3718, 0.3001, 1.4814],
        [0.1661, 0.3001, 0.0833, 0.3194],
        [1.2341, 1.4814, 0.3194, 1.7721]])
attn_weights: tensor([[0.2760, 0.2550, 0.1200, 0.3490],
        [0.2057, 0.3231, 0.1106, 0.3605],
        [0.2364, 0.2703, 0.2176, 0.2756],
        [0.2276, 0.2914, 0.0912, 0.3898]])
sum of attn_weights: tensor([1.0000, 1.0000, 1.0000, 1.0000])


## Oringinal Transformer
There are three trainable matrix:
1. Query matrix
2. Key matrix
3. Value matrix

We will use the query matrix to compute the attention score for each token in the sequence.

### Calculation of Transformer
$$
queries = inputs @ W_query
$$
$$
keys = inputs @ W_key
$$
$$
values = inputs @ W_value
$$


The attention score from token i to token j equals queries[i] dot with keys[j].

We use softmax to nomalize the attention score to attention weight.

Then we use the weight[i, j] multiple with values[j] to get the contex vrctor[i, j].

In [64]:
input_dim = 3
output_dim = 2

torch.manual_seed(123)
W_query = torch.randn(input_dim, output_dim)
W_key = torch.randn(input_dim, output_dim)
W_value = torch.randn(input_dim, output_dim)

queries = inputs @ W_query
keys = inputs @ W_key
values = inputs @ W_value
print("queries:", queries)
print("keys:", keys)
print("values:", values)

print("queries.shape:", queries.shape)
print("keys.shape", keys.shape)
print("values.shape", values.shape)

query_2 = queries[1]
keys_2 = keys[1]
attn_scores_2 = query_2 @ keys.T

dim_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2 / dim_k**0.5, dim = -1)
print("attn_weights_2:", attn_weights_2)

context_vec_2 = attn_weights_2 @ values
print("context_vec_2", context_vec_2)


queries: tensor([[-1.1686e+00,  2.0194e-01],
        [-1.1185e+00,  8.9658e-04],
        [-1.5090e-01, -6.3319e-03],
        [-1.4040e+00,  1.4216e-01]])
keys: tensor([[-0.1823, -0.6888],
        [-0.1367, -0.7505],
        [-0.1451, -0.2052],
        [-0.3544, -0.9974]])
values: tensor([[ 0.1196, -0.3566],
        [ 0.3942,  0.6054],
        [ 0.1133,  0.2772],
        [ 0.3512,  0.2643]])
queries.shape: torch.Size([4, 2])
keys.shape torch.Size([4, 2])
values.shape torch.Size([4, 2])
attn_weights_2: tensor([0.2450, 0.2363, 0.2380, 0.2807])
context_vec_2 tensor([0.2480, 0.1958])


Then we extend the calculation to the whole token serial.

In [65]:
attn_scores = queries @ keys.T
print("attn_scores:", attn_scores)
attn_weights = torch.softmax(attn_scores, dim = -1)
print("attn_weights:", attn_weights)

attn_scores: tensor([[0.0740, 0.0082, 0.1281, 0.2127],
        [0.2033, 0.1522, 0.1621, 0.3955],
        [0.0319, 0.0254, 0.0232, 0.0598],
        [0.1581, 0.0852, 0.1745, 0.3557]])
attn_weights: tensor([[0.2415, 0.2261, 0.2549, 0.2774],
        [0.2426, 0.2305, 0.2328, 0.2940],
        [0.2492, 0.2476, 0.2470, 0.2562],
        [0.2401, 0.2232, 0.2441, 0.2926]])


### A Simple Self Attention Class

In [73]:
import torch.nn as nn
class SelfAttention_v1(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.randn(d_in, d_out))
        self.W_key = nn.Parameter(torch.randn(d_in, d_out))
        self.W_value = nn.Parameter(torch.randn(d_in, d_out))

    def forward(self, x):
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim = -1)
        context_vec = attn_weights @ values
        return context_vec

# use linear layer
class SelfAttention_v2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias = False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim = -1)
        context_vec = attn_weights @ values
        return context_vec

    def __str__(self):
        name = "SelfAttention_v2"
        wquery = W_query.__str__()
        wkey = W_key.__str__()
        wvalue = W_value.__str__()
        return name + "\n W_query:" + wquery + "\n W_key:" + wkey + "\n W_value:" + wvalue + "\n"

In [74]:
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(input_dim, output_dim)
print(sa_v2(inputs))

print(sa_v2.__str__())

tensor([[-0.0621,  0.0899],
        [-0.0636,  0.0900],
        [-0.0608,  0.0901],
        [-0.0637,  0.0899]], grad_fn=<MmBackward0>)
SelfAttention_v2
 W_query:tensor([[-0.1115,  0.1204],
        [-0.3696, -0.2404],
        [-1.1969,  0.2093]])
 W_key:tensor([[-0.9724, -0.7550],
        [ 0.3239, -0.1085],
        [ 0.2103, -0.3908]])
 W_value:tensor([[ 0.2350,  0.6653],
        [ 0.3528,  0.9728],
        [-0.0386, -0.8861]])



### Practice to Transfer V2 Parameter to V1
nn.Linear is the transpose of tensor, it's Tensor(output_dim, input_dim)

This is because in the matrix multiply, the matrix 2 is visited by col and col, transpose it will make the memory visit continuously, thus increase the usage of hardware cache.

In [83]:
sa_v1 = SelfAttention_v1(input_dim, output_dim)
sa_v1.W_value = nn.Parameter(sa_v2.W_value.weight.T)
sa_v1.W_query = nn.Parameter(sa_v2.W_query.weight.T)
sa_v1.W_key = nn.Parameter(sa_v2.W_key.weight.T)

print("sa_v1 output:", sa_v1(inputs))
print("sa_v2 output:", sa_v2(inputs))

sa_v1 output: tensor([[-0.0621,  0.0899],
        [-0.0636,  0.0900],
        [-0.0608,  0.0901],
        [-0.0637,  0.0899]], grad_fn=<MmBackward0>)
sa_v2 output: tensor([[-0.0621,  0.0899],
        [-0.0636,  0.0900],
        [-0.0608,  0.0901],
        [-0.0637,  0.0899]], grad_fn=<MmBackward0>)
