# Handle Text

## Text Tokenization

### Download and Read the Verdict
Download the Verdict as a text file, then read the file content.

In [20]:
import urllib.request
from ftplib import ftpcp
from msilib import type_key

import os

# Location of the-verdict.txt in the LLMs-from-scratch repository.
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "./the-verdict.txt"

# Only hit the network when the file is not already on disk, so re-running
# the notebook does not download the same file again (and works offline).
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

# Pin the encoding so decoding does not depend on the platform's default
# locale encoding.
with open(file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()
print(f"Total characters: {len(raw_text)}")
print(raw_text[:100])

Total characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


### Split the Text into Words Using Regex

In [21]:
import re
# Split on punctuation, double dashes and whitespace. The capturing group
# makes re.split keep the delimiters, so punctuation survives as tokens.
token_pattern = r'([,.:;?_!"()\']|--|\s)'
preprocessed = [
    piece.strip()
    for piece in re.split(token_pattern, raw_text)
    if piece.strip()  # drop empty strings and pure-whitespace pieces
]
print(len(preprocessed))

4690


### Convert the Words to Unique IDs


In [22]:
# Every distinct token, in sorted order; a token's position becomes its ID.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

vocab = dict(zip(all_words, range(vocab_size)))
# Preview the first 51 (token, id) entries of the vocabulary.
for entry in list(vocab.items())[:51]:
    print(entry)

1130
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


### Simple Tokenizer
Implement an `encode` function that converts text into a sequence of token IDs.

Implement a `decode` function that converts a sequence of token IDs back into text.

In [23]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Tokens that are not in the vocabulary make ``encode`` raise ``KeyError``.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> ID) and build the reverse ID -> token map."""
        self.vocab = vocab
        self.inverse_vocab = {}
        for token, integer in vocab.items():
            self.inverse_vocab[integer] = token

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        The text is split on punctuation, ``--`` and whitespace; the
        delimiters are kept as tokens, and empty or whitespace-only pieces
        are discarded.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        token_ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if token:
                token_ids.append(self.vocab[token])
        return token_ids

    def decode(self, ids):
        """Turn a sequence of token IDs back into a single string.

        Tokens are joined with single spaces, then whitespace directly in
        front of a punctuation character is removed.

        Raises:
            KeyError: if an ID is not present in the vocabulary.
        """
        tokens = [self.inverse_vocab[token_id] for token_id in ids]
        joined = ' '.join(tokens)
        return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', joined)

# Round-trip one sentence through the V1 tokenizer: text -> IDs -> text.
tokenizerV1 = SimpleTokenizerV1(vocab)
text = """It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizerV1.encode(text)
decoded_text = tokenizerV1.decode(ids)
print(ids)
print(decoded_text)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


### Tokenizer V2
We want to handle the unknown words in the text.
We will add a special token `<|unk|>` to represent the unknown words.

We also add an `<|endoftext|>` token to mark the end of a text, so that multiple texts from different sources can be joined together while keeping their boundaries visible.

In [28]:
# Extend the sorted vocabulary with the two special tokens; they are appended
# last, so they receive the two highest IDs.
special_tokens = ["<|endoftext|>", "<|unk|>"]
all_tokens = sorted(set(preprocessed)) + special_tokens
vocab = dict(zip(all_tokens, range(len(all_tokens))))

class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary passed in must contain an ``"<|unk|>"`` entry; ``encode``
    looks it up for every token it processes.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> ID) and build the reverse ID -> token map."""
        self.vocab = vocab
        self.inverse_vocab = dict((integer, token) for token, integer in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        The text is split on punctuation, ``--`` and whitespace (delimiters
        are kept as tokens; empty pieces are dropped). Tokens missing from
        the vocabulary are replaced by the ID of ``<|unk|>``.

        Raises:
            KeyError: if a token is processed and the vocabulary has no
                ``"<|unk|>"`` entry.
        """
        stripped = (piece.strip() for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text))
        return [
            self.vocab.get(token, self.vocab["<|unk|>"])
            for token in stripped
            if token
        ]

    def decode(self, ids):
        """Turn a sequence of token IDs back into a single string.

        Tokens are joined with single spaces, then whitespace directly in
        front of a punctuation character is removed.

        Raises:
            KeyError: if an ID is not present in the vocabulary.
        """
        tokens = (self.inverse_vocab[token_id] for token_id in ids)
        spaced = ' '.join(tokens)
        return re.sub(r'\s+([,.:;?_!"()\'])', r'\1', spaced)

# Join two unrelated texts with the end-of-text marker and round-trip them
# through the V2 tokenizer.
tokenizerV2 = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = f"{text1} <|endoftext|> {text2}"
ids = tokenizerV2.encode(text)
decoded_text = tokenizerV2.decode(ids)
print(ids)
print(decoded_text)


[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


### BPE Tokenizer
BPE (byte pair encoding) handles unknown words by splitting them into smaller sub-word tokens.

BPE builds its vocabulary by repeatedly merging the most frequent pairs of tokens into new tokens.

In [35]:
import tiktoken
# Load the GPT-2 encoding from tiktoken and encode a made-up phrase.
tokenizer = tiktoken.get_encoding("gpt2")
text = "Aiwerkn oker"
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ids)
# Decode each ID on its own to show the piece of text it stands for.
# The loop variable is deliberately not called `id`: at notebook top level
# that would rebind the builtin id() for every cell that runs afterwards.
for token_id in ids:
    print(token_id, tokenizer.decode([token_id]))
print(tokenizer.decode(ids))

[32, 14246, 9587, 77, 267, 6122]
32 A
14246 iw
9587 erk
77 n
267  o
6122 ker
Aiwerkn oker


## Use Sliding Window to Create Input-Output Pairs
### Read and Encode the Verdict

In [36]:
file_path = "./the-verdict.txt"
# Pin the encoding so decoding does not depend on the platform's default
# locale encoding.
with open(file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()
# Encode the whole text with the tiktoken tokenizer; "<|endoftext|>" is
# allowed to appear in the input as a special token.
enc_text = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})
print(len(enc_text))

5145


### Create Input-Output Pairs
The input will be a sequence of tokens.

The output will be the next token in the sequence.

In [38]:
enc_sample = enc_text[:10]
context_size = 4
# x is the input window; y is the same window shifted one token to the right.
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

# For every prefix length 1..context_size, show the prefix and the token
# that follows it, first as IDs and then as decoded text.
for prefix_len in range(1, context_size + 1):
    context, target = enc_sample[:prefix_len], enc_sample[prefix_len]
    print(context, "->", target)
    print(tokenizer.decode(context), "->", tokenizer.decode([target]))

[40] -> 367
I ->  H
[40, 367] -> 2885
I H -> AD
[40, 367, 2885] -> 1464
I HAD ->  always
[40, 367, 2885, 1464] -> 1807
I HAD always ->  thought


### DataSet and DataLoader
We use the sliding window to create the input-output pairs.

x is the input sequence of tokens, specifically `text[sample_start: sample_start + context_size]`.

y is the target sequence of tokens, specifically `text[sample_start + 1: sample_start + context_size + 1]`.

We can generate the input-output pairs from x and y as described above.

The `Dataset` stores the tokenized text as input/target windows. The `DataLoader` loads those windows in batches.

In [42]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataSetV1(Dataset):
    """Dataset of (input, target) token windows cut from one text.

    Each input is ``max_length`` consecutive token IDs; its target is the
    same window shifted one position to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and slice it into windows.

        Args:
            txt: text to tokenize.
            tokenizer: object whose ``encode(txt, allowed_special=...)``
                returns a sequence of token IDs.
            max_length: number of tokens in every input and target window.
            stride: distance between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window still fits
        # inside the token sequence.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128,
                         shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDataSetV1``; the remaining arguments are passed to ``DataLoader``.

    Args:
        txt: text to tokenize.
        batch_size: number of samples per batch.
        max_length: number of tokens in every input/target window.
        stride: distance between the starts of consecutive windows.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the ``GPTDataSetV1`` built from ``txt``.
    """
    dataset = GPTDataSetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(dataset, **loader_options)

file_path = "./the-verdict.txt"
# Pin the encoding so decoding does not depend on the platform's default
# locale encoding.
with open(file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

# stride == max_length gives non-overlapping windows; shuffle is off so the
# first batch corresponds to the beginning of the text.
dataLoader = create_dataloader_v1(raw_text, batch_size = 8, max_length = 4, stride = 4, shuffle = False, drop_last = True, num_workers = 0)
data_iter = iter(dataLoader)
first_batch = next(data_iter)
print(first_batch)


[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


## Embedding Layer

### A Simple Embedding Layer
As a simple example, we create an embedding layer.

The embedding layer has two parameters:
1. The vocabulary size, which is the number of distinct token IDs the layer can look up (here, the size of the tokenizer's vocabulary).
2. The embedding dimension, which is the size of the vector representation for each token.

The embedding layer receives a tensor of token indices and returns a tensor with one embedding vector per index.
For example, with an embedding dimension of 32 and an input of 5 token indices, it returns a tensor of shape (5, 32).

In [46]:
# Seed the global RNG so the randomly initialised embedding weights, and
# therefore the printed vector, are the same on every run.
torch.manual_seed(123)

vocab_size = tokenizer.n_vocab
embedding_dim = 32
embedding_layer = torch.nn.Embedding(vocab_size, embedding_dim)

# Look up the embedding vector stored for token ID 3.
print(embedding_layer(torch.tensor([3])))

tensor([[-0.7585, -0.7283, -1.0025, -0.0571,  1.0934, -0.0509, -0.6770, -0.4610,
          0.8019,  1.9966, -0.6660,  1.0782, -0.0667,  0.3993,  1.7781, -1.3685,
          1.1044,  1.5756, -0.0521, -0.0376,  0.9942,  0.4432, -0.3715,  1.8533,
         -0.0155, -0.5434, -0.5900,  1.9482, -1.4059,  0.5269,  1.5739,  0.1289]],
       grad_fn=<EmbeddingBackward0>)
