# 2.2 - Tokenizing text
I am using "The Pit and the Pendulum" by Edgar Allan Poe (which is in the public domain) instead of "The Verdict" to get some novel results.

In [43]:
# Load the full short story into memory as a single string
with open("pit-and-pendulum.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
# Sanity check: report the character count and preview the first 99 characters
print(f"total number of character: {len(raw_text)}")
print(raw_text[:99])

total number of character: 33679
I was sick—sick unto death with that long agony; and when they at length unbound me, and I was perm


In [44]:
# Tokenize a sample sentence by splitting on single whitespace characters.
# The pattern is a capture group, so re.split keeps each separator in the result.
import re

text = "Welcome, everyone. My name, is Cheng Guo."
result = re.compile(r'(\s)').split(text)
print(result)

['Welcome,', ' ', 'everyone.', ' ', 'My', ' ', 'name,', ' ', 'is', ' ', 'Cheng', ' ', 'Guo.']


In [45]:
# Split on whitespaces, commas, and periods
# The whole pattern is a capture group, so the separators are kept in the result.
# When punctuation is directly followed by whitespace, two separators match
# back to back and re.split emits an empty string between them.
result = re.split(r'([,.]|\s)', text)
print(result)

['Welcome', ',', '', ' ', 'everyone', '.', '', ' ', 'My', ' ', 'name', ',', '', ' ', 'is', ' ', 'Cheng', ' ', 'Guo', '.', '']


In [46]:
# Drop empty strings and whitespace-only items: item.strip() is falsy for both
result = [item for item in result if item.strip()]
print(result)

['Welcome', ',', 'everyone', '.', 'My', 'name', ',', 'is', 'Cheng', 'Guo', '.']


In [47]:
# Extend the split pattern to more punctuation marks, the double hyphen and the em dash,
# then strip every piece and keep only the non-empty ones
text = "Wow, everyone! Am I Cheng--Guo?"
result = list(filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|—|\s)', text))))
print(result)

['Wow', ',', 'everyone', '!', 'Am', 'I', 'Cheng', '--', 'Guo', '?']


In [48]:
# Processing the short story
# Apply the punctuation-aware split to the full text, then strip each piece
# and drop the empty / whitespace-only ones
preprocessed = re.split(r'([,.:;?_!"()\']|--|—|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))  # total number of tokens
print(preprocessed[:30])  # preview of the first 30 tokens

7061
['I', 'was', 'sick', '—', 'sick', 'unto', 'death', 'with', 'that', 'long', 'agony', ';', 'and', 'when', 'they', 'at', 'length', 'unbound', 'me', ',', 'and', 'I', 'was', 'permitted', 'to', 'sit', ',', 'I', 'felt', 'that']


# 2.3 - Converting tokens into token IDs

In [49]:
# List and sort all unique tokens
# set() removes duplicates; sorted() fixes the order so the IDs assigned later are deterministic
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1680


In [50]:
# Build the vocabulary: each unique token maps to its index in the sorted token list
vocab = dict(zip(all_words, range(len(all_words))))
# Preview the first 51 entries (IDs 0 through 50)
for item in list(vocab.items())[:51]:
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('.', 6)
(':', 7)
(';', 8)
('?', 9)
('A', 10)
('After', 11)
('Agitation', 12)
('All', 13)
('Amid', 14)
('An', 15)
('And', 16)
('Another', 17)
('Arousing', 18)
('As', 19)
('At', 20)
('Avoiding', 21)
('But', 22)
('By', 23)
('Could', 24)
('Days', 25)
('Death', 26)
('Demon', 27)
('Down', 28)
('Dreading', 29)
('During', 30)
('Else', 31)
('Even', 32)
('Fate', 33)
('Fool', 34)
('For', 35)
('Forth', 36)
('Free', 37)
('French', 38)
('From', 39)
('General', 40)
('Groping', 41)
('Had', 42)
('Hades', 43)
('Having', 44)
('He', 45)
('How', 46)
('I', 47)
('In', 48)
('Inch', 49)
('Inquisition', 50)


In [51]:
# Implementing a simple text tokenizer
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Text is split on whitespace and a fixed set of punctuation marks. Every
    resulting token must already be present in the vocabulary.
    """

    def __init__(self, vocab):
        """Keep the token -> ID mapping and derive the inverse ID -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Raises:
            KeyError: If a token is not in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|—|\s)', text):
            token = piece.strip()
            if token:  # skip empty and whitespace-only pieces
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        whitespace in front of the listed punctuation removed.

        Raises:
            KeyError: If an ID is not in the vocabulary.
        """
        tokens = [self.int_to_str[i] for i in ids]
        joined = " ".join(tokens)
        # Only whitespace *before* a punctuation mark is removed, not after it
        return re.sub(r'\s+([,.:;?_!"()\']|--|—|\s)', r'\1', joined)

In [52]:
# Try the tokenizer
# Build a tokenizer from the story vocabulary and encode a passage of text
tokenizer = SimpleTokenizerV1(vocab)
text = """I saw that some ten or twelve vibrations would bring the steel 
            in actual contact with my robe—and with this observation there suddenly 
            came over my spirit all the keen, collected calmness of despair."""
ids = tokenizer.encode(text)
print(ids)

[47, 1259, 1464, 1346, 1455, 1043, 1542, 1600, 1668, 269, 1465, 1377, 803, 118, 366, 1656, 979, 1237, 1679, 151, 1656, 1479, 1017, 1470, 1413, 290, 1054, 979, 1361, 136, 1465, 848, 5, 342, 289, 1030, 439, 6]


In [53]:
# Convert back
# decode() only removes whitespace in front of punctuation, so a space is left
# after the em dash ("robe— and") in the reconstructed text
print(tokenizer.decode(ids))

I saw that some ten or twelve vibrations would bring the steel in actual contact with my robe— and with this observation there suddenly came over my spirit all the keen, collected calmness of despair.


In [54]:
# New text (modified code)
# SimpleTokenizerV1.encode looks every token up directly in its vocabulary, so a
# word that is not in the story surfaces as a KeyError. Catch only that exception:
# the message below is specific to a missing word, and any other error should propagate.
try:
    text = "Hello, do you like tea?"
    print(tokenizer.encode(text))
except KeyError as e:
    print(f"This error is a {type(e).__name__} on the word {str(e)}")

This error is a KeyError on the word 'Hello'


# 2.4 - Adding special context tokens

In [55]:
# Rebuild the vocabulary with two special tokens appended after the sorted story tokens
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

1682


In [56]:
# Quick check: the two special tokens should be the last entries of the vocabulary.
# The loop index was never used, so iterate over the last five items directly.
for item in list(vocab.items())[-5:]:
    print(item)

('yawning', 1677)
('yet', 1678)
('—', 1679)
('<|endoftext|>', 1680)
('<|unk|>', 1681)


In [57]:
# New text tokenizer
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace and a fixed set of punctuation marks. Tokens
    missing from the vocabulary are replaced by the ``<|unk|>`` entry, which
    the vocabulary is expected to contain.
    """

    def __init__(self, vocab):
        """Keep the token -> ID mapping and derive the inverse ID -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                ``<|unk|>`` entry.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|—|\s)', text):
            token = piece.strip()
            if not token:  # skip empty and whitespace-only pieces
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        whitespace in front of the listed punctuation removed.

        Raises:
            KeyError: If an ID is not in the vocabulary.
        """
        tokens = [self.int_to_str[i] for i in ids]
        joined = " ".join(tokens)
        # Only whitespace *before* a punctuation mark is removed, not after it
        return re.sub(r'\s+([,.:;?_!"()\']|--|—|\s)', r'\1', joined)

In [58]:
# Try the new tokenizer
# Join two unrelated sentences with the <|endoftext|> marker between them
text1 = "Hello, do you like tea?"
text2 = "This is a story about pit and pendulum."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> This is a story about pit and pendulum.


In [59]:
# Tokenize the sample text
# Words that are not in the story vocabulary are encoded as the <|unk|> ID
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

[1681, 5, 474, 1681, 881, 1681, 9, 1680, 85, 837, 103, 1681, 104, 1099, 151, 1077, 6]


In [60]:
# Detokenize
# Round trip: unknown words come back as the literal "<|unk|>" placeholder
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do <|unk|> like <|unk|>? <|endoftext|> This is a <|unk|> about pit and pendulum.


# 2.5 - Byte pair encoding (BPE)

In [61]:
# Install Tiktoken
# !uv pip install tiktoken

In [62]:
# Import Tiktoken
# importlib.metadata.version looks up the installed version of a package by name
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.11.0


In [63]:
# Instantiate the BPE tokenizer
# NOTE: this rebinds `tokenizer`, which previously held a SimpleTokenizerV2 instance
tokenizer = tiktoken.get_encoding("gpt2")
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    " of someunknownPlace."
)
# allowed_special opts in to treating the literal "<|endoftext|>" text as a special token
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [64]:
# Convert back
# Decode the token IDs into a string and print it
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


## Exercise 2.1 - Page 34

In [65]:
# BPE for unknown words
# Encode two made-up words with the BPE tokenizer to see how they are split
unknown = "Akwirw ier"
code = tokenizer.encode(unknown)
print(code)

[33901, 86, 343, 86, 220, 959]


In [66]:
# Decode every token ID on its own to show which piece of text it stands for
for integer in code:
    print(f'"{tokenizer.decode([integer])}" -> {integer}')

"Ak" -> 33901
"w" -> 86
"ir" -> 343
"w" -> 86
" " -> 220
"ier" -> 959


In [67]:
# Reconstruction
# Decoding the full ID sequence gives back the original made-up string
unknown_str = tokenizer.decode(code)
print(unknown_str)

Akwirw ier


# 2.6 - Data sampling with a sliding window

In [68]:
# Encode the short story with the BPE tokenizer
# Re-read the story from disk, then encode it with the `tokenizer` bound earlier
with open("pit-and-pendulum.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()
encode_text = tokenizer.encode(raw_text)
print(len(encode_text))  # number of BPE tokens in the story

7702


In [69]:
# Create the input-target pairs
encode_sample = encode_text[50:]  # skip the first 50 tokens
context_size = 4  # number of tokens in each input
x = encode_sample[:context_size]
y = encode_sample[1:context_size + 1]  # same window shifted one token to the right
print(f"x: {x}")
print(f"y:      {y}")

x: [286, 7310, 18702, 2288]
y:      [7310, 18702, 2288, 543]


In [70]:
# Create the next-word prediction tasks
# For each i, the first i tokens are the context and the token at index i is the target
for i in range(1, context_size + 1):
    context = encode_sample[:i]
    desired = encode_sample[i]
    print(context, "----->", desired)

[286] -----> 7310
[286, 7310] -----> 18702
[286, 7310, 18702] -----> 2288
[286, 7310, 18702, 2288] -----> 543


In [71]:
# Repeat with decoded text
# Same context/target pairs as before, decoded back to text for readability
for i in range(1, context_size + 1):
    context = encode_sample[:i]
    desired = encode_sample[i]
    # decode() takes a list of IDs, so the single target ID is wrapped in a list
    print(tokenizer.decode(context), "----->", tokenizer.decode([desired]))

 of ----->  distinct
 of distinct ----->  accent
 of distinct accent -----> uation
 of distinct accentuation ----->  which


In [72]:
# Efficient data loader implementation
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID chunks.

    The text is tokenized once. Each sample is a window of ``max_length``
    token IDs; its target is the same window shifted one token to the right.
    A text with at most ``max_length`` tokens yields an empty dataset.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object whose ``encode(text)`` returns a list of token IDs.
        max_length: Number of tokens per chunk; must be >= 1.
        stride: Step between the starts of consecutive windows; must be >= 1.
        allowed_special: Optional. When not ``None`` it is forwarded as
            ``tokenizer.encode(text, allowed_special=...)``, e.g.
            ``{"<|endoftext|>"}`` for a tiktoken encoding, so texts that
            contain special-token markers can be encoded. When ``None``
            (default) the tokenizer is called as ``encode(text)``.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, text, tokenizer, max_length, stride, allowed_special=None):
        # Fail early with a clear message instead of silently producing an
        # empty dataset (negative stride) or empty chunks (max_length == 0).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Only pass the keyword when requested, so tokenizers whose encode()
        # takes just the text keep working.
        if allowed_special is None:
            token_ids = tokenizer.encode(text)
        else:
            token_ids = tokenizer.encode(text, allowed_special=allowed_special)

        # Stop max_length tokens before the end so every target chunk is full
        for start in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[start:start + max_length]
            target_chunk = token_ids[start + 1:start + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) chunks."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of chunk ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [73]:
# Create the data loader
def create_dataloader_v1(text, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    Args:
        text: Raw text; it is tokenized with the "gpt2" tiktoken encoding.
        batch_size: Number of samples per batch.
        max_length: Number of tokens per sample, passed to GPTDatasetV1.
        stride: Step between consecutive samples, passed to GPTDatasetV1.
        shuffle: Forwarded to DataLoader.
        drop_last: Forwarded to DataLoader.
        num_workers: Forwarded to DataLoader.

    Returns:
        A DataLoader over a GPTDatasetV1 built from ``text``.
    """
    bpe_tokenizer = tiktoken.get_encoding("gpt2")
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(
        GPTDatasetV1(text, bpe_tokenizer, max_length, stride),
        **loader_options,
    )

In [74]:
# Test the data loader
with open("pit-and-pendulum.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()
# batch_size = 1 with 4-token windows; stride = 1 moves the window one token at a time,
# and shuffle = False keeps the samples in text order
dataloader = create_dataloader_v1(raw_text, batch_size = 1, max_length = 4, stride = 1, shuffle = False)
data_iterator = iter(dataloader)
first_batch = next(data_iterator)  # the batch holds the input tensor and the target tensor
print(first_batch)

[tensor([[  40,  373, 6639,  960]]), tensor([[ 373, 6639,  960,   82]])]


In [75]:
# Fetch the next batch
# With stride = 1 this window starts one token after the previous one
second_batch = next(data_iterator)
print(second_batch)

[tensor([[ 373, 6639,  960,   82]]), tensor([[6639,  960,   82,  624]])]


In [76]:
# Batch size greater than 1
# stride equals max_length here, so consecutive rows of the batch do not overlap
dataloader = create_dataloader_v1(raw_text, batch_size = 8, max_length = 4, stride = 4, shuffle = False)
data_iterator = iter(dataloader)
inputs, targets = next(data_iterator)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   373,  6639,   960],
        [   82,   624, 12722,  1918],
        [  351,   326,   890, 35358],
        [   26,   290,   618,   484],
        [  379,  4129,   555,  7784],
        [  502,    11,   290,   314],
        [  373, 10431,   284,  1650],
        [   11,   314,  2936,   326]])

Targets:
 tensor([[  373,  6639,   960,    82],
        [  624, 12722,  1918,   351],
        [  326,   890, 35358,    26],
        [  290,   618,   484,   379],
        [ 4129,   555,  7784,   502],
        [   11,   290,   314,   373],
        [10431,   284,  1650,    11],
        [  314,  2936,   326,   616]])


## Exercise 2.2 - Page 39

In [77]:
# Exercise: compare data loaders with different context sizes (max_length) and strides

# max_length = 2, stride = 2: consecutive windows do not overlap
dataloader_2 = create_dataloader_v1(raw_text, batch_size=1, max_length=2, stride=2, shuffle=False)
data_iterator_2 = iter(dataloader_2)
first_batch_2, second_batch_2 = next(data_iterator_2), next(data_iterator_2)
print(first_batch_2, second_batch_2, sep="\n")

# max_length = 8, stride = 2: consecutive windows overlap by six tokens
dataloader_3 = create_dataloader_v1(raw_text, batch_size=1, max_length=8, stride=2, shuffle=False)
data_iterator_3 = iter(dataloader_3)
first_batch_3, second_batch_3 = next(data_iterator_3), next(data_iterator_3)
print(first_batch_3, second_batch_3, sep="\n")

[tensor([[ 40, 373]]), tensor([[ 373, 6639]])]
[tensor([[6639,  960]]), tensor([[960,  82]])]
[tensor([[   40,   373,  6639,   960,    82,   624, 12722,  1918]]), tensor([[  373,  6639,   960,    82,   624, 12722,  1918,   351]])]
[tensor([[ 6639,   960,    82,   624, 12722,  1918,   351,   326]]), tensor([[  960,    82,   624, 12722,  1918,   351,   326,   890]])]


# 2.7 - Creating token embeddings

In [78]:
# Small embedding table: 6 possible token IDs, each mapped to a 3-dimensional vector
vocab_size, output_dim = 6, 3
input_ids = torch.tensor([2, 3, 5, 1])

# Seed the generator right before creating the layer so the printed weights are reproducible
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [79]:
# Test embedding vector
# Looking up ID 3 returns the row at index 3 of the weight matrix
print(embedding_layer(torch.tensor([3]))) # this is identical to the fourth row above

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [80]:
# Convert all four input IDs
# One output row per input ID, taken from the weight matrix row with that index
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


# 2.8 - Encoding word positions

In [81]:
# Use larger embedding sizes
vocab_size = 50257 # number of rows in the embedding table; sized for the BPE tokenizer
output_dim = 256  # length of each embedding vector
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
max_length = 4
# stride equals max_length, so the rows of the batch are non-overlapping windows
dataloader = create_dataloader_v1(raw_text, batch_size = 8, max_length = max_length, stride = max_length, shuffle = False)
data_iterator = iter(dataloader)
inputs, targets = next(data_iterator)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   373,  6639,   960],
        [   82,   624, 12722,  1918],
        [  351,   326,   890, 35358],
        [   26,   290,   618,   484],
        [  379,  4129,   555,  7784],
        [  502,    11,   290,   314],
        [  373, 10431,   284,  1650],
        [   11,   314,  2936,   326]])

Inputs shape:
 torch.Size([8, 4])


In [82]:
# Embed the token IDs
# Every token ID in the batch is replaced by its output_dim-sized vector
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [83]:
# Absolute positional embedding
context_length = max_length
# One embedding row per position in the context window
position_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# torch.arange(context_length) supplies the position indices 0 .. context_length - 1
position_embeddings = position_embedding_layer(torch.arange(context_length))
print(position_embeddings.shape)

torch.Size([4, 256])


In [84]:
# Add the position embeddings to the token embeddings
# The [4, 256] position embeddings are broadcast across the batch dimension of the
# [8, 4, 256] token embeddings, so every sample gets the same position vectors
input_embeddings = token_embeddings + position_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
