### 2.2 Tokenizing text

In [1]:
import os
import urllib.request # download dataset

# Fetch the sample text used throughout this notebook, unless a local copy
# already exists. The file name is defined once so the existence check and
# the download target cannot drift apart, and so `file_path` is bound
# whether or not the download branch runs.
file_path = "the-verdict.txt"
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

In [3]:
with open("the-verdict.txt", "r", encoding='utf-8') as f:
    raw_text = f.read()

In [4]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [5]:
len(raw_text)

20479

In [6]:
import re

# regular expression
text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [8]:
result = re.split(r'([,.]|\s)', text)


In [9]:
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [10]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [12]:
result = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
result = [item.strip() for item in result if item.strip()]
print(result)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"', 'Of', 'course', 'it', "'", 's', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'", 'way', 'up', ';', 'but', 'I', 'don', "'", 't', 'think', 'of', 'that', ',

In [13]:
preprocessed = result
len(preprocessed)

4690

### 2.3 Converting tokens into token IDs
1. Build a vocabulary: a mapping from each unique token to a unique integer ID

In [16]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

In [17]:
vocab = {token:integer for integer, token in enumerate(all_words)}

In [19]:
class SimpleTokenizerV1:
    """Minimal word/punctuation tokenizer backed by a fixed vocabulary.

    ``encode`` splits text on whitespace, the double dash ``--`` and the
    punctuation characters ``, . : ; ? _ ! " ( ) '``, then maps every token
    to its integer ID. ``decode`` maps IDs back to tokens, joins them with
    single spaces and removes the whitespace in front of punctuation marks.

    Decoding is lossy: the original whitespace is not recovered, ``_`` and
    ``--`` stay surrounded by spaces, and the space in front of opening
    quotes/parentheses is removed as well.
    """

    # The capture group makes re.split keep the delimiters, so punctuation
    # marks come out as tokens of their own.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace directly before a punctuation mark. ':' and ';' are included
    # because encode() splits them off as separate tokens too; without them
    # "a; b" would decode as "a ; b".
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: dict mapping token strings to integer IDs.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Raises:
            KeyError: if a token is not part of the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Drop the empty strings and pure-whitespace pieces the split leaves behind
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Raises:
            KeyError: if an ID is not part of the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Re-attach punctuation to the preceding token
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

In [20]:
tokenizer = SimpleTokenizerV1(vocab)

In [21]:
text =""""It's the last he painted, you know,"
            Mrs.Gisburn said with pardonable pride."""

In [22]:
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [23]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

### 2.4 Adding special context tokens
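- Special tokens added to the vocabulary: `<|unk|>` (substituted for tokens that are not in the vocabulary) and `<|endoftext|>`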

In [25]:
# Sorted unique corpus tokens, followed by the two special tokens so that
# they receive the two highest IDs.
all_token = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

vocab = dict(zip(all_token, range(len(all_token))))

In [27]:
class SimpleTokenizerV2:
    """Word/punctuation tokenizer that maps unknown tokens to ``<|unk|>``.

    ``encode`` splits text on whitespace, the double dash ``--`` and the
    punctuation characters ``, . : ; ? _ ! " ( ) '``. Tokens missing from the
    vocabulary are replaced by ``"<|unk|>"`` before the ID lookup. ``decode``
    maps IDs back to tokens, joins them with single spaces and removes the
    whitespace in front of punctuation marks.

    Decoding is lossy: the original whitespace is not recovered, unknown
    words come back as ``<|unk|>``, and ``_`` and ``--`` stay surrounded by
    spaces.
    """

    # The capture group makes re.split keep the delimiters, so punctuation
    # marks come out as tokens of their own.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace directly before a punctuation mark. ':' and ';' are included
    # because encode() splits them off as separate tokens too; without them
    # "a; b" would decode as "a ; b".
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: dict mapping token strings to integer IDs. It needs an
                ``"<|unk|>"`` entry for unknown tokens to be encodable.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Raises:
            KeyError: if an unknown token is encountered and the vocabulary
                has no ``"<|unk|>"`` entry.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Drop the empty strings and pure-whitespace pieces the split leaves behind
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [
            self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
            for token in tokens
        ]

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Raises:
            KeyError: if an ID is not part of the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Re-attach punctuation to the preceding token
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

In [28]:
tokenizer = SimpleTokenizerV2(vocab)

In [29]:
example = "Hello, do you like tea. is this --a test?"
tokenizer.encode(example)

[1131, 5, 355, 1126, 628, 975, 7, 584, 999, 6, 115, 1131, 10]

In [31]:
tokenizer.decode(tokenizer.encode(example))

'<|unk|>, do you like tea. is this -- a <|unk|>?'

### 2.5 Byte pair encoding
- break down words into subtokens

In [32]:
import tiktoken

In [33]:
tiktoken.__version__

'0.9.0'

In [34]:
tokenizer = tiktoken.get_encoding("gpt2")

In [35]:
tokenizer.encode("Hello world")

[15496, 995]

In [36]:
tokenizer.decode(tokenizer.encode("Hello world"))

'Hello world'

In [38]:
# NOTE(review): the two adjacent string literals are concatenated with no
# separator, so the text actually reads "...sunlit terracesof someunknowPlace".
# If the missing space (and the "te" / "someunknowPlace" spellings) are not
# intentional, fix the literal and re-run the cell to refresh the output.
text = (
    "Hello, do you like te? <|endoftext|> In the sunlit terraces"
    "of someunknowPlace"
)
# allowed_special permits the "<|endoftext|>" marker inside the input text
tokenizer.encode(text, allowed_special={"<|endoftext|>"})

[15496,
 11,
 466,
 345,
 588,
 573,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 1659,
 617,
 2954,
 2197,
 27271]

### 2.6 Data sampling with a sliding window

In [39]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [41]:
context_size = 4

x = enc_text[:context_size]
y = enc_text[1:context_size+1]

print(f"x: {x}")
print(f"y:     {y}")

x: [40, 367, 2885, 1464]
y:     [367, 2885, 1464, 1807]


In [42]:
import torch

In [43]:
torch.__version__

'2.7.1'

In [47]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once; a window of ``max_length`` token IDs is then
    moved over the result in steps of ``stride``. Each window is stored as an
    input tensor, and the same window shifted one position to the right is
    stored as its target tensor.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and precompute every input/target window.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(txt, allowed_special=...)``
                returns a sliceable sequence of token IDs.
            max_length: Number of token IDs per window.
            stride: Offset between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window is still full length
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        sample = (self.input_ids[idx], self.target_ids[idx])
        return sample

In [48]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a ``DataLoader`` over sliding windows of GPT-2 token IDs.

    ``txt`` is tokenized with tiktoken's ``"gpt2"`` encoding and wrapped in a
    ``GPTDatasetV1`` using ``max_length`` and ``stride``; the remaining
    arguments are passed through to ``DataLoader`` unchanged.

    Returns:
        The configured ``DataLoader``; each batch is an
        ``(inputs, targets)`` pair.
    """
    bpe_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, bpe_tokenizer, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [50]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False 
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [51]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [77]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=2, stride=2, shuffle=False 
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]
[tensor([[2885, 1464]]), tensor([[1464, 1807]])]


In [78]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=8, stride=2, shuffle=False 
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]


In [79]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=2, max_length=8, stride=2, shuffle=False 
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[   40,   367,  2885,  1464,  1807,  3619,   402,   271],
        [ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899],
        [ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]
[tensor([[ 1807,  3619,   402,   271, 10899,  2138,   257,  7026],
        [  402,   271, 10899,  2138,   257,  7026, 15632,   438]]), tensor([[ 3619,   402,   271, 10899,  2138,   257,  7026, 15632],
        [  271, 10899,  2138,   257,  7026, 15632,   438,  2016]])]


### 2.7 Creating Token Embeddings

In [52]:
input_ids = torch.tensor([2, 3, 5, 1])

In [53]:
tokenizer.n_vocab

50257

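- `torch.nn.Embedding(vocab_size, output_dim)`
    - `vocab_size`: number of rows in the embedding table, one per token ID (`tokenizer.n_vocab` for the GPT-2 tokenizer; the toy example below uses 6)
    - `output_dim`: length of each embedding vector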

In [54]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123) # initialize random seed
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [56]:
print(embedding_layer.weight) # learnable weight matrix (requires_grad=True); its values are updated when the layer is trained

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [57]:
embedding_layer(torch.tensor([3])) # index 3 in embedding.weight

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [58]:
embedding_layer(torch.tensor([2]))

tensor([[ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)

In [59]:
embedding_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

In [60]:
embedding_layer = torch.nn.Embedding(tokenizer.n_vocab, output_dim)

In [62]:
embedding_layer.weight

Parameter containing:
tensor([[-2.1338,  1.0524, -0.3885],
        [-0.9343, -0.4991, -1.0867],
        [ 0.9624,  0.2492,  0.6266],
        ...,
        [ 0.9609, -1.3697,  0.1381],
        [-1.2365,  1.9319,  0.4730],
        [ 0.7365,  0.1316,  0.2379]], requires_grad=True)

### 2.8 Encoding word positions

In [63]:
vocab_size = tokenizer.n_vocab # 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [64]:
max_length = 4

dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [65]:
print("Token IDs:\n", inputs)
print("\nInput shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Input shape:
 torch.Size([8, 4])


In [66]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape) # batch_size, length, output dim

torch.Size([8, 4, 256])


In [68]:
token_embeddings[0,0] # first sequence in the batch, first token (ID 40) -> its 256-dim embedding vector

tensor([ 9.3138e-01,  2.4970e+00, -7.5886e-01, -2.9304e-03,  4.6033e-01,
        -1.2624e+00,  2.2945e-02,  8.7158e-01, -1.2823e+00, -1.2091e-01,
        -3.3686e-01,  1.6070e+00, -5.9332e-01, -1.6217e+00, -9.8342e-01,
        -2.0415e-01, -1.2068e+00, -2.5405e+00,  1.9803e+00, -1.2905e+00,
         1.7485e+00,  8.1073e-01, -4.6945e-01, -1.3970e+00, -1.0314e+00,
        -1.2450e+00, -8.8608e-01,  7.4233e-02,  1.0778e+00, -4.8303e-01,
         1.7835e-01, -1.8202e-02,  8.5286e-01,  7.1491e-01, -1.0246e+00,
         1.0360e+00, -7.9023e-02,  6.9433e-01,  9.0482e-01, -5.9876e-01,
         2.1469e-01,  4.5175e-01, -1.8838e+00,  7.8860e-01, -1.5495e-01,
        -2.6969e-01,  8.0673e-02,  1.8647e+00,  8.3791e-01, -2.5901e-01,
        -6.1770e-01, -2.2325e-01,  5.0203e-01, -1.0905e+00,  1.9047e+00,
         1.7961e+00,  4.9387e-01, -8.7234e-01,  1.9014e+00, -4.4768e-01,
        -9.0395e-01,  4.8624e-01,  1.2517e+00,  1.0142e+00,  3.8537e-01,
         9.4483e-01,  1.0095e+00, -9.3878e-01,  1.3

In [69]:
# Positional embeddings: a second embedding table with one output_dim-sized
# vector per position in the context window (context_length rows). Like the
# token embeddings, these weights are learnable and are optimized during
# LLM training.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [70]:
torch.arange(max_length) # create sequential order

tensor([0, 1, 2, 3])

In [72]:
pos_embedding_layer.weight

Parameter containing:
tensor([[-0.7851,  1.5557,  1.1355,  ...,  0.5553, -0.6299,  0.2980],
        [ 0.8141,  0.9801, -0.9307,  ...,  2.2669, -1.0472, -0.0354],
        [-1.4536, -1.0188,  0.4802,  ...,  1.3324,  1.0717, -0.1117],
        [-0.4525,  0.1157,  1.9147,  ..., -0.4439, -0.8569, -0.6267]],
       requires_grad=True)

In [73]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [74]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [75]:
token_embeddings[0] + pos_embeddings

tensor([[ 0.1463,  4.0527,  0.3767,  ..., -0.2820, -1.3300,  0.1748],
        [ 0.7395,  2.2866, -1.2291,  ...,  1.2576,  0.1355, -0.6984],
        [-0.2627, -0.1810,  1.2876,  ...,  2.2616,  0.6080, -0.3761],
        [ 1.0246,  1.1285,  2.2767,  ...,  1.1207, -1.3258, -2.3314]],
       grad_fn=<AddBackward0>)

In [76]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
