In [82]:
# Build Vocabulary

import re

words = """hello, world. how are you? This is a test. Cat, rat, dog, bat, elephant, can,
fish, row, column, food, eat, speak, drink, run, jump, skip, ---, climb, over,
under, above, sunny, rainy, <|endoftext|>, <|unk|>"""

# Split on punctuation, double dashes and whitespace. The capture group in the
# pattern keeps the delimiters themselves in the result list.
raw_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', words)

# Keep every piece in two spellings, all-lowercase and first-letter-capitalized,
# so that both forms receive their own vocabulary entry.
lowered = [piece.lower() for piece in raw_pieces]
both_cases = lowered + [piece.capitalize() for piece in lowered]

# Drop empty / whitespace-only pieces, de-duplicate, and sort so ids are stable.
preprocessed = sorted({piece.strip() for piece in both_cases if piece.strip()})

# A token's id is its position in the sorted token list.
vocab = dict(zip(preprocessed, range(len(preprocessed))))

# Preview the first 51 (token, id) pairs.
for entry in list(vocab.items())[:51]:
    print(entry)

(',', 0)
('-', 1)
('--', 2)
('.', 3)
('<|endoftext|>', 4)
('<|unk|>', 5)
('?', 6)
('A', 7)
('Above', 8)
('Are', 9)
('Bat', 10)
('Can', 11)
('Cat', 12)
('Climb', 13)
('Column', 14)
('Dog', 15)
('Drink', 16)
('Eat', 17)
('Elephant', 18)
('Fish', 19)
('Food', 20)
('Hello', 21)
('How', 22)
('Is', 23)
('Jump', 24)
('Over', 25)
('Rainy', 26)
('Rat', 27)
('Row', 28)
('Run', 29)
('Skip', 30)
('Speak', 31)
('Sunny', 32)
('Test', 33)
('This', 34)
('Under', 35)
('World', 36)
('You', 37)
('a', 38)
('above', 39)
('are', 40)
('bat', 41)
('can', 42)
('cat', 43)
('climb', 44)
('column', 45)
('dog', 46)
('drink', 47)
('eat', 48)
('elephant', 49)
('fish', 50)


In [83]:
# Build Tokenizer (copied from book)

class Tokenizer:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on the punctuation characters
    ``, . : ; ? _ ! " ( ) '`` and on double dashes. Every non-empty piece is
    looked up in the vocabulary; pieces that are missing are encoded as the
    ``<|unk|>`` token. Lookups are case-sensitive.

    Args:
        vocab: Mapping from token string to integer id. It is stored by
            reference (not copied) and inverted once to support decoding.
    """

    UNK_TOKEN = "<|unk|>"

    # Compiled once instead of on every call. The capture group in the split
    # pattern keeps the delimiters so punctuation becomes tokens of its own.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text, allowed_special=False):
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.
            allowed_special: Accepted so callers can pass the same keyword
                they would pass to other tokenizers; this implementation
                ignores it.

        Returns:
            A list of integer ids, one per token. Empty for empty or
            whitespace-only text.

        Raises:
            KeyError: If a token is not in the vocabulary and the vocabulary
                has no ``<|unk|>`` entry to fall back on.
        """
        pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        tokens = [piece for piece in pieces if piece]

        ids = []
        for token in tokens:
            if token in self.str_to_int:
                ids.append(self.str_to_int[token])
            elif self.UNK_TOKEN in self.str_to_int:
                ids.append(self.str_to_int[self.UNK_TOKEN])
            else:
                # Still a KeyError, but one that names the offending token
                # instead of the confusing bare KeyError('<|unk|>').
                raise KeyError(
                    f"token {token!r} is not in the vocabulary and the "
                    f"vocabulary has no {self.UNK_TOKEN!r} entry to fall back on"
                )
        return ids

    def decode(self, ids, allowed_special=False):
        """Convert token ids back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        the punctuation characters ``, . : ; ? ! " ( ) '`` is removed.

        Args:
            ids: Iterable of integer token ids.
            allowed_special: Accepted for call-compatibility; ignored.

        Returns:
            The decoded text.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)


In [84]:
# Round-trip check: the same sentence with a lowercase and a capitalized first word

tokenizer = Tokenizer(vocab)
text = "hello, how are you?"
text2 = "Hello, how are you?"

# Text -> ids
encoded_text, encoded_text2 = (tokenizer.encode(sample) for sample in (text, text2))
print(encoded_text, "\n", encoded_text2)

# ids -> text
decoded_text, decoded_text2 = (
    tokenizer.decode(id_list) for id_list in (encoded_text, encoded_text2)
)
print(decoded_text, "\n", decoded_text2)



[52, 0, 53, 40, 68, 6] 
 [21, 0, 53, 40, 68, 6]
hello, how are you? 
 Hello, how are you?


In [85]:
!pip install tiktoken



In [86]:
# Tests comparing tokenizer output with tiktoken

import tiktoken

# GPT-2 byte-pair encoding, used as the reference tokenizer
tokenizer_tiktoken = tiktoken.get_encoding("gpt2")

# Same sentence with a capitalized and a lowercase first word
text, text2 = "Hello, how are you?", "hello, how are you?"

integers, integers2 = (tokenizer_tiktoken.encode(sample) for sample in (text, text2))

print(integers, "\n", integers2)


[15496, 11, 703, 389, 345, 30] 
 [31373, 11, 703, 389, 345, 30]


In [87]:
# Compare how the custom tokenizer and tiktoken treat a string that is not in the vocabulary

unknown_string = "ikorrdd"
both_tokenizers = (tokenizer, tokenizer_tiktoken)

# Encoding
for tok in both_tokenizers:
    print(tok.encode(unknown_string))

# Decoding (round trip through encode)
for tok in both_tokenizers:
    print(tok.decode(tok.encode(unknown_string)))

[5]
[1134, 38890, 1860]
<|unk|>
ikorrdd


In [94]:
# Implement Dataset for DataLoader - copied from book

import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer.encode`` (called with
    ``allowed_special={'<|endoftext|>'}``). A window of ``max_length`` ids is
    then moved over the result in steps of ``stride``; each sample pairs a
    window with the same window shifted one position to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.
        max_length: Number of token ids per sample.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})

        # Stopping at len - max_length guarantees every target window, which
        # reaches one id further than its input window, is full length.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [95]:
# Build the dataset over the vocabulary text and batch it with a DataLoader
dataset = GPTDatasetV1(txt=words, tokenizer=tokenizer, max_length=4, stride=1)
dataloader = DataLoader(dataset=dataset, batch_size=2)

# Pull the first batch: a pair of (inputs, targets) tensors with 2 rows each
first_batch = iter(dataloader)
print(next(first_batch))

[tensor([[52,  0, 67,  3],
        [ 0, 67,  3, 53]]), tensor([[ 0, 67,  3, 53],
        [67,  3, 53, 40]])]
