In [122]:
import re

In [123]:
with open("Harry Potter and the Sorcerers Stone.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [124]:
print("Total number of characters:", len(raw_text))
print(raw_text[:100])

Total number of characters: 263976
M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly norm


In [125]:
class Tokenizer:

    TOKEN_PATTERN = r'([,.:;?\-!"()\']|\s)'
    END_OF_TEXT = "<|endoftext|>"
    UNKNOWN_TOKEN = "<|unk|>"

    def __init__(self, raw_text):
        self.raw_text = raw_text
        self.tokens = self.get_tokens(self.raw_text)

    def get_tokens(self, text):
        tokens = re.split(self.TOKEN_PATTERN, text)
        tokens =[t.strip() for t in tokens if t.strip()]
        self.tokens = tokens + [self.END_OF_TEXT, self.UNKNOWN_TOKEN]
        self.idx_to_token = {i: t for i, t in enumerate(self.tokens)}
        self.token_to_idx = {t: i for i, t in enumerate(self.tokens)}

    def encode(self, text):
        tokens = re.split(self.TOKEN_PATTERN, text)
        tokens = [t.strip() for t in tokens if t.strip()]
        return [self.token_to_idx.get(t, self.UNKNOWN_TOKEN) for t in tokens] + [self.token_to_idx[self.END_OF_TEXT]]

    def decode(self, indices):
        tokens = [self.idx_to_token[i] for i in indices]
        text = " ".join(tokens)
        text = re.sub(r'\s([,.:;?\-!"()\'])', r"\1", text)
        return text

In [126]:
tokenizer = Tokenizer(raw_text)

text = "Harry Potter is a wizard."
encoded = tokenizer.encode(text)

print("Encoded text:", encoded)

decoded = tokenizer.decode(encoded)
print("Decoded text:", decoded)

Encoded text: [54275, 50599, 51951, 54345, 40688, 54362, 54363]
Decoded text: Harry Potter is a wizard. <|endoftext|>


In [129]:
text = "Harry sokokoko Potter is a wizard."
encoded = tokenizer.encode(text)
print("Encoded text:", encoded)

Encoded text: [54275, '<|unk|>', 50599, 51951, 54345, 40688, 54362, 54363]
