In [2]:
# Load the entire corpus into memory as one string. The encoding is given
# explicitly so the result does not depend on the platform default.
with open("../data/the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
# Sanity-check the load: report the corpus length in characters, then show
# the first 99 characters as a preview.
print("Total number of characters: ", len(raw_text))
print(raw_text[0:99])

Total number of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [5]:
import re
# Tokenise a 99-character preview of the corpus. The capturing group makes
# re.split keep the delimiters (punctuation, "--", whitespace) as list items;
# items that are empty or whitespace-only are then discarded.
sample_text = raw_text[:99]
result = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', sample_text)
    if piece.strip()
]
print(result)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no']


In [56]:
from collections import defaultdict

def _split_text(text):
    """Split ``text`` into word and punctuation tokens.

    The text is split on single punctuation characters, on ``--`` and on
    whitespace. Punctuation and ``--`` are kept as tokens of their own;
    pieces that are empty or whitespace-only are dropped.

    Args:
        text: The string to tokenise.

    Returns:
        A list of non-empty token strings in their original order.
    """
    pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]


END_TOKEN = "<|endoftext|>"
UNK_TOKEN = "<|unk|>"


class Tokenizer:
    """Word-level tokenizer with a vocabulary built from a base text.

    ID 0 is reserved for ``UNK_TOKEN`` and ID 1 for ``END_TOKEN``; all other
    tokens found in the base text get IDs from 2 upwards in sorted order.

    Attributes:
        ordered_tokens: List mapping token ID -> token string.
        tokens_dict: Mapping token string -> token ID. It is a defaultdict
            whose default is the UNK ID, so indexing it with an unknown key
            inserts that key; ``encode`` avoids this by using ``.get``.
    """

    def __init__(self, base_text):
        """Build the vocabulary from ``base_text``.

        Args:
            base_text: Text whose distinct tokens make up the vocabulary.
        """
        # Drop the special tokens if they occur literally in the text, so that
        # each appears exactly once in the vocabulary, at its reserved index.
        vocabulary = set(_split_text(base_text)) - {UNK_TOKEN, END_TOKEN}
        self.ordered_tokens = [UNK_TOKEN, END_TOKEN, *sorted(vocabulary)]
        self.tokens_dict = defaultdict(
            lambda: 0,  # 0 is the index of the UNK token
            {token: index for index, token in enumerate(self.ordered_tokens)}
        )

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` to token IDs, followed by the end-token ID.

        Tokens that are not in the vocabulary are encoded as the UNK ID.
        The vocabulary itself is not modified.

        Args:
            text: The string to encode.

        Returns:
            The list of token IDs, always ending with the ID of ``END_TOKEN``.
        """
        unk_id = self.tokens_dict[UNK_TOKEN]
        end_id = self.tokens_dict[END_TOKEN]
        # Use .get rather than indexing: indexing a defaultdict with a missing
        # key would store that key and grow the vocabulary map on every call.
        lookup = self.tokens_dict.get
        return [lookup(token, unk_id) for token in _split_text(text)] + [end_id]

    def decode(self, tokens: list[int]) -> str:
        """Convert token IDs back into a string.

        Tokens are joined with single spaces, then whitespace before the
        characters ``, . : ; ? ! " ( ) '`` and whitespace on both sides of
        ``--`` is removed. Special tokens are rendered as their literal text.
        This is not an exact inverse of ``encode`` (original spacing is not
        recorded).

        Args:
            tokens: Token IDs as produced by ``encode``.

        Returns:
            The reconstructed text.

        Raises:
            IndexError: If an ID is outside the vocabulary range.
        """
        text = " ".join(self.ordered_tokens[token] for token in tokens)
        # ':' and ';' are included because _split_text separates them too.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        text = re.sub(r'\s+(--)\s+', r'\1', text)
        return text

In [59]:
# Round-trip demo: build the vocabulary from the full corpus, encode the
# sample, decode it again, and print the original and the reconstruction on
# consecutive lines for visual comparison.
tokenizer = Tokenizer(raw_text)
tokens = tokenizer.encode(sample_text)
reconstructed_sample_text = tokenizer.decode(tokens)
print(sample_text, reconstructed_sample_text, sep="\n")

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no <|endoftext|>
