# GPT Tokenizer


In [14]:
import re

class DataLoader:
    """Read a text file from disk and tidy up its whitespace."""

    def __init__(self, filepath):
        # Path that load_text() opens.
        self.filepath = filepath

    def load_text(self):
        """Return the whole file decoded as UTF-8.

        Failures are reported with print() and an empty string is returned
        instead of an exception being raised.
        """
        try:
            with open(self.filepath, "r", encoding="utf-8") as handle:
                contents = handle.read()
        except FileNotFoundError:
            print(f"Error: File '{self.filepath}' not found.")
            return ""
        except Exception as err:
            print(f"Error: {err}")
            return ""
        return contents

    def clean_text(self, text):
        """Collapse runs of spaces and runs of newlines, then strip the ends."""
        # Each entry is (pattern, replacement); they are applied in order.
        substitutions = (
            (r" {2,}", " "),    # two or more spaces -> one space
            (r"\n{2,}", "\n"),  # two or more newlines -> one newline
        )
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text.strip()

In [15]:
# Location of the text file used by the cells below.
file_path = "data/tekst.txt"

# Read the raw file, then collapse repeated spaces and newlines.
data_loader = DataLoader(filepath=file_path)
text = data_loader.load_text()
cleaned_text = data_loader.clean_text(text=text)

In [24]:
# Iterating a bytes object yields ints, so list() gives the UTF-8 byte
# values of the cleaned text directly.
tokens = list(cleaned_text.encode("utf-8"))
print(tokens[:20])

[68, 101, 32, 98, 101, 103, 105, 110, 110, 101, 110, 100, 32, 99, 97, 98, 97, 114, 101, 116]


### Exercise 1

**Find the pair of bytes that occurs most frequently**

In [29]:
from collections import defaultdict

def find_most_occuring_byte_pair(
    encoded_text: list[int],
) -> dict[tuple[int, int], int]:
    """Count how often each pair of adjacent values occurs in *encoded_text*.

    Despite its name, this function returns the counts for every pair rather
    than a single pair. The most frequent pair can be obtained from the
    result with ``max(counts, key=counts.get)``.

    Args:
        encoded_text: Sequence of integer token/byte values.

    Returns:
        A ``defaultdict(int)`` mapping ``(current, next)`` to the number of
        times that pair appears consecutively. It is empty when the input
        has fewer than two elements.
    """
    byte_pair_count = defaultdict(int)

    # zip() stops at the shorter input, so pairing the sequence with itself
    # shifted by one visits every adjacent pair, including the last one.
    for byte_pair in zip(encoded_text, encoded_text[1:]):
        byte_pair_count[byte_pair] += 1

    return byte_pair_count

# Tally adjacent byte pairs in the token list; the bare name on the last
# line makes the notebook display the result.
byte_pair_count = find_most_occuring_byte_pair(tokens)
byte_pair_count


defaultdict(int,
            {(68, 101): 3,
             (101, 32): 71,
             (32, 98): 27,
             (98, 101): 28,
             (101, 103): 18,
             (103, 105): 5,
             (105, 110): 45,
             (110, 110): 5,
             (110, 101): 10,
             (101, 110): 136,
             (110, 100): 31,
             (100, 32): 25,
             (32, 99): 7,
             (99, 97): 4,
             (97, 98): 5,
             (98, 97): 5,
             (97, 114): 30,
             (114, 101): 23,
             (101, 116): 45,
             (116, 105): 11,
             (105, 101): 46,
             (101, 114): 98,
             (114, 10): 1,
             (10, 71): 1,
             (71, 111): 1,
             (111, 101): 15,
             (101, 100): 13,
             (100, 101): 65,
             (110, 97): 7,
             (97, 118): 1,
             (118, 111): 8,
             (111, 110): 24,
             (100, 46): 5,
             (46, 32): 28,
             (32, 77): 1,
        