# GPT Tokenizer


In [7]:
import re

class DataLoader:
    """Read a UTF-8 text file from disk and normalise its whitespace."""

    def __init__(self, filepath):
        # Only stored here; the file is not opened until load_text() is called.
        self.filepath = filepath

    def load_text(self):
        """Return the entire contents of ``self.filepath`` decoded as UTF-8.

        Failures are reported with ``print`` instead of being raised: a
        missing file gets a dedicated message, any other exception is printed
        as-is. In both cases an empty string is returned.
        """
        try:
            with open(self.filepath, mode="r", encoding="utf-8") as handle:
                contents = handle.read()
        except FileNotFoundError:
            print(f"Error: File '{self.filepath}' not found.")
            return ""
        except Exception as err:
            print(f"Error: {err}")
            return ""
        else:
            return contents

    def clean_text(self, text):
        """Collapse repeated spaces and repeated newlines, then trim the ends.

        Runs of two or more spaces become one space, runs of two or more
        newlines become one newline, and leading/trailing whitespace is
        stripped from the result.
        """
        single_spaced = re.sub(r" {2,}", " ", text)
        single_newlined = re.sub(r"\n{2,}", "\n", single_spaced)
        return single_newlined.strip()

In [9]:
# Relative path to the sample corpus.
file_path = "data/text_test.txt"

# load_text() returns "" when the file cannot be read, so cleaned_text may be empty.
data_loader = DataLoader(filepath=file_path)
text = data_loader.load_text()
cleaned_text = data_loader.clean_text(text=text)

In [10]:
# Iterating a bytes object yields ints in 0..255, so list() gives the byte values directly.
tokens = list(cleaned_text.encode("utf-8"))
print(tokens[:20])

[239, 188, 181, 239, 189, 142, 239, 189, 137, 239, 189, 131, 239, 189, 143, 239, 189, 132, 239, 189]


### Exercise 1

**Find the pair of adjacent bytes that occurs most frequently**

In [14]:
from collections import defaultdict

def find_most_occuring_byte_pair(encoded_text: list[int]) -> list[tuple[int, int]]:
    """Return the adjacent pair(s) that occur most often in ``encoded_text``.

    Args:
        encoded_text: Sequence of integer values (e.g. UTF-8 byte values).

    Returns:
        A list of ``(left, right)`` pairs that share the highest occurrence
        count, ordered by first appearance. More than one entry means a tie.
        The list is empty when the input has fewer than two elements.
    """
    byte_pair_count = defaultdict(int)

    # Pair every element with its successor. zip() stops at the shorter
    # argument, so the last pair (second-to-last, last) is counted too.
    for byte_pair in zip(encoded_text, encoded_text[1:]):
        byte_pair_count[byte_pair] += 1

    # Compute the maximum once instead of once per candidate; default=0
    # keeps inputs without any pair from raising on an empty sequence.
    highest_count = max(byte_pair_count.values(), default=0)
    return [pair for pair, count in byte_pair_count.items() if count == highest_count]

# Despite its name, this holds the list of most frequent pairs, not their counts.
byte_pair_count = find_most_occuring_byte_pair(tokens)
byte_pair_count


[(101, 32)]