# Exercise

Build your own GPT-4 Tokenizer!

### Step 1

Write the `BasicTokenizer` class, with the following three core functions:

- `def train(self, text, vocab_size, verbose=False)`
- `def encode(self, text)`
- `def decode(self, ids)`

Train your tokenizer on whatever text you like and visualize the merged tokens. Do they look reasonable? One default test you may wish to use is the text file `tests/taylorswift.txt`.

In [42]:
from collections import Counter
from itertools import tee

class BasicTokenizer():
    """Word-level scaffold for a byte-pair-encoding (BPE) tokenizer.

    Only the first BPE steps are implemented so far: building a
    word-frequency vocabulary and finding the most frequent pair of adjacent
    words. The merge step (``merge_vocab``) is still a placeholder.
    """

    def __init__(self, text, vocab_size=None, ids=None) -> None:
        """Store the corpus and the optional settings.

        Args:
            text: A single string, or an iterable of lines such as the result
                of ``file.readlines()``.
            vocab_size: Stored as-is; no method reads it yet.
            ids: Optional encoded tokens; ``print`` falls back to them when
                it is called without ``ids``.
        """
        super().__init__()
        self.text = self._join_lines(text)
        self.vocab_size = vocab_size
        self.ids = ids

    @staticmethod
    def _join_lines(text):
        """Return ``text`` as one string.

        A string is returned unchanged. Any other iterable is treated as a
        sequence of lines: newline characters are stripped from both ends of
        each line and the lines are joined with single spaces.
        """
        # Iterating a str yields characters, which would space out every
        # character instead of joining lines, so strings pass through as-is.
        if isinstance(text, str):
            return text
        return " ".join(line.strip("\n") for line in text)

    def train(self, text, vocab_size=None, verbose=False):
        """Run the BPE steps implemented so far on ``text`` and print them.

        Prints the word-frequency vocabulary and then the most frequent pair
        of adjacent words (``None`` when the text has fewer than two words).

        Args:
            text: A single string or an iterable of lines.
            vocab_size: Accepted for API compatibility; currently unused.
            verbose: Accepted for API compatibility; currently unused (the
                results are always printed).
        """
        # BPE - Byte Pair Encoding
        # A subword tokenization algorithm

        # Step 1: Construct the base vocabulary
        content = self._join_lines(text)
        vocab = self.get_vocab(text=content)
        print(vocab)

        # Step 2: Find most frequent pair
        frequencies_of_pairs = self.get_freq_of_pairs(text=content)
        max_freq_pair = self.get_best_pair(frequencies_of_pairs)
        print(max_freq_pair)

        # Step 3: Merge pair

        # Step 4: Update vocabulary

        # Step 5: Iterate (steps 2-4) until a specified number of iterations are reached

    def encode(self, text):
        """Split ``text`` on single spaces and UTF-8 encode every token.

        Args:
            text: A single string or an iterable of lines.

        Returns:
            A list of ``bytes`` objects, one per space-separated token.
            Consecutive spaces produce empty tokens (``b""``).
        """
        # Treat a plain string as one line rather than iterating its characters.
        lines = [text] if isinstance(text, str) else text
        return [
            token.encode('UTF-8')
            for line in lines
            for token in line.split(" ")
        ]

    def decode(self, ids):
        """Rebuild text from the ``bytes`` tokens produced by ``encode``.

        Tokens are decoded as UTF-8 and joined with single spaces, which is
        the exact inverse of ``encode`` for a single input string.
        """
        return " ".join(token.decode('UTF-8') for token in ids)

    def print(self, style=None, ids=None):
        """Print every byte value of every token when ``style == "tokens"``.

        Args:
            style: ``"tokens"`` to dump the tokens; any other value prints an
                empty line.
            ids: Iterable of byte arrays; defaults to ``self.ids``.
        """
        if style == "tokens":
            if ids is None:
                # Fall back to the tokens given at construction time, if any.
                stored = getattr(self, "ids", None)
                ids = stored if stored is not None else []
            for byte_arr in ids:
                print("-----------------------------------------------------------")
                print("The byte array {} has the following byte: ".format(byte_arr))
                for byte in byte_arr:
                    print("Tokens IDs: ", byte)
        else:
            print("")

    def get_vocab(self, text):
        """Return a dict mapping each whitespace-separated word to its count."""
        return dict(Counter(text.split()))

    def get_set_chars(self, text):
        """Return the set of elements found in the byte arrays of ``text``.

        Iterating a ``bytes`` object yields integers, so for a list of
        ``bytes`` tokens the result is a set of byte values.
        """
        return {byte for byte_arr in text for byte in byte_arr}

    def pairwise(self, iterable):
        """Yield overlapping pairs: s -> (s0, s1), (s1, s2), (s2, s3), ..."""
        a, b = tee(iterable)
        # Advance the second iterator by one so the zip pairs neighbours.
        next(b, None)
        return zip(a, b)

    def get_freq_of_pairs(self, text):
        """Return a dict mapping each pair of adjacent words to its count."""
        return dict(Counter(self.pairwise(text.split())))

    def get_best_pair(self, pairs):
        """Return the key with the highest count, or ``None`` if ``pairs`` is empty."""
        if not pairs:
            return None
        return max(pairs, key=pairs.get)

    def merge_vocab(self):
        """Placeholder for the BPE merge step; not implemented yet."""
        pass
        





In [43]:
import os

file_name = "taylorswift.txt"
cur_dir = os.getcwd()
abs_path = os.path.join(cur_dir, file_name)
print("The absolute path of the given file is: ", abs_path)

try:
    # A single readlines() call consumes the whole file, so no read loop is
    # needed. The context manager closes the file even if a later step raises.
    with open(file=abs_path, mode="r", encoding="UTF-8") as file:
        content = file.readlines()

    # Nothing to tokenize when the file is empty
    if content:
        # Object of the class BasicTokenizer
        tokenizer = BasicTokenizer(text=content)

        # Join every line of the text into a general string
        content = " ".join(line.strip("\n") for line in content)
        vocab = tokenizer.get_vocab(text=content)
        print(vocab)

        # Encode the joined text into UTF-8 byte tokens
        ids = tokenizer.encode(text=content)
        #tokenizer.print(style="tokens", ids=ids)

        # vocab_size keeps its default: the word-frequency dict is not a size.
        tokenizer.train(text=content)

        tmp = tokenizer.get_freq_of_pairs(content)
        print(tmp)

        print(tokenizer.get_best_pair(tmp))
except FileNotFoundError:
    print("File not found!")






The absolute path of the given file is:  c:\Users\c.manara\Documents\VS code projects\Tokenization\GPT-4 Tokenizer\taylorswift.txt
{'Copy': 1, 'paste': 1, 'of': 432, 'the': 1131, 'Wikipedia': 1, 'article': 3, 'on': 607, 'Taylor': 262, 'Swift,': 36, 'as': 111, 'Feb': 1, '16,': 40, '2024.': 35, '---': 1, 'Main': 5, 'menu': 1, 'WikipediaThe': 1, 'Free': 3, 'Encyclopedia': 1, 'Search': 1, 'Create': 1, 'account': 1, 'Log': 1, 'in': 268, 'Personal': 4, 'tools': 1, 'Contents': 1, 'hide': 1, '(Top)': 1, 'Life': 5, 'and': 480, 'career': 12, 'Toggle': 5, 'subsection': 5, 'Artistry': 3, 'Accolades': 2, 'achievements': 2, 'Cultural': 4, 'status': 5, 'Wealth': 3, 'Discography': 2, 'Filmography': 2, 'Tours': 2, 'See': 5, 'also': 21, 'Footnotes': 2, 'References': 3, 'External': 2, 'links': 2, 'Swift': 571, '136': 1, 'languages': 1, 'Article': 1, 'Talk': 2, 'Read': 1, 'View': 2, 'source': 2, 'history': 7, 'Tools': 1, 'Featured': 1, 'Page': 1, 'semi-protected': 1, 'From': 10, 'Wikipedia,': 1, 'free': 4

### Step 2

Convert your `BasicTokenizer` into a `RegexTokenizer`, which takes a regex pattern and splits the text exactly as GPT-4 would. Process the parts separately as before, then concatenate the results. Retrain your tokenizer and compare the results before and after. You should see that you will now have no tokens that go across categories (numbers, letters, punctuation, more than one whitespace). Use the GPT-4 pattern:

```
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
```



### Step 3

You're now ready to load the merges from the GPT-4 tokenizer and show that your tokenizer produces the identical results for both `encode` and `decode`, matching [tiktoken](https://github.com/openai/tiktoken).

```
# match this
import tiktoken
enc = tiktoken.get_encoding("cl100k_base") # this is the GPT-4 tokenizer
ids = enc.encode("hello world!!!? (안녕하세요!) lol123 😉")
text = enc.decode(ids) # get the same text back
```

Unfortunately, you will run into two issues:

1. It is not trivial to recover the raw merges from the GPT-4 tokenizer. You can easily recover what we call `vocab` here, and what they call and store under `enc._mergeable_ranks`. Feel free to copy paste the `recover_merges` function in `minbpe/gpt4.py`, which takes these ranks and returns the raw merges. If you wish to know how this function works, read [this](https://github.com/openai/tiktoken/issues/60) and [this](https://github.com/karpathy/minbpe/issues/11#issuecomment-1950805306). Basically, under some conditions it is enough to only store the parent nodes (and their rank) and get rid of the precise details of which children merged up to any parent.
2. Second, the GPT-4 tokenizer for some reason permutes its raw bytes. It stores this permutation in the first 256 elements of the mergeable ranks, so you can recover this byte shuffle relatively simply as `byte_shuffle = {i: enc._mergeable_ranks[bytes([i])] for i in range(256)}`. In both your encode and decode, you'll have to shuffle bytes around accordingly. If you're stuck, reference the `minbpe/gpt4.py` file for hints.


### Step 4

(Optional, irritating, not obviously useful) Add the ability to handle special tokens. You'll then be able to match the output of tiktoken even when special tokens are present, e.g.:

```
import tiktoken
enc = tiktoken.get_encoding("cl100k_base") # this is the GPT-4 tokenizer
ids = enc.encode("<|endoftext|>hello world", allowed_special="all")
```

Without `allowed_special` tiktoken will error.

### Step 5

If you've made it this far, you're now a pro at LLM Tokenization! Sadly, you're not exactly done yet because a lot of LLMs outside of OpenAI (e.g. Llama, Mistral) use [sentencepiece](https://github.com/google/sentencepiece) instead. Primary difference being that sentencepiece runs BPE directly on Unicode code points instead of on UTF-8 encoded bytes. Feel free to explore sentencepiece on your own (good luck, it's not too pretty), and stretch goal if you really experience and suffer from the burden of time, re-write your BPE to be on Unicode code points and match the Llama 2 tokenizer.

In [46]:
from collections import Counter, defaultdict

def get_vocab(text):
    """Count how often each whitespace-separated word occurs in ``text``.

    Returns a plain dict mapping word -> number of occurrences.
    """
    word_counts = Counter()
    for word in text.split():
        word_counts[word] += 1
    return dict(word_counts)

def get_stats(vocab):
    """Tally adjacent symbol pairs over a space-separated vocabulary.

    Each key of ``vocab`` is a word written as space-separated symbols and
    each value is that word's frequency. Every pair of neighbouring symbols
    contributes the word's frequency to its total.

    Returns a ``defaultdict(int)`` mapping (left, right) -> weighted count.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        parts = entry.split()
        # Zipping the list with itself shifted by one walks the neighbours.
        for left, right in zip(parts, parts[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, vocab):
    """Merge every occurrence of ``pair`` in the vocabulary words.

    Args:
        pair: Two symbols ``(left, right)`` to fuse into one symbol.
        vocab: Dict mapping words, written as space-separated symbols, to
            their frequencies.

    Returns:
        A new dict with the same frequencies in which every adjacent
        ``left right`` occurrence is replaced by the single symbol
        ``leftright``. Frequencies are summed if two words become identical.
    """
    left, right = pair
    replacement = left + right
    new_vocab = {}
    for word, freq in vocab.items():
        # Compare whole symbols instead of raw substrings: a plain
        # str.replace would also match inside longer symbols, e.g. the pair
        # ('a', 'b') would wrongly turn 'xa b' into 'xab'.
        symbols = word.split()
        merged = []
        i = 0
        while i < len(symbols):
            if (
                i + 1 < len(symbols)
                and symbols[i] == left
                and symbols[i + 1] == right
            ):
                merged.append(replacement)
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        new_word = ' '.join(merged)
        new_vocab[new_word] = new_vocab.get(new_word, 0) + freq
    return new_vocab

# Toy corpus used to exercise the BPE helpers
text = "low lower newest widest"

# Count the words, then spell each one as space-separated characters so that
# every character starts out as its own symbol.
vocab = {' '.join(word): freq for word, freq in get_vocab(text).items()}
print("Initial vocabulary:", vocab)

# Upper bound on the number of merge rounds
num_merges = 10

for i in range(num_merges):
    pairs = get_stats(vocab)
    # Stop early once no adjacent pair is left to merge
    if len(pairs) == 0:
        break
    # Pick the pair with the highest count and fuse it everywhere
    best_pair = max(pairs, key=pairs.get)
    vocab = merge_vocab(best_pair, vocab)
    print(f"After iteration {i+1}, Best pair: {best_pair}")
    print("Updated vocabulary:", vocab)

Initial vocabulary: {'C o p y': 1, 'p a s t e': 1, 'o f': 233, 't h e': 483, 'W i k i p e d i a': 1, 'a r t i c l e': 2, 'o n': 98, 'T a y l o r': 29, 'S w i f t ,': 4, 'a s': 84, 'F e b': 1, '1 6 ,': 1, '2 0 2 4 .': 1, '- - -': 1, 'M a i n': 2, 'm e n u': 1, 'W i k i p e d i a T h e': 1, 'F r e e': 1, 'E n c y c l o p e d i a': 1, 'S e a r c h': 1, 'C r e a t e': 1, 'a c c o u n t': 1, 'L o g': 1, 'i n': 173, 'P e r s o n a l': 1, 't o o l s': 1, 'C o n t e n t s': 1, 'h i d e': 1, '( T o p )': 1, 'L i f e': 3, 'a n d': 371, 'c a r e e r': 12, 'T o g g l e': 5, 's u b s e c t i o n': 5, 'A r t i s t r y': 3, 'A c c o l a d e s': 2, 'a c h i e v e m e n t s': 2, 'C u l t u r a l': 4, 's t a t u s': 4, 'W e a l t h': 2, 'D i s c o g r a p h y': 1, 'F i l m o g r a p h y': 1, 'T o u r s': 1, 'S e e': 2, 'a l s o': 16, 'F o o t n o t e s': 1, 'R e f e r e n c e s': 2, 'E x t e r n a l': 1, 'l i n k s': 1, 'S w i f t': 201, '1 3 6': 1, 'l a n g u a g e s': 1, 'A r t i c l e': 1, 'T a l k':