# Basic tokenizer

In [1]:
class WhitespaceTokenizer:
    """Tokenizer that cuts text apart wherever whitespace occurs."""

    def __init__(self):
        """Create the tokenizer; it has no configuration or state."""

    def tokenize(self, text):
        """
        Return the whitespace-separated pieces of `text` as a list.

        Runs of whitespace count as a single separator, and leading or
        trailing whitespace produces no empty pieces.
        """
        pieces = text.split()
        return pieces

In [2]:
# Demo: split a sample sentence on whitespace and print the resulting tokens.
text = "this is a simple tokenizer example"
tokenizer = WhitespaceTokenizer()
tokens = tokenizer.tokenize(text)
print(tokens)

['this', 'is', 'a', 'simple', 'tokenizer', 'example']


# Byte Pair Encoding (BPE) Tokenizer

Byte Pair Encoding is a subword tokenization algorithm. 
It iteratively merges the most frequent pairs of characters or subwords to create new subwords.

**Tokenizers Used by GPT-4**

GPT-4, like its predecessors, typically uses a tokenizer based on the Byte Pair Encoding (BPE) method. Here’s how it generally works:

- Initialization: The tokenizer is initialized with a vocabulary that includes common subwords and characters.

- Tokenization Process:

    - Input Text: The input text is first split into basic units (characters or initial subwords).

    - Merging: Adjacent pairs of characters or subwords are iteratively merged. Which pairs to merge, and in what order, is determined by how frequent they were in the training corpus.

    - Final Tokens: The merging process continues until no learned merge applies any more; the text is then a sequence of subwords (tokens) from the vocabulary.

In [9]:
import re
from collections import defaultdict, Counter

class BPE:
    """
    Minimal Byte Pair Encoding (BPE) subword tokenizer.

    Training (`fit`) repeatedly finds the most frequent adjacent symbol pair
    in the corpus and merges it into one symbol, recording each merge in
    `self.vocab`. Tokenization (`tokenize`) replays those merges, in the
    order they were learned, on the characters of a new word.
    """

    def __init__(self, vocab_size):
        """
        Creates an untrained tokenizer.

        vocab_size: maximum number of merge rules that `fit` will learn.
        """
        self.vocab_size = vocab_size
        # Maps each learned pair (left, right) to the frequency it had when
        # it was selected. Insertion order is the order the merges were
        # learned, which `tokenize` relies on.
        self.vocab = {}

    @staticmethod
    def _merge_pair(symbols, pair):
        """
        Returns `symbols` as a list with every non-overlapping, left-to-right
        occurrence of the adjacent `pair` replaced by the joined symbol.
        """
        left, right = pair
        joined = left + right
        merged = []
        last = len(symbols) - 1
        i = 0
        while i <= last:
            if i < last and symbols[i] == left and symbols[i + 1] == right:
                merged.append(joined)
                i += 2  # skip both halves so occurrences never overlap
            else:
                merged.append(symbols[i])
                i += 1

        return merged

    def get_vocab(self, text):
        """
        Converts the input text into tokens and counts their frequencies.

        Each whitespace-separated word becomes a tuple of its characters
        followed by the end-of-word marker '</w>'. Returns a Counter mapping
        those tuples to the number of times the word occurs.
        """
        return Counter(tuple(word) + ('</w>',) for word in text.split())

    def get_stats(self, vocab):
        """
        Calculates the frequency of each adjacent symbol pair in the
        vocabulary, weighting every pair by the frequency of its word.
        """
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            for left, right in zip(word, word[1:]):
                pairs[(left, right)] += freq

        return pairs

    def merge_vocab(self, pair, vocab):
        """
        Merges `pair` into a single symbol in every word of `vocab`.

        Returns a new dict with the same frequencies, keyed by the updated
        symbol tuples; `vocab` itself is not modified.
        """
        new_vocab = {}
        for word in vocab:
            new_vocab[tuple(self._merge_pair(word, pair))] = vocab[word]

        return new_vocab

    def fit(self, text):
        """
        Learns merge rules from `text`, stopping after `vocab_size` merges
        or when no adjacent pair is left to merge.

        Any previously learned rules are discarded first, so calling `fit`
        again retrains from scratch instead of keeping stale merges.
        """
        self.vocab = {}
        vocab = self.get_vocab(text)
        while len(self.vocab) < self.vocab_size:
            pairs = self.get_stats(vocab)
            if not pairs:
                break
            # Ties go to the pair encountered first while counting.
            best = max(pairs, key=pairs.get)
            vocab = self.merge_vocab(best, vocab)
            self.vocab[best] = pairs[best]

    def tokenize(self, word):
        """
        Tokenizes a new word based on the learned subwords.

        The word is split into characters and the learned merges are applied
        one at a time in the order they were learned, so the result matches
        what training would have produced for the same characters. No
        end-of-word marker is appended here, so merges that involve '</w>'
        never apply. Returns the list of resulting subword strings.
        """
        tokens = list(word)
        for pair in self.vocab:
            if len(tokens) < 2:
                break  # nothing left that could be merged
            tokens = self._merge_pair(tokens, pair)

        return tokens

In [14]:
# Train a BPE tokenizer on a single sentence; vocab_size=10 caps how many
# merge rules fit() records.
text = "this is a simple BPE tokenizer example"
bpe_tokenizer = BPE(vocab_size=10)
bpe_tokenizer.fit(text)


In [13]:
# Tokenize two words from the training sentence with the fitted tokenizer.
tokens = bpe_tokenizer.tokenize("example")
print(tokens)
tokens = bpe_tokenizer.tokenize("simple")
print(tokens)

['e', 'x', 'a', 'mple']
['si', 'mple']


In [15]:
# Tokenize two more words from the training sentence with the fitted tokenizer.
tokens = bpe_tokenizer.tokenize("this")
print(tokens)
tokens = bpe_tokenizer.tokenize("is")
print(tokens)

['th', 'is']
['is']
