### **Python Implementation of BPE**
Here’s a simplified implementation of Byte Pair Encoding in Python:

In [1]:
import re
from collections import Counter

### **Counting Pairs**
Adjacent symbols in each vocabulary word are taken two at a time, and the occurrences of each pair are counted, weighted by the word's frequency.

In [62]:
def get_stats(vocab):
    """Count adjacent symbol pairs across the vocabulary.

    Args:
        vocab: Mapping from a word, written as whitespace-separated symbols,
            to that word's frequency.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` symbol tuples. Every
        occurrence of a pair inside a word adds that word's frequency to the
        pair's count.
    """
    pair_counts = Counter()
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pair every symbol with its right-hand neighbour.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

The most frequent pair is merged into a single symbol everywhere it occurs in the vocabulary, and the process is repeated.

In [63]:
def merge_vocab(pair, vocab):
    """Merge every whole-symbol occurrence of ``pair`` into a single symbol.

    Args:
        pair: Two symbols ``(left, right)`` to be fused.
        vocab: Mapping from a word, written as space-separated symbols, to
            that word's frequency.

    Returns:
        A new dict in which each key has ``"left right"`` replaced by
        ``"leftright"`` wherever both are complete symbols. Frequencies are
        carried over; if two words end up with the same key their frequencies
        are summed. ``vocab`` itself is not modified.
    """
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # Anchor the match on whitespace / string boundaries so that only complete
    # symbols are merged. A plain str.replace would also match the tail or head
    # of a longer symbol (e.g. pair ('e', 's') inside 'ae s' or 'e st').
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')
    new_vocab = {}
    for word, freq in vocab.items():
        # A callable replacement keeps any backslashes in the symbols literal.
        new_word = pattern.sub(lambda _match: replacement, word)
        # Accumulate rather than overwrite in case two words collapse together.
        new_vocab[new_word] = new_vocab.get(new_word, 0) + freq
    return new_vocab

In [65]:
# Toy corpus: each key is a word written as space-separated symbols ending in
# the '</w>' end-of-word marker; each value is how often the word occurs.
vocab = {
    'l o w </w>' : 5,
    'l o w e r </w>' : 2,
    'n e w e s t </w>': 6,
    'w i d e s t </w>': 3,
}
num_merges = 10

for i in range(num_merges):
    print(f"{vocab=}")
    pairs = get_stats(vocab)
    if not pairs:
        # Every word is already a single symbol, so nothing is left to merge;
        # indexing top_pairs[0] below would otherwise raise IndexError.
        break
    # Same ordering as sorting the items by count (descending) and slicing.
    top_pairs = pairs.most_common(5)
    print(f"{top_pairs=}")
    # Ties are broken by first-seen order, so the first entry is the winner.
    best = top_pairs[0][0]
    vocab = merge_vocab(best, vocab)
    print(f"best={best}: {pairs[best]}")
    

vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
top_pairs=[(('e', 's'), 9), (('s', 't'), 9), (('t', '</w>'), 9), (('w', 'e'), 8), (('l', 'o'), 7)]
best=('e', 's'): 9
vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}
top_pairs=[(('es', 't'), 9), (('t', '</w>'), 9), (('l', 'o'), 7), (('o', 'w'), 7), (('n', 'e'), 6)]
best=('es', 't'): 9
vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}
top_pairs=[(('est', '</w>'), 9), (('l', 'o'), 7), (('o', 'w'), 7), (('n', 'e'), 6), (('e', 'w'), 6)]
best=('est', '</w>'): 9
vocab={'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
top_pairs=[(('l', 'o'), 7), (('o', 'w'), 7), (('n', 'e'), 6), (('e', 'w'), 6), (('w', 'est</w>'), 6)]
best=('l', 'o'): 7
vocab={'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}
top_pairs=[(('lo', 'w'), 7), (('n', 'e'), 6), (('e', 'w'), 6), (('w', 'est</w>'),

In [None]:
# Alternative toy corpus: space-separated symbols mapped to word frequency.
# NOTE(review): unlike the first example, these words carry no '</w>'
# end-of-word marker -- confirm that is intended.
vocab = {
    'l o w': 5,
    'l o w e r': 2,
    'n e w': 6,
    'w i d e r': 3,
}

In [56]:
# Scratch check: joining a symbol pair with a space yields the bigram string
# that merge_vocab looks for inside each word.
' '.join(('l', 'o'))

'l o'