# Byte-pair encoding

In [None]:
# initialize the vocabulary with all unique words in the corpus and their counts. Also calculate pair counts, and update the vocabulary iteratively

from collections import defaultdict

def initialize_vocabulary(corpus):
    """Build the starting vocabulary from an iterable of words.

    Every word is prefixed with the "_" marker and broken into single
    characters; the characters joined by spaces form the vocabulary key.

    Returns a ``(vocabulary, charset)`` tuple. ``vocabulary`` is a
    ``defaultdict(int)`` mapping each space-separated character sequence to
    how many times its word appears in ``corpus``; ``charset`` is the set of
    every character seen, the marker included.
    """
    vocabulary = defaultdict(int)
    charset = set()
    for word in corpus:
        marked = "_" + word
        # Iterating a string yields its characters one at a time.
        charset.update(marked)
        vocabulary[" ".join(marked)] += 1
    return vocabulary, charset

In [None]:
# count adjacent token pairs so the most frequent pair can be merged

def get_pair_counts(vocabulary):
    """Count adjacent token pairs across the tokenized vocabulary.

    ``vocabulary`` maps whitespace-separated token sequences to word counts.
    Each adjacent ``(left, right)`` token pair contributes the word's count
    once per occurrence in that word.

    Returns a ``defaultdict(int)`` keyed by ``(left, right)`` tuples.
    """
    pair_counts = defaultdict(int)
    for tokenized_word, count in vocabulary.items():
        symbols = tokenized_word.split()
        # Pairing the sequence with itself shifted by one walks all neighbours.
        for pair in zip(symbols, symbols[1:]):
            pair_counts[pair] += count
    return pair_counts

In [3]:
# merge pairs

import re

def merge_pair(vocabulary, pair):
    """Merge one token pair in every tokenized word of the vocabulary.

    ``vocabulary`` maps space-separated token sequences to counts and
    ``pair`` is the ``(left, right)`` tuple to fuse. Every occurrence of
    ``left right`` that stands as two whole tokens is replaced by the single
    token ``left + right``.

    Returns a new plain ``dict``; the input mapping is not modified. If two
    input keys become identical after merging, their counts are summed.
    """
    merged_token = "".join(pair)
    bigram = re.escape(" ".join(pair))
    # The lookarounds require whitespace (or string edge) on both sides, so
    # only whole tokens match, never the tail or head of a longer token.
    pattern = re.compile(r"(?<!\S)" + bigram + r"(?!\S)")

    new_vocabulary = {}
    for tokenized_word, count in vocabulary.items():
        # A callable replacement is inserted verbatim; a plain string would be
        # parsed as a template and mangle tokens that contain a backslash.
        new_tokenized_word = pattern.sub(lambda _match: merged_token, tokenized_word)
        # Accumulate rather than assign so colliding keys do not lose counts.
        new_vocabulary[new_tokenized_word] = (
            new_vocabulary.get(new_tokenized_word, 0) + count
        )
    return new_vocabulary


In [None]:
# implements BPE algorithm
def byte_pair_encoding(corpus, vocab_size):
    """Run BPE training on a corpus and return the pieces a tokenizer needs.

    Starting from the character-level vocabulary, the most frequent adjacent
    token pair is merged repeatedly until the token set holds ``vocab_size``
    entries or no pairs remain to merge.

    Returns ``(vocabulary, merges, charset, tokens)``: the final tokenized
    vocabulary, the merged pairs in the order they were applied, the initial
    character set, and the set of all tokens (characters plus merged tokens).
    """
    vocabulary, charset = initialize_vocabulary(corpus)
    # Copy so that adding merged tokens leaves the returned charset untouched.
    tokens = set(charset)
    merges = []
    while len(tokens) < vocab_size:
        pair_counts = get_pair_counts(vocabulary)
        if not pair_counts:
            # Every word is already a single token; nothing left to merge.
            break
        best_pair = max(pair_counts, key=pair_counts.get)
        vocabulary = merge_pair(vocabulary, best_pair)
        merges.append(best_pair)
        tokens.add("".join(best_pair))

    return vocabulary, merges, charset, tokens

In [None]:
# tokenize a word using a trained tokenizer

def tokenize_word(word, merges, vocabulary, charset, unk_token="<UNK>"):
    """Tokenize a single word with the outputs of ``byte_pair_encoding``.

    The word is prefixed with the "_" marker. If the marked word is itself a
    key of ``vocabulary`` it is returned as one token. Otherwise it is split
    into characters (those not in ``charset`` become ``unk_token``) and each
    pair in ``merges`` is applied in order, fusing adjacent tokens.

    Returns the list of resulting tokens.
    """
    word = "_" + word
    if word in vocabulary:
        return [word]
    tokens = [char if char in charset else unk_token for char in word]

    for left, right in merges:
        i = 0
        while i < len(tokens) - 1:
            if tokens[i] == left and tokens[i + 1] == right:
                # Collapse the two tokens into one. Writing back
                # [left, right] would change nothing and loop forever.
                tokens[i:i + 2] = [left + right]
            else:
                i += 1
    return tokens
