In [1]:
import re
from collections import Counter, defaultdict

In [26]:
class BytePairEncoding:
    def __init__(self, num_merges):
        self.num_merges = num_merges
        self.vocab = {}
        self.bpe_ranks = {}
        self.inverse_vocab = {}

    def get_stats(self, corpus):
        pairs = defaultdict(int)
        for word, freq in corpus.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[(symbols[i], symbols[i + 1])] += freq
        return pairs

    def merge_vocab(self, pair, corpus):
        new_corpus = {}
        bigram = re.escape(' '.join(pair))
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        for word in corpus:
            new_word = p.sub(''.join(pair), word)
            new_corpus[new_word] = corpus[word]
        return new_corpus

    def fit(self, texts):
        # Build initial corpus with space-separated characters
        corpus = {' '.join(list(word)) + ' </w>': freq for word, freq in Counter(texts).items()}

        # Perform num_merges merges
        for i in range(self.num_merges):
            pairs = self.get_stats(corpus)
            if not pairs:
                break
            best = max(pairs, key=pairs.get)
            corpus = self.merge_vocab(best, corpus)
            self.bpe_ranks[best] = i

        # Build vocabulary with indices (ENSURE ALL UNIQUE TOKENS)
        unique_tokens = set()
        for word in corpus.keys():
            unique_tokens.update(word.split())
        # for encode
        self.vocab = {token: idx for idx, token in enumerate(unique_tokens)}
        # for decode
        self.inverse_vocab = {idx: token for token, idx in self.vocab.items()}

    def encode(self, word):
        word = ' '.join(list(word)) + ' </w>'
        while True:
            pairs = [(word[i], word[i + 1]) for i in range(len(word) - 1)]
            pairs = [(p[0], p[1]) for p in pairs if (p[0], p[1]) in self.bpe_ranks]
            if not pairs:
                break
            pair = min(pairs, key=lambda pair: self.bpe_ranks[pair])
            word = word.replace(' '.join(pair), ''.join(pair))
        return word.split()

    def tokenize(self, text):
        encoded_text = self.encode(text)
        # Handle the case where encoded subwords are not in the vocab
        tokenized = []
        for subword in encoded_text:
            if subword in self.vocab:
                tokenized.append(self.vocab[subword])
            else:
                # Adding unknown token handling (optional)
                print(f"Warning: subword '{subword}' not found in vocabulary.")
        return tokenized

    def decode(self, tokens):
        subwords = [self.inverse_vocab[token] for token in tokens if token in self.inverse_vocab]
        return ''.join(subwords).replace('</w>', '')


In [27]:
sample_text = """
In a distant land, there was a village surrounded by lush forests and towering mountains. The villagers lived in harmony with nature, cultivating their fields and tending to their animals. Each day, the sun rose over the mountains, casting a golden glow across the village. Birds sang sweet melodies, and the air was filled with the scent of blooming flowers.

In this village, there was a young girl named Lily. She had a curious mind and a kind heart. Every morning, she would explore the forest, discovering new plants and animals. She dreamed of becoming a great healer, using the gifts of nature to help those in need. Her grandmother, who was the village healer, taught her about the medicinal properties of various herbs and plants.

One day, a stranger arrived in the village. He was a traveler from a far-off land, seeking knowledge and adventure. Lily was fascinated by his stories and eagerly listened to his tales of distant places and incredible feats. The traveler shared his wisdom with the villagers, teaching them new ways to cultivate their land and improve their lives.

As the seasons changed, the village continued to thrive. Lily grew up to become a respected healer, and the traveler became a cherished friend to all. Together, they showed that with knowledge, kindness, and a little bit of curiosity, anything is possible.
"""
# Preprocess the text to create a list of words
texts = sample_text.lower().split()

In [28]:
# Example usage
bpe = BytePairEncoding(num_merges=10)
bpe.fit(texts)

print("Vocabulary:", bpe.vocab)

Vocabulary: {'s': 0, 'w': 1, 'd</w>': 2, 'o': 3, 'y': 4, 'm': 5, 'an': 6, 'k': 7, 'i': 8, 's</w>': 9, 'n': 10, '</w>': 11, 'v': 12, '.</w>': 13, 'h': 14, 'c': 15, 'd': 16, 'e</w>': 17, 'x': 18, '-': 19, 'p': 20, 'e': 21, 'f': 22, 'g': 23, 'in': 24, 'a': 25, 'he': 26, 'the': 27, ',</w>': 28, 'er': 29, 'r': 30, 't': 31, 'l': 32, 'u': 33, 'b': 34}


In [31]:
# Tokenize a new text
new_text = "bell"
encoded_text = bpe.encode(new_text)
tokenized_text = bpe.tokenize(new_text)
decoded_text = bpe.decode(tokenized_text)
print(f"Encoded '{new_text}':", encoded_text)
print(f"Tokenized '{new_text}':", tokenized_text)
print(f"Decoded '{tokenized_text}: {decoded_text}'")

Encoded 'bell': ['b', 'e', 'l', 'l', '</w>']
Tokenized 'bell': [34, 21, 32, 32, 11]
Decoded '[34, 21, 32, 32, 11]: bell'


In [32]:
# Tokenize a new text
new_text = "sell"
encoded_text = bpe.encode(new_text)
tokenized_text = bpe.tokenize(new_text)
decoded_text = bpe.decode(tokenized_text)
print(f"Encoded '{new_text}':", encoded_text)
print(f"Tokenized '{new_text}':", tokenized_text)
print(f"Decoded '{tokenized_text}: {decoded_text}'")

Encoded 'sell': ['s', 'e', 'l', 'l', '</w>']
Tokenized 'sell': [0, 21, 32, 32, 11]
Decoded '[0, 21, 32, 32, 11]: sell'


In [33]:
# Tokenize a new text
new_text = "bush"
encoded_text = bpe.encode(new_text)
tokenized_text = bpe.tokenize(new_text)
decoded_text = bpe.decode(tokenized_text)
print(f"Encoded '{new_text}':", encoded_text)
print(f"Tokenized '{new_text}':", tokenized_text)
print(f"Decoded '{tokenized_text}: {decoded_text}'")

Encoded 'bush': ['b', 'u', 's', 'h', '</w>']
Tokenized 'bush': [34, 33, 0, 14, 11]
Decoded '[34, 33, 0, 14, 11]: bush'


In [34]:
# Tokenize a new text
new_text = "shit"
encoded_text = bpe.encode(new_text)
tokenized_text = bpe.tokenize(new_text)
decoded_text = bpe.decode(tokenized_text)
print(f"Encoded '{new_text}':", encoded_text)
print(f"Tokenized '{new_text}':", tokenized_text)
print(f"Decoded '{tokenized_text}: {decoded_text}'")

Encoded 'shit': ['s', 'h', 'i', 't', '</w>']
Tokenized 'shit': [0, 14, 8, 31, 11]
Decoded '[0, 14, 8, 31, 11]: shit'
