In [1]:
with open("./corpus.txt", encoding="utf-8") as f:
    dataset = f.read()

import pandas as pd
df = pd.DataFrame(data= dataset.split())

In [2]:
df

Unnamed: 0,0
0,lit
1,bet
2,nocap
3,cap
4,flex
...,...
111,glowup
112,wavy
113,boujee
114,glowedup


In [3]:
from collections import defaultdict
def get_pair_freq(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        for i in range(len(word)-1):
            pair = (word[i] , word[i+1])
            pairs[pair] += freq
        
    return pairs


In [4]:
def merge_vocab(pair, vocab):
    new_vocab = {}
    bigram = ' '.join(pair)
    replacement = ''.join(pair)  # e.g. ('l','o') → 'lo'

    for word, freq in vocab.items():
        # Convert tuple of symbols to string with spaces between symbols
        word_str = ' '.join(word)
        # Replace all exact occurrences of the bigram
        new_word_str = word_str.replace(bigram, replacement)
        # Convert back to tuple for consistent representation
        new_vocab[tuple(new_word_str.split(' '))] = freq

    return new_vocab

In [5]:
def token_bpe(data, num_merges):
    vocab_dict = {}
    for word in data:
        char = list(word) + ["</w>"]
        vocab_dict[tuple(char)]  = vocab_dict.get(tuple(char), 0) + 1
    
    merges = []
    for i in range(num_merges):
        pairs = get_pair_freq(vocab_dict)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        merges.append(best)

        vocab_dict = merge_vocab(best, vocab_dict)
    
    return merges

# tokenization (encoding)

In [6]:
def merge_in_word(word, pair):
    """Merge the pair inside a single word represented as a list of symbols."""
    merged = []
    i = 0
    while i < len(word):
        if i < len(word) - 1 and (word[i], word[i+1]) == pair:
            merged.append(word[i] + word[i+1])
            i += 2
        else:
            merged.append(word[i])
            i += 1
    return merged

def encode_word(word, merges):
    word = list(word) + ["</w>"]
    while True:
        pairs = [(word[i], word[i+1]) for i in range(len(word)-1)]
        merge_candidates = [p for p in pairs if p in merges]
        if not merge_candidates:
            break
        # Merge the first applicable one according to merges priority
        pair_to_merge = min(merge_candidates, key=lambda p: merges.index(p))
        word = merge_in_word(word, pair_to_merge)
    return word


# Assigning id's

In [7]:
# Assign ids for an example encoded word (compute merges/encoded if missing)
if 'merges' not in globals():
    # token_bpe expects an iterable of words
    merges = token_bpe(dataset.split(), num_merges=10)

if 'encoded' not in globals():
    encoded = encode_word("delulu", merges)

tokens = encoded  # encoded should come from encode_word(...)
# preserve token order and remove duplicates when creating the vocab
unique_tokens = list(dict.fromkeys(tokens))
token_to_id = {token: i for i, token in enumerate(unique_tokens)}
id_to_token = {i: token for token, i in token_to_id.items()}

ids = [token_to_id[tok] for tok in tokens]

print("tokens:", tokens)
print("unique tokens:", unique_tokens)
print("ids:", ids)

tokens: ['d', 'e', 'l', 'u', 'l', 'u', '</w>']
unique tokens: ['d', 'e', 'l', 'u', '</w>']
ids: [0, 1, 2, 3, 2, 3, 4]


# Decoding

In [8]:
def decode(ids, id_to_token):
    tokens = [id_to_token[i] for i in ids]
    text = "".join(tokens).replace("</w>", " ")
    return text.strip()

In [9]:
merges = token_bpe(dataset, num_merges=10)
print("Learned merges:", merges)

encoded = encode_word("delulu", merges)
print("Tokens:", encoded)

Learned merges: [('\n', '</w>'), ('e', '</w>'), ('a', '</w>'), ('i', '</w>'), ('t', '</w>'), ('o', '</w>'), ('s', '</w>'), ('c', '</w>'), ('n', '</w>'), ('g', '</w>')]
Tokens: ['d', 'e', 'l', 'u', 'l', 'u', '</w>']
