# 1) Word tokenization

In [2]:
import re

def word_tokenization(text):
    """Split ``text`` into tokens on runs of whitespace.

    Only whitespace separates tokens, so punctuation stays attached to the
    neighbouring word (``"fun!"``) and a free-standing symbol such as a dash
    becomes its own token.
    """
    # A regex-based alternative that also splits punctuation off words would
    # be: re.findall(r"\w+['-]?\w*|\d+|[^\s\w]", text)
    return text.split()

if __name__ == "__main__":
    # Demo: tokenize a sample sentence containing contractions, a slash,
    # a dash, punctuation and a number, then print the resulting tokens.
    text = "I'm learning AI/ML — it's fun! 2025 is great."
    print(word_tokenization(text))

["I'm", 'learning', 'AI/ML', '—', "it's", 'fun!', '2025', 'is', 'great.']


# 2) Character tokenization

In [4]:
def character_tokenization(text, include_spaces=False):
    """Break ``text`` into a list of single-character tokens.

    Args:
        text: The string to tokenize.
        include_spaces: When true, every character (whitespace included) is
            returned. When false, whitespace characters are dropped.

    Returns:
        A list of one-character strings in their original order.
    """
    if include_spaces:
        return list(text)
    return [ch for ch in text if not ch.isspace()]


if __name__ == "__main__":
    # Demo: show the same input tokenized without and with whitespace kept.
    text = "hello world"
    print(character_tokenization(text))
    print(character_tokenization(text, True))



['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd']
['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd']


# 3) Subword tokenization

In [7]:
from collections import Counter
def build_vocab_from_words(words, min_count=1):
    """Build a vocabulary set from an iterable of strings.

    Each entry of ``words`` is split on whitespace, so an entry may hold
    several tokens; every resulting token is counted.

    Args:
        words: Iterable of strings to count tokens from.
        min_count: Minimum number of occurrences a token needs to be kept.

    Returns:
        The set of tokens that occur at least ``min_count`` times.
    """
    # Joining with a space and re-splitting flattens multi-token entries.
    # (The original called the non-existent ``str.ssplit`` and always raised.)
    counts = Counter(' '.join(words).split())
    return {token for token, cnt in counts.items() if cnt >= min_count}

def greedy_subword_tokenize(word, vocab, unk_token='[UNK]'):
    """Segment ``word`` into vocabulary pieces by greedy longest match.

    Starting at the left, the longest substring found in ``vocab`` is taken
    as the next piece and scanning resumes right after it.

    Args:
        word: The string to segment.
        vocab: Container of known pieces (membership is tested with ``in``).
        unk_token: Token returned when the word cannot be fully segmented.

    Returns:
        The list of pieces, or ``[unk_token]`` if at some position no
        substring is in ``vocab`` (pieces found so far are discarded).
        An empty ``word`` yields an empty list.
    """
    pieces = []
    start = 0
    length = len(word)
    while start < length:
        # Try the longest candidate first, shrinking one character at a time.
        for end in range(length, start, -1):
            candidate = word[start:end]
            if candidate in vocab:
                pieces.append(candidate)
                start = end
                break
        else:
            # Nothing starting at ``start`` is known: give up on the word.
            return [unk_token]
    return pieces

if __name__ == "__main__":
    # Demo: 'playground' is matched whole because the longest piece wins,
    # while 'playing' is split into two known pieces.
    # ('play' was listed twice in the original literal; a set keeps one copy.)
    vocab = {'play', 'ing', 'ground', 'able', 'playground'}
    print(greedy_subword_tokenize('playground', vocab))
    print(greedy_subword_tokenize('playing', vocab))

['playground']
['play', 'ing']


# 4) Byte-Pair Encoding (BPE)

In [None]:
# simple_bpe.py
from collections import Counter, defaultdict
import re

def get_stats(vocab):
    """Count adjacent symbol pairs across a BPE vocabulary.

    Args:
        vocab: Mapping from a space-separated symbol sequence to its
            frequency.

    Returns:
        A ``defaultdict(int)`` mapping each ``(left, right)`` pair of
        neighbouring symbols to the summed frequency of the entries in which
        it occurs (counted once per occurrence).
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        parts = entry.split()
        # Pair every symbol with its right-hand neighbour.
        for left, right in zip(parts, parts[1:]):
            pair_counts[(left, right)] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """Merge every occurrence of a symbol pair in a BPE vocabulary.

    Args:
        pair: Two-element sequence ``(left, right)`` of symbols to fuse.
        v_in: Mapping from a space-separated symbol sequence to its
            frequency.

    Returns:
        A new dict in which each key has had ``left right`` replaced by the
        single symbol ``leftright``. If several input keys become identical
        after merging, their frequencies are summed.
    """
    bigram = ' '.join(pair)
    merged_symbol = ''.join(pair)
    # The lookarounds make the match start and end on symbol boundaries, so
    # a pair is never matched inside a longer symbol.
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')
    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement is inserted literally; a plain string would
        # have its backslashes interpreted as regex template escapes.
        new_word = pattern.sub(lambda _m: merged_symbol, word)
        # Accumulate rather than assign so colliding keys keep their counts.
        v_out[new_word] = v_out.get(new_word, 0) + freq
    return v_out

def train_bpe(corpus, num_merges=50):
    """Learn BPE merge rules from a list of words.

    Each word is turned into its space-separated characters followed by the
    end-of-word marker ``</w>``. The most frequent adjacent pair is then
    merged repeatedly.

    Args:
        corpus: List of words (strings).
        num_merges: Maximum number of merges to learn.

    Returns:
        A tuple ``(merges, vocab)``: the learned pairs in the order they were
        merged, and the final mapping from symbol sequence to frequency.
    """
    vocab = Counter(' '.join(w) + ' </w>' for w in corpus)
    merges = []
    for _ in range(num_merges):
        pair_counts = get_stats(vocab)
        if not pair_counts:
            # Every word is already a single symbol; nothing left to merge.
            break
        most_frequent = max(pair_counts, key=pair_counts.get)
        vocab = merge_vocab(most_frequent, vocab)
        merges.append(most_frequent)
    return merges, vocab

def apply_bpe(token, merges):
    """Segment ``token`` by applying learned BPE merges in order.

    Args:
        token: The word to segment.
        merges: Sequence of ``(left, right)`` symbol pairs, in the order they
            should be applied.

    Returns:
        The list of symbols after all merges; the end-of-word marker
        ``</w>`` is the last symbol or is fused into it.
    """
    # Start from single characters plus the end-of-word marker. Splitting on
    # whitespace drops any whitespace characters contained in the token.
    symbols = (' '.join(token) + ' </w>').split()
    for a, b in merges:
        # Merge on whole symbols only. A plain substring replace on the
        # joined string could fuse the tail of one symbol with the next
        # (e.g. ('b', 'c') matching inside 'ab c').
        merged = []
        i = 0
        while i < len(symbols):
            if i + 1 < len(symbols) and symbols[i] == a and symbols[i + 1] == b:
                merged.append(a + b)
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        symbols = merged
    return symbols

if __name__ == "__main__":
    # Demo: learn up to 10 merges from a tiny corpus, then segment "lower"
    # with the learned merges. final_vocab is returned by train_bpe but is
    # not used further in this demo.
    corpus = ["low", "lower", "newest", "widest", "low", "low", "lower"]
    merges, final_vocab = train_bpe(corpus, num_merges=10)
    print("merges:", merges)
    print("tokenized:", apply_bpe("lower", merges))
