# 1) Word tokenization

In [10]:
import re

def word_tokenization(text):
    # tokens = re.findall(r"\w+['-]?\w*|\d+|[^\s\w]", text) #using regx
    tokens = text.split() #using split function
    return tokens

if __name__ == "__main__":
    text = "I'm learning AI/ML — it's fun! 2025 is great."
    print(word_tokenization(text))

["I'm", 'learning', 'AI/ML', '—', "it's", 'fun!', '2025', 'is', 'great.']


# 2) Character tokenization

In [11]:
def character_tokenization(text, include_spaces = False):
    l = []
    if include_spaces:
        return list(text)
    else:
        for char in text:
            if not char.isspace():
                l.append(char)
    return l


if __name__ == "__main__":
    text = "hello world"
    print(character_tokenization(text))
    print(character_tokenization(text, True))



['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd']
['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd']


# 3) Subword tokenization

In [12]:
from collections import Counter
def build_vocab_from_words(words, min_count=1):
    c = Counter(' '.join(words).ssplit())
    return set(w for w, cnt in c.items() if cnt >= min_count)

def greedy_subword_tokenize(word, vocab, unk_token = '[UNK]'):
    tokens = []
    i = 0
    while i < len(word):
        match = None
        for j in range(len(word), i, -1):
            piece = word[i:j]
            if piece in vocab:
                match = piece
                tokens.append(piece)
                i = j
                break
        if match is None:
            return [unk_token]
    return tokens

if __name__ == "__main__":
    vocab = {'play', 'ing', 'play', 'ground', 'able', 'playground'}
    print(greedy_subword_tokenize('playground', vocab)) 
    print(greedy_subword_tokenize('playing', vocab)) 

['playground']
['play', 'ing']


# 4) Byte-Pair Encoding (BPE)

In [13]:
# simple_bpe.py
from collections import Counter, defaultdict
import re

def get_stats(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[(symbols[i], symbols[i+1])] += freq
    return pairs

def merge_vocab(pair, v_in):
    v_out = {}
    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    pattern = re.compile(r'(?<!\S)'+re.escape(bigram)+r'(?!\S)')
    for word in v_in:
        new_word = pattern.sub(replacement, word)
        v_out[new_word] = v_in[word]
    return v_out

def train_bpe(corpus, num_merges=50):
    # corpus: list of words (strings)
    vocab = Counter()
    for w in corpus:
        vocab[' '.join(list(w)) + ' </w>'] += 1
    merges = []
    for i in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        merges.append(best)
    return merges, vocab

def apply_bpe(token, merges):
    word = ' '.join(list(token)) + ' </w>'
    for a,b in merges:
        bigram = a + ' ' + b
        word = word.replace(bigram, a+b)
    return word.split()

if __name__ == "__main__":
    corpus = ["low", "lower", "newest", "widest", "low", "low", "lower"]
    merges, final_vocab = train_bpe(corpus, num_merges=10)
    print("merges:", merges)
    print("tokenized:", apply_bpe("lower", merges))


merges: [('l', 'o'), ('lo', 'w'), ('low', '</w>'), ('low', 'e'), ('lowe', 'r'), ('lower', '</w>'), ('e', 's'), ('es', 't'), ('est', '</w>'), ('n', 'e')]
tokenized: ['lower</w>']


# 5) WordPiece

In [14]:
# wordpiece_greedy.py
# WordPiece tokenization at inference time is greedy longest-match using a pre-built vocabulary.
def wordpiece_tokenize(text, vocab, unk_token='[UNK]'):
    tokens = []
    for word in text.split():
        i = 0
        sub_tokens = []
        while i < len(word):
            j = len(word)
            cur_sub = None
            while j > i:
                substr = (word[i:j] if i==0 else '##' + word[i:j])  # '##' denotes continuation
                if substr in vocab:
                    cur_sub = substr
                    break
                j -= 1
            if cur_sub is None:
                sub_tokens = [unk_token]
                break
            sub_tokens.append(cur_sub)
            i = j
        tokens.extend(sub_tokens)
    return tokens

if __name__ == "__main__":
    vocab = {"play","##ing","ground","##er","##est","playground","[UNK]"}
    print(wordpiece_tokenize("playing playground", vocab))
    # -> ['play','##ing','playground']

['play', '##ing', 'playground']


# 6) SentencePiece 

In [15]:
# sentencepiece_example.py
import sentencepiece as spm
import os

# 1) create a tiny training file
with open('text.txt', 'w', encoding='utf8') as f:
    f.write("This is a sample sentence.\nAnother sentence for training.\nPlaying and playground.\n")

# 2) train (model_type can be 'bpe', 'unigram', or 'word')
spm.SentencePieceTrainer.Train('--input=text.txt --model_prefix=m --vocab_size=200 --model_type=bpe')

# 3) load and tokenize
sp = spm.SentencePieceProcessor()
sp.Load('m.model')

s = "Playing playground is fun."
print(sp.EncodeAsPieces(s))  # subword pieces
print(sp.EncodeAsIds(s))     # numeric ids


ModuleNotFoundError: No module named 'sentencepiece'

# 7) Unigram language model

In [16]:
# train a unigram model
spm.SentencePieceTrainer.Train('--input=text.txt --model_prefix=unigram_m --vocab_size=100 --model_type=unigram')

# then in Python
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('unigram_m.model')
print(sp.EncodeAsPieces("Playing playground is fun."))

NameError: name 'spm' is not defined