# Unigram tokenization

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install required packages for building a Unigram tokenizer from scratch
# - datasets: For loading and processing text datasets
# - evaluate: For model evaluation metrics
# - transformers[sentencepiece]: Core library with SentencePiece support (Unigram uses SentencePiece)
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Toy training corpus: four short English sentences
# Use the same training corpus to compare Unigram with BPE and WordPiece
# Unigram starts with a large vocabulary and progressively removes the least useful tokens
# This is the opposite of BPE/WordPiece, which start small and grow the vocabulary
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [None]:
# Load the pretrained XLNet tokenizer to reuse its pre-tokenization step
# XLNet uses the Unigram algorithm with a SentencePiece implementation
# The cells below only call its pre-tokenizer to split raw text into words
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

In [None]:
# Step 1: Count how often each pre-tokenized word occurs in the corpus
# Words are produced by the XLNet tokenizer's own pre-tokenizer, so the splits
# follow its conventions rather than those of BERT or GPT-2
from collections import defaultdict

word_freqs = defaultdict(int)
for text in corpus:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is counted
    for word, _offsets in tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text):
        word_freqs[word] += 1

word_freqs

In [None]:
# Step 2: Collect the candidates for the initial (oversized) vocabulary
# Unlike BPE/WordPiece, Unigram starts from a large pool of substrings
# and later removes the least useful ones to reach the target vocabulary size
char_freqs = defaultdict(int)
subwords_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    for start, char in enumerate(word):
        # Each character occurrence is weighted by the frequency of its word
        char_freqs[char] += freq
        # Every substring of length 2 or more that begins at this position
        for end in range(start + 2, len(word) + 1):
            subwords_freqs[word[start:end]] += freq

# Most frequent subwords first; the sort is stable, so ties keep insertion order
sorted_subwords = sorted(subwords_freqs.items(), key=lambda item: item[1], reverse=True)
sorted_subwords[:10]

In [None]:
# Step 3: Assemble the initial vocabulary together with its frequencies
# Every individual character is kept, then the most frequent multi-character
# subwords fill the remaining slots, for INITIAL_VOCAB_SIZE tokens in total
# This forms the starting vocabulary that will be pruned down
INITIAL_VOCAB_SIZE = 300

# Clamp at zero: a negative slice bound would silently select the wrong subwords
# if the characters alone already exceeded the budget
num_subwords = max(0, INITIAL_VOCAB_SIZE - len(char_freqs))

# Characters come first, followed by the top subwords (insertion order is preserved)
token_freqs = dict(char_freqs)
token_freqs.update(sorted_subwords[:num_subwords])

In [None]:
# Step 4: Turn token frequencies into Unigram scores
# A token's score is the negative log of its relative frequency,
# so more frequent tokens get lower (better) scores
from math import log

total_sum = sum(token_freqs.values())
model = {token: -log(count / total_sum) for token, count in token_freqs.items()}

In [None]:
# Step 5: Unigram encoding using dynamic programming (Viterbi algorithm)
# Unlike the greedy BPE/WordPiece approaches, this searches all segmentations
# and keeps the one whose tokens have the minimum total score
def encode_word(word, model):
    """Segment ``word`` into the lowest-scoring sequence of tokens from ``model``.

    Args:
        word: The string to segment.
        model: Mapping from token string to its score (lower is better).

    Returns:
        A ``(tokens, score)`` tuple. ``tokens`` is the list of substrings that
        concatenate back to ``word``. ``score`` is the sum of the chosen tokens'
        scores plus the base value 1 that the search starts from.
        If no combination of tokens in ``model`` covers the word, the result is
        ``(["<unk>"], None)``. An empty ``word`` gives ``([], 1)``.
    """
    # An empty word contains no tokens; without this guard the backtracking
    # below would emit a spurious "" token.
    if not word:
        return [], 1

    # best[i] describes the cheapest known segmentation of word[:i]:
    # "start" is where its last token begins, "score" is its total score.
    # Position 0 is seeded with score 1, so every returned score includes that offset.
    best = [{"start": 0, "score": 1}] + [
        {"start": None, "score": None} for _ in range(len(word))
    ]

    for start_idx in range(len(word)):
        score_so_far = best[start_idx]["score"]
        if score_so_far is None:
            # word[:start_idx] cannot be segmented, so no token can start here
            continue

        # Try every token that starts at start_idx
        for end_idx in range(start_idx + 1, len(word) + 1):
            token = word[start_idx:end_idx]
            if token not in model:
                continue
            candidate = model[token] + score_so_far
            current = best[end_idx]["score"]
            # Keep the candidate only if it is strictly cheaper than what we have
            if current is None or current > candidate:
                best[end_idx] = {"start": start_idx, "score": candidate}

    final = best[-1]
    if final["score"] is None:
        return ["<unk>"], None  # No valid tokenization found

    # Walk the "start" pointers back from the end of the word, collecting
    # tokens right-to-left, then restore reading order.
    tokens = []
    end = len(word)
    while end > 0:
        start = best[end]["start"]
        tokens.append(word[start:end])
        end = start
    tokens.reverse()
    return tokens, final["score"]

In [None]:
# Step 6: Try the encoder on two sample words
# Each printed line is the (tokens, score) pair returned by encode_word
# "Hopefully" is expected to break into small pieces because of the limited vocabulary
# "This" is expected to come back as a single token with a low score (high frequency)
for sample_word in ("Hopefully", "This"):
    print(encode_word(sample_word, model))

In [None]:
# Step 7: Loss of a model over the whole training corpus
# The loss is the frequency-weighted sum of the best segmentation score of each word
# A lower loss means the model fits the training data better
def compute_loss(model):
    """Return the total score of the corpus words under ``model``.

    Each word in the global ``word_freqs`` is segmented with ``encode_word``;
    its score is multiplied by the word's frequency and the products are summed.
    """
    total = 0
    for word, count in word_freqs.items():
        word_score = encode_word(word, model)[1]
        total += count * word_score  # frequent words weigh more
    return total

In [None]:
# Step 8: Compute the loss of the initial model, with the full starting vocabulary
# This is the baseline value before any vocabulary pruning takes place
compute_loss(model)

In [None]:
# Step 9: Function to evaluate the impact of removing each token
# Unigram removes the tokens that contribute least to model performance
# For each token we measure how much the loss increases once it is gone
import copy

def compute_scores(model):
    """Return, for every multi-character token, the loss increase caused by removing it.

    Args:
        model: Mapping from token string to its score.

    Returns:
        Dict mapping each token longer than one character to
        ``compute_loss(model without that token) - compute_loss(model)``.
        Single-character tokens are skipped so they are never candidates for removal.
    """
    scores = {}
    model_loss = compute_loss(model)
    for token in model:
        # Always keep single character tokens (can't be removed)
        if len(token) == 1:
            continue
        # A shallow copy is enough: only a key is dropped, the score values
        # themselves are never modified
        model_without_token = copy.copy(model)
        del model_without_token[token]
        scores[token] = compute_loss(model_without_token) - model_loss
    return scores

In [None]:
# Step 10: Inspect the removal score of two tokens
# A score of 0 means the loss is unchanged without the token, so it can be safely removed
# Higher scores indicate tokens that matter more to the model
scores = compute_scores(model)
# Expected: "ll" contributes to model fit (positive score), "his" contributes nothing (score = 0)
for probe_token in ("ll", "his"):
    print(scores[probe_token])

In [None]:
# Step 11: Main Unigram training loop - iteratively remove the least useful tokens
# This is the core of Unigram training: start big, shrink towards the target size
# Each pass drops the tokens whose removal increases the loss the least
percent_to_remove = 0.1  # Remove 10% of the current vocabulary each iteration
target_vocab_size = 100  # Keep pruning while the vocabulary is larger than this
while len(model) > target_vocab_size:
    scores = compute_scores(model)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1])

    # Only multi-character tokens are scored, so there may be fewer removable
    # tokens than the 10% quota; clamp to avoid indexing past the end
    num_to_remove = min(int(len(model) * percent_to_remove), len(sorted_scores))
    if num_to_remove == 0:
        # Nothing left that may be removed: the vocabulary cannot shrink further
        break

    # Remove the least useful tokens (lowest scores first)
    for token, _score in sorted_scores[:num_to_remove]:
        token_freqs.pop(token)

    # Recompute model probabilities with the reduced vocabulary
    total_sum = sum(token_freqs.values())
    model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

In [None]:
# Step 12: Complete Unigram tokenization function
# Combines pre-tokenization with optimal Unigram segmentation
def tokenize(text, model):
    """Tokenize ``text`` with the Unigram ``model``.

    The text is first split into words by the XLNet pre-tokenizer, then each
    word is segmented with ``encode_word``.

    Args:
        text: The raw string to tokenize.
        model: Mapping from token string to its score.

    Returns:
        A flat list of tokens in reading order. A word that cannot be segmented
        contributes the single token ``"<unk>"``.
    """
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # Flatten in one pass instead of sum(lists, []), which re-copies the
    # accumulated list for every word
    return [
        token
        for word, _offsets in words_with_offsets
        for token in encode_word(word, model)[0]
    ]

# Test the final Unigram tokenizer
tokenize("This is the Hugging Face course.", model)