# Building a tokenizer, block by block

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install the packages this notebook needs to build custom tokenizers
# - datasets: For loading and processing text datasets
# - evaluate: For model evaluation metrics
# - transformers[sentencepiece]: Core library with SentencePiece support
# NOTE(review): `tokenizers` (imported in later cells) is not listed explicitly;
# presumably it is pulled in as a dependency of transformers - confirm
# NOTE(review): this shells out to `uv`, which must be on PATH; `%pip install`
# would target the running kernel's environment instead - confirm which is intended
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Load the raw WikiText-2 training split for tokenizer training
# This is a larger, realistic dataset compared to our previous small corpus
# `dataset` is a notebook-level variable: the generator defined below and the
# text-file export cell both read from it
from datasets import load_dataset

dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")

def get_training_corpus(batch_size=1000):
    """Yield the training texts from the notebook-level `dataset` in batches.

    Args:
        batch_size: Number of examples per yielded batch (default 1000).
            Must be a positive integer.

    Yields:
        The "text" column of each consecutive slice of `dataset`; the last
        batch may be shorter than `batch_size`.

    Raises:
        ValueError: If `batch_size` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    # Slice the dataset rather than indexing row by row, so each step
    # hands the trainer a whole batch of texts
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size]["text"]

In [None]:
# Alternative: Save the dataset to a text file for file-based training
# (the `tokenizer.train([...])` cells below read "wikitext-2.txt")
# Iterate over the dataset rows directly instead of indexing by position;
# each example's "text" is written followed by a newline
with open("wikitext-2.txt", "w", encoding="utf-8") as f:
    for example in dataset:
        f.write(example["text"] + "\n")

In [None]:
# Build a WordPiece tokenizer from scratch using the tokenizers library
# This demonstrates how to construct tokenizers programmatically
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Initialize with a WordPiece model created without a vocabulary;
# "[UNK]" is the token used for unknown words
# The vocabulary itself is learned in the training cells further down
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [None]:
# Add BERT-style normalization (handles case and accents)
# BertNormalizer combines several text normalization steps:
# - Lowercasing (since we specify lowercase=True)
# - Unicode normalization
# - Accent stripping
# Note: if the "Alternative" cell that follows is also run, it replaces this normalizer
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [None]:
# Alternative: Build the normalization pipeline by hand from individual steps
# This gives you more control over what is applied and in which order:
# NFD Unicode decomposition -> lowercasing -> accent stripping
# Assigning here overwrites any normalizer set earlier (e.g. the BertNormalizer cell)
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

In [None]:
# Test the normalization pipeline on a string with capitals and accents
# Only the normalizer currently assigned to the tokenizer is applied here
# See how accented characters are converted to their base forms and lowercased
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

In [None]:
# Set up BERT-style pre-tokenization
# BertPreTokenizer splits on whitespace and punctuation, similar to our manual implementation
# Note: if the "Alternative" cell that follows is also run, it replaces this pre-tokenizer
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [None]:
# Alternative: Use the Whitespace pre-tokenizer
# Despite its name, Whitespace splits on whitespace AND separates punctuation
# from words (runs of non-word, non-space characters become their own pieces)
# To split on whitespace only, use WhitespaceSplit (demonstrated in a later cell)
# Assigning here overwrites any pre-tokenizer set earlier (e.g. BertPreTokenizer)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

In [None]:
# Test the pre-tokenization step
# This shows the pieces produced by whichever pre-tokenizer is currently
# assigned to tokenizer.pre_tokenizer; re-run after switching pre-tokenizers
# to compare how they handle the same text
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

In [None]:
# Demonstrate WhitespaceSplit pre-tokenizer (keeps punctuation with words)
# Different from Whitespace - see how punctuation is handled differently
# This is a standalone object: it is not assigned to tokenizer.pre_tokenizer,
# so the tokenizer being built is unaffected
pre_tokenizer = pre_tokenizers.WhitespaceSplit()
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

In [None]:
# Create a sequence of pre-tokenizers for more sophisticated splitting
# First split on whitespace, then separate punctuation
# This achieves similar results to BertPreTokenizer through composition
# Like the previous demo, this rebinds the standalone `pre_tokenizer` variable
# only; tokenizer.pre_tokenizer is left unchanged
pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

In [None]:
# Set up the WordPiece trainer with the special tokens BERT relies on
#   [UNK]  - stands in for unknown / out-of-vocabulary words
#   [PAD]  - padding used when batching sequences
#   [CLS]  - classification token placed at the start of a sequence
#   [SEP]  - separator marking the end of / boundary between sequences
#   [MASK] - mask token for masked-language-model training
# The trainer targets a vocabulary of 25,000 entries
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=25000,
    special_tokens=special_tokens,
)

In [None]:
# Train the tokenizer using the iterator-based approach
# get_training_corpus() yields batches of texts, so the corpus is fed to the
# trainer batch by batch
# This is memory-efficient for large datasets as it doesn't load everything at once
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [None]:
# Alternative: Train from text file
# Requires "wikitext-2.txt", written by the export cell near the top
# Reset the model and train from the saved file instead; if the iterator-based
# cell above was already run, this discards that result and retrains with the
# same trainer
tokenizer.model = models.WordPiece(unk_token="[UNK]")
tokenizer.train(["wikitext-2.txt"], trainer=trainer)

In [None]:
# Test the trained tokenizer
# No post-processor is configured yet, so only the text's own tokens are shown
# Notice the WordPiece subword segmentation with "##" prefixes
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

In [None]:
# Look up the vocabulary IDs of [CLS] and [SEP]
# The template post-processor configured in the next cell needs both
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("[CLS]", "[SEP]")
)
print(cls_token_id, sep_token_id)

In [None]:
# Configure post-processing to add special tokens like BERT
# Template processing automatically adds [CLS] and [SEP] tokens
# Single sequence: [CLS] text [SEP]
# Pair of sequences: [CLS] text1 [SEP] text2 [SEP]
# $A / $B stand for the first / second input sequence
# The ":0" and ":1" specify token type IDs for distinguishing sequences
# The templates are plain string literals (nothing is interpolated into them);
# the special tokens are resolved through the (token, id) pairs below
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [None]:
# Test single sequence processing
# Same input as the earlier test cell, now with the post-processor in place
# Notice how [CLS] and [SEP] tokens are automatically added
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

In [None]:
# Test pair of sequences processing
# Passing two strings to encode() uses the `pair` template
# BERT format: [CLS] sentence1 [SEP] sentence2 [SEP]
# type_ids help distinguish which tokens belong to which sequence
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)  # 0 for first sequence, 1 for second sequence

In [None]:
# Configure decoder to reconstruct text from tokens
# WordPiece decoder knows how to handle "##" prefixes properly
# The prefix given here matches the "##" continuation marker seen in the
# encoded tokens above
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [None]:
# Test decoding - convert token IDs back to readable text
# `encoding` is whatever the most recent encode cell produced (the sentence
# pair, when the cells are run in order)
# The decoder properly reconstructs words from subword pieces
tokenizer.decode(encoding.ids)

In [None]:
# Save the complete tokenizer to a single JSON file in the working directory
# This preserves all configuration: model, normalizer, pre-tokenizer, post-processor, decoder
tokenizer.save("tokenizer.json")

In [None]:
# Load the tokenizer from the file saved in the previous cell
# Demonstrate that the tokenizer can be fully reconstructed
# `new_tokenizer` is only a demonstration; later cells keep using `tokenizer`
new_tokenizer = Tokenizer.from_file("tokenizer.json")

In [None]:
# Wrap the fast tokenizer in Transformers' interface
# PreTrainedTokenizerFast provides compatibility with Transformers models
# The special tokens are passed explicitly, using the same strings the
# WordPiece trainer was given
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    # tokenizer_file="tokenizer.json", # Alternative: load directly from file
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [None]:
# Alternative: Use BERT-specific tokenizer wrapper
# BertTokenizerFast automatically sets appropriate defaults for BERT models,
# so the special tokens are not repeated here
# Assigning here overwrites the `wrapped_tokenizer` created in the previous cell
from transformers import BertTokenizerFast

wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

In [None]:
# Now build a BPE tokenizer similar to GPT-2
# Start fresh with the BPE model
# This rebinds `tokenizer`, replacing the WordPiece tokenizer built above
# No normalizer is set in this section
tokenizer = Tokenizer(models.BPE())

In [None]:
# Set up BPE-style pre-tokenization with ByteLevel
# ByteLevel ensures every possible input can be encoded (no unknown bytes)
# add_prefix_space=False means no space is added at the beginning of the text
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [None]:
# Test ByteLevel pre-tokenization
# Notice that 'Ġ' stands for a space in the byte-level representation,
# and how word boundaries are handled
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

In [None]:
# Train BPE tokenizer with GPT-2 style configuration
# Use <|endoftext|> as the only special token (document boundaries)
# This rebinds `trainer`, replacing the WordPiece trainer from earlier
trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [None]:
# Alternative: Train BPE from file
# Requires "wikitext-2.txt", written by the export cell near the top
# Reset the BPE model and train from the saved file; if the iterator-based
# cell above was already run, this discards that result and retrains
tokenizer.model = models.BPE()
tokenizer.train(["wikitext-2.txt"], trainer=trainer)

In [None]:
# Test the trained BPE tokenizer
# Same sentence as in the WordPiece section, for comparison
# Notice the BPE subword segmentation without "##" prefixes (that's WordPiece-specific)
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

In [None]:
# Configure ByteLevel post-processing for BPE
# This step controls how token offsets are reported for byte-level tokens
# trim_offsets=False leaves the offsets untrimmed: the span of a token that
# starts with 'Ġ' includes the preceding space rather than starting at the
# first character of the word (see the offsets demo in the next cell)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [None]:
# Demonstrate the offsets reported with ByteLevel post-processing
# Offsets allow us to map tokens back to their original positions in the text
# With trim_offsets=False set above, the extracted slice is expected to keep
# the space in front of the word - check the output to confirm
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]  # (start, end) character span of the 5th token (index 4)
sentence[start:end]  # Slice of the original sentence covered by that token

In [None]:
# Configure ByteLevel decoder for BPE
# This properly reconstructs text from BPE tokens, handling byte-level encoding
# It is the decoding counterpart of the ByteLevel pre-tokenizer set earlier
tokenizer.decoder = decoders.ByteLevel()

In [None]:
# Test BPE decoding
# `encoding` comes from the offsets demo above (when the cells are run in order)
# The decoder converts token IDs back to the original text
tokenizer.decode(encoding.ids)

In [None]:
# Wrap BPE tokenizer in Transformers interface
# Configure with GPT-2 style special tokens: the same <|endoftext|> token
# serves as both beginning- and end-of-sequence marker
# (PreTrainedTokenizerFast was already imported in the WordPiece section; the
# import is repeated so this cell can run on its own)
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",  # Beginning of sequence
    eos_token="<|endoftext|>",  # End of sequence
)

In [None]:
# Alternative: Use GPT-2 specific tokenizer wrapper
# GPT2TokenizerFast automatically sets appropriate defaults for GPT-2 models,
# so the special tokens are not repeated here
# Assigning here overwrites the `wrapped_tokenizer` created in the previous cell
from transformers import GPT2TokenizerFast

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)

In [None]:
# Finally, build a Unigram tokenizer similar to T5/XLNet
# Start with the Unigram model
# This rebinds `tokenizer`, replacing the BPE tokenizer built above
tokenizer = Tokenizer(models.Unigram())

In [None]:
# Set up the normalization pipeline for Unigram
# The steps run in the order listed: quote replacement, Unicode
# decomposition, accent stripping, then whitespace cleanup
# Regex is needed so the last Replace matches a pattern rather than a literal string
from tokenizers import Regex

tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Replace("``", '"'),        # Two backticks -> one straight double quote
        normalizers.Replace("''", '"'),        # Two single quotes -> one straight double quote
        normalizers.NFKD(),                    # NFKD Unicode normalization
        normalizers.StripAccents(),            # Remove accent marks
        normalizers.Replace(Regex(" {2,}"), " "),  # Collapse runs of 2+ spaces into one
    ]
)

In [None]:
# Configure Metaspace pre-tokenization for Unigram/SentencePiece
# Metaspace converts spaces to '▁' characters, preserving space information
# Default Metaspace settings are used here
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()

In [None]:
# Test Metaspace pre-tokenization
# Notice the '▁' character representing spaces in SentencePiece format
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test the pre-tokenizer!")

In [None]:
# Train Unigram tokenizer with comprehensive special tokens
# Include tokens commonly used in various NLP tasks
# unk_token is passed to the trainer explicitly and must be one of the
# special tokens listed here
# This rebinds `special_tokens` and `trainer` from the earlier sections
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
trainer = trainers.UnigramTrainer(
    vocab_size=25000, special_tokens=special_tokens, unk_token="<unk>"
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [None]:
# Alternative: Train Unigram from file
# Requires "wikitext-2.txt", written by the export cell near the top
# Reset the model and train from the saved file; if the iterator-based cell
# above was already run, this discards that result and retrains
tokenizer.model = models.Unigram()
tokenizer.train(["wikitext-2.txt"], trainer=trainer)

In [None]:
# Test the trained Unigram tokenizer
# Same sentence as in the previous sections, for comparison
# Notice the SentencePiece-style segmentation with '▁' space markers
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

In [None]:
# Look up the vocabulary IDs of <cls> and <sep> in the Unigram tokenizer
# The template post-processor configured in the next cell needs both
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("<cls>", "<sep>")
)
print(cls_token_id, sep_token_id)

In [None]:
# Configure post-processing with custom template
# This example puts <cls> at the end (uncommon but demonstrates flexibility)
# Single: text <sep> <cls>
# Pair: text1 <sep> text2 <sep> <cls>
# $A / $B stand for the first / second input sequence; the number after ":"
# is the token type ID assigned to that piece
tokenizer.post_processor = processors.TemplateProcessing(
    single="$A:0 <sep>:0 <cls>:2",      # <cls> gets its own type ID (2)
    pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
    special_tokens=[("<sep>", sep_token_id), ("<cls>", cls_token_id)],
)

In [None]:
# Test Unigram with custom post-processing template
# Passing two strings to encode() uses the `pair` template
# Shows how flexible template processing can create different token arrangements
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences!")
print(encoding.tokens)
print(encoding.type_ids)  # 0 = first sequence, 1 = second sequence, 2 = trailing <cls>

In [None]:
# Configure Metaspace decoder for Unigram
# This properly handles the '▁' space markers used by SentencePiece
# It is the decoding counterpart of the Metaspace pre-tokenizer set earlier
tokenizer.decoder = decoders.Metaspace()

In [None]:
# Wrap Unigram tokenizer in Transformers interface
# Configure with all the special tokens and padding preferences
# Every token named here was included in the trainer's special_tokens list
# (PreTrainedTokenizerFast was already imported in earlier sections; the
# import is repeated so this cell can run on its own)
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<s>",           # Beginning of sequence
    eos_token="</s>",          # End of sequence
    unk_token="<unk>",         # Unknown token
    pad_token="<pad>",         # Padding token
    cls_token="<cls>",         # Classification token
    sep_token="<sep>",         # Separator token
    mask_token="<mask>",       # Mask token
    padding_side="left",       # Pad on the left (common for some models)
)

In [None]:
# Alternative: Use XLNet-specific tokenizer wrapper
# XLNetTokenizerFast automatically sets appropriate defaults for XLNet/SentencePiece models,
# so the special tokens and padding side are not repeated here
# Assigning here overwrites the `wrapped_tokenizer` created in the previous cell
from transformers import XLNetTokenizerFast

wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)