# Normalization and pre-tokenization

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install required packages for tokenizer analysis
# - datasets: For loading and processing text datasets
# - evaluate: For model evaluation metrics
# - transformers[sentencepiece]: Core library with SentencePiece support
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Fetch a pretrained tokenizer whose internal stages we want to inspect.
# The checkpoint is BERT (uncased), whose tokenizer implements WordPiece.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# The fast tokenizer's backend object gives low-level access to the
# normalization and pre-tokenization components; show what type it is.
backend = tokenizer.backend_tokenizer
print(type(backend))

In [None]:
# Normalization is the first stage of tokenization. It standardizes the raw text by:
# - lowercasing (for bert-base-uncased)
# - removing or converting accents and special characters
# - standardizing unicode representations
# Run only that stage on a sample string and print the normalized result.
normalizer = tokenizer.backend_tokenizer.normalizer
normalized_text = normalizer.normalize_str("Héllò hôw are ü?")
print(normalized_text)

In [None]:
# Pre-tokenization runs after normalization and before the main tokenization
# algorithm: it cuts the text into word-like units.
# BERT's pre-tokenizer cuts on whitespace and punctuation.
# Each unit comes back as (token, (start_offset, end_offset)), so positions in
# the original string are preserved.
pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

In [None]:
# Compare pre-tokenization behavior with GPT-2's tokenizer.
# GPT-2 uses Byte-Pair Encoding (BPE) with different pre-tokenization rules.
# Notice the 'Ġ' character in the output - it represents spaces in GPT-2's encoding.
# The GPT-2 tokenizer is bound to its own name so that this cell does not
# replace the BERT `tokenizer` that the earlier cells rely on when re-run.
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt2_tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

In [None]:
# Same experiment with T5, whose tokenizer uses the Unigram algorithm via the
# SentencePiece implementation.
# In the output, '▁' represents word boundaries: SentencePiece treats spaces as
# part of the tokens rather than as separators.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
pre_tokenizer.pre_tokenize_str("Hello, how are  you?")