# Tokenizers (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install required libraries for Transformers, datasets, and evaluation
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Naive word-level tokenization: cut the sentence apart wherever whitespace occurs.
# Shortcomings of this approach:
# - punctuation stays glued to the neighbouring word
# - there is no fallback for words that are not in a vocabulary
# - a word-level vocabulary needs one entry per distinct word
# sep=None is str.split()'s default: split on runs of whitespace.
tokenized_text = "Jim Henson was a puppeteer".split(sep=None)
print(tokenized_text)

In [None]:
# Load a tokenizer through its concrete class, BertTokenizer.
# from_pretrained() receives the checkpoint name "bert-base-cased"; the
# tokenizer files are presumably downloaded (or read from a local cache) the
# first time this runs -- NOTE(review): needs network access on first use.
# Each model architecture typically has its own tokenizer class, so this
# form ties the cell to BERT checkpoints.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Preferred approach: let AutoTokenizer pick the tokenizer class.
# AutoTokenizer inspects the checkpoint's configuration and instantiates the
# matching tokenizer class, so the same line works when the checkpoint name
# is swapped for a different architecture.
# Note: this rebinds the notebook-level name `tokenizer`; later cells use
# whichever assignment ran most recently.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Full tokenization pipeline in one call: text → tokens → IDs → model inputs.
# The result is the last expression in the cell, so the notebook displays it.
# For this BERT checkpoint it is expected to contain three entries:
# - input_ids: numerical token representations
# - token_type_ids: distinguishes different sentences (for tasks like QA)
# - attention_mask: indicates real tokens vs padding
# NOTE(review): no return_tensors argument is passed, so the values should be
# plain Python lists rather than tensors (return_tensors="pt" would give
# PyTorch tensors) -- confirm against the Transformers documentation.
tokenizer("Using a Transformer network is simple")

In [None]:
# Save the tokenizer to a local directory for offline use or distribution.
# The directory can later be passed to from_pretrained() in place of a
# checkpoint name, which avoids needing internet access.
# Typically writes files such as tokenizer.json, tokenizer_config.json and
# vocab.txt -- the exact set depends on the tokenizer class and library version.
tokenizer.save_pretrained("directory_on_my_computer")

In [None]:
# Step 1 of the step-by-step process: text → tokens.
# tokenize() converts the text into subword tokens.
# This tokenizer is a WordPiece tokenizer: a word that is not in the
# vocabulary is broken into known pieces, and continuation pieces carry a
# "##" prefix.
# NOTE(review): the exact split of "Transformer" depends on the checkpoint's
# vocabulary (this one is cased) -- run the cell to see the actual pieces.
# The import and load are repeated here so the cell can run on its own.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)

print(tokens)

In [None]:
# Step 2: convert tokens to numerical IDs.
# Each token is looked up in the tokenizer's vocabulary and replaced by its
# integer ID; these IDs are what the model actually processes.
# Depends on `tokens` and `tokenizer` from the tokenize() cell.
# Unlike calling tokenizer(...) directly, this path is not expected to add
# special tokens -- TODO confirm.
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

In [None]:
# Reverse process: convert IDs back to text.
# decode() maps each ID to its token and joins "##"-prefixed subword pieces
# back onto the preceding token to form readable text.
# Useful for understanding model outputs or debugging tokenization.
# NOTE(review): the IDs below are hard-coded rather than taken from `ids`,
# so they may not match what the previous cell printed. The decoded string
# reflects these vocabulary entries (including their casing) and is not
# guaranteed to equal the original sentence exactly -- verify by running.
decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)