# Fast tokenizers' special powers (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install Datasets, Evaluate and Transformers (with its "sentencepiece" extra).
# The leading "!" makes the notebook run this line as a shell command.
# NOTE(review): assumes the `uv` tool is available in this environment — confirm.
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Load the tokenizer of the "bert-base-cased" checkpoint and encode one sentence.
# Printing the type shows which object the tokenizer call returns: a BatchEncoding,
# which holds the tokens plus additional metadata when the tokenizer is a fast one.
# `tokenizer`, `example` and `encoding` are reused by the following cells.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)
print(type(encoding))

In [None]:
# Display whether the loaded tokenizer is a "fast" tokenizer (implemented in Rust for speed).
# Fast tokenizers provide the extra features used below: offset mapping and word alignment.
tokenizer.is_fast

In [None]:
# Display whether this encoding object was produced by a fast tokenizer.
# Only such encodings can provide the offset mapping and alignment features used below.
encoding.is_fast

In [None]:
# Display the token strings the tokenizer produced for the example sentence.
# Notice how "Sylvain" is split into subword tokens: "S", "##yl", "##va", "##in".
encoding.tokens()

In [None]:
# Display the word IDs: one entry per token, giving the index of the word it came from.
# None = special tokens ([CLS], [SEP]); integers = 0-based word index in the input.
# Several tokens can share one word index (e.g., the pieces of "Sylvain" all map to word 3).
encoding.word_ids()

In [None]:
# Look up the character span of word 3 ("Sylvain") in the original text,
# then slice the input string with it to show the word-index -> text mapping.
# `start` and `end` are used as the bounds of a regular Python slice.
start, end = encoding.word_to_chars(3)
example[start:end]

In [None]:
# Token-classification pipeline used for Named Entity Recognition (NER).
# No aggregation strategy is passed, so each subword token gets its own prediction:
# notice how "Sylvain" shows up as several tokens with separate predictions.
# No model name is passed either, so the checkpoint is whatever the library
# selects by default for this task.
from transformers import pipeline

token_classifier = pipeline("token-classification")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

In [None]:
# Same pipeline, now with aggregation_strategy="simple" to group subword tokens.
# The "simple" strategy merges tokens belonging to the same entity, so "Sylvain"
# and "Hugging Face" appear as single entities with one combined score each.
# The import is repeated so this cell can be run on its own.
from transformers import pipeline

token_classifier = pipeline("token-classification", aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

In [None]:
# Manual approach: load the tokenizer and the token-classification model directly
# from one named checkpoint instead of going through pipeline().
# This gives access to the raw model outputs (logits) that later cells post-process.
# Note that `tokenizer` and `example` are rebound here; later cells use these values.
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
# return_tensors="pt" asks for PyTorch tensors so the encoding can be fed to the model
inputs = tokenizer(example, return_tensors="pt")
outputs = model(**inputs)

In [None]:
# Examine tensor shapes:
# input_ids: [batch_size=1, sequence_length=19] - 19 tokens including special tokens
# logits:    [batch_size=1, sequence_length=19, num_labels=9] - one score per label per token
print(inputs["input_ids"].shape)
print(outputs.logits.shape)

In [None]:
# Turn the raw logits into per-token probabilities and predicted class ids
# Softmax over the last (label) axis gives probabilities; argmax over the same axis
# gives the highest-scoring class. [0] selects the single sequence in the batch.
import torch

logits = outputs.logits
probabilities = torch.nn.functional.softmax(logits, dim=-1)[0].tolist()
predictions = logits.argmax(dim=-1)[0].tolist()
print(predictions)

In [None]:
# Display the model's mapping from prediction ids to human-readable labels.
# 0='O' (Outside, i.e. not an entity); a B-/I- prefix marks the Beginning/Inside of an entity.
# Entity types: PER (Person), ORG (Organization), LOC (Location), MISC (Miscellaneous).
model.config.id2label

In [None]:
# Build one record per token that the model tagged as part of an entity
# Tokens labelled "O" (Outside) are skipped; the score is the probability
# the model assigned to the class it predicted for that token
results = []
tokens = inputs.tokens()

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label == "O":
        continue
    record = {"entity": label, "score": probabilities[idx][pred], "word": tokens[idx]}
    results.append(record)

print(results)

In [None]:
# Re-encode the example asking for the offset mapping, then display it.
# The mapping holds character positions for each token in the original text,
# which lets us map tokens back to their exact location in the input string.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
inputs_with_offsets["offset_mapping"]

In [None]:
# Verify the offset mapping: slice the original text from index 12 up to (not including) 14.
# The result is "yl", the text behind the "##yl" token of "Sylvain" — the "##" prefix
# is only a subword marker in the token string and is not present in the input.
example[12:14]

In [None]:
# Per-token entity records, now enriched with character positions
# Each record carries the start/end offsets of its token in the original text
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label == "O":
        # Not part of an entity: nothing to report for this token
        continue
    start, end = offsets[idx]
    record = dict(
        entity=label,
        score=probabilities[idx][pred],
        word=tokens[idx],
        start=start,
        end=end,
    )
    results.append(record)

print(results)

In [None]:
# Verify that the slice from index 33 up to (not including) 45 is "Hugging Face".
# This demonstrates how offsets allow extracting an entity's exact text from the input.
example[33:45]

In [None]:
# Advanced entity grouping: merge the consecutive tokens of one entity into a single span
# This manually approximates what the "simple" aggregation strategy does automatically
import numpy as np

results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
id2label = model.config.id2label

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = id2label[pred]
    if label == "O":
        # Not an entity token: move on to the next one
        idx += 1
        continue

    # Remove the B- or I- prefix to get the entity type
    label = label[2:]

    # The current token always belongs to the entity, whether it is tagged B- or I-.
    # Seeding the span and the scores with it guarantees that `end` is defined and
    # that the score list is never empty (np.mean of an empty list would be NaN).
    start, end = offsets[idx]
    all_scores = [probabilities[idx][pred]]
    idx += 1

    # Extend the span over the following tokens tagged I-<same entity type>
    while idx < len(predictions) and id2label[predictions[idx]] == f"I-{label}":
        # Score each token with the probability of its own predicted class
        all_scores.append(probabilities[idx][predictions[idx]])
        _, end = offsets[idx]
        idx += 1

    # idx now points at the first token after the entity. It is deliberately NOT
    # advanced again, so an entity starting right there is not skipped.

    # Average the confidence scores of all tokens in the entity
    score = np.mean(all_scores).item()
    # Extract the complete entity text from the original string
    word = example[start:end]
    results.append(
        {
            "entity_group": label,
            "score": score,
            "word": word,
            "start": start,
            "end": end,
        }
    )

print(results)