# Fast tokenizers in the QA pipeline (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install required libraries for question-answering tasks with fast tokenizers
# The leading "!" hands this line to the shell instead of the Python interpreter
# NOTE(review): later cells `import torch`, which this command does not install —
# presumably it is preinstalled in the target environment; confirm
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Simple Question Answering with pipeline
# The pipeline handles tokenization, model inference, and answer extraction automatically
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task's implicit default:
# the run stays reproducible, and this is the same checkpoint the manual
# walkthrough further down in this file loads by hand.
question_answerer = pipeline(
    "question-answering", model="distilbert-base-cased-distilled-squad"
)
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
# Last expression of the cell, so the notebook displays the returned answer
question_answerer(question=question, context=context)

In [None]:
# Test with a much longer context to see how the model handles it
# Notice the pipeline still finds the correct answer despite the longer text
# The closing paragraph of this text is the one that names the libraries asked about in `question`
# NOTE: later cells tokenize this exact string, so its content must not be edited
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
# Same pipeline and same question as before; only the context changes
question_answerer(question=question, context=long_context)

In [None]:
# Manual QA implementation: Load model and tokenizer for more control
# This shows what happens under the hood in the QA pipeline
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Encode the (question, context) pair together and get PyTorch tensors back
inputs = tokenizer(question, context, return_tensors="pt")
# Inference only: disable autograd so no computation graph is recorded and the
# returned logits are plain tensors that can be edited in place afterwards
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# The QA head returns two score vectors (logits, not yet probabilities):
#   start_logits - unnormalized score for the answer STARTING at each token
#   end_logits   - unnormalized score for the answer ENDING at each token
start_logits, end_logits = outputs.start_logits, outputs.end_logits
print(start_logits.shape, end_logits.shape)

In [None]:
# Restrict candidate answers to the context: a span inside the question is never valid
# sequence_ids() labels tokens by origin: question tokens are 0, context tokens are 1
import torch

sequence_ids = inputs.sequence_ids()
# True marks a position to suppress, i.e. every token that is not part of the context
mask = torch.tensor([seq_id != 1 for seq_id in sequence_ids])
# Keep position 0 ([CLS]) selectable so it can act as a "no answer" indicator
mask[0] = False
# Add a leading batch axis so the mask indexes the batched logits
mask = mask.unsqueeze(0)

# Push suppressed positions to a very low logit so they are never picked
start_logits[mask] = -10000
end_logits[mask] = -10000

In [None]:
# Normalize each logit vector with softmax to obtain a probability distribution
# over positions; [0] then selects the first (only) example of the batch
start_probabilities, end_probabilities = (
    torch.nn.functional.softmax(logits, dim=-1)[0]
    for logits in (start_logits, end_logits)
)

In [None]:
# Score every candidate span as (start probability at i) * (end probability at j)
# Broadcasting a row of end probabilities against a column of start probabilities
# yields the full matrix: scores[i, j] is the score of the span from token i to token j
scores = end_probabilities[None, :] * start_probabilities[:, None]

In [None]:
# Zero out everything below the diagonal so only spans with start_index <= end_index remain
# (a span whose "end" comes before its "start" is not a valid answer)
scores = scores.triu()

In [None]:
# Pick the highest-scoring span
# argmax() returns an index into the flattened matrix; dividing by the row length
# splits it back into (row, column) = (start_index, end_index)
max_index = scores.argmax().item()
start_index, end_index = divmod(max_index, scores.shape[1])
print(scores[start_index, end_index])

In [None]:
# Turn the winning token span back into text via the offset mapping
# With return_offsets_mapping=True the tokenizer also returns a (start, end) pair per token,
# which is used below to slice `context`
inputs_with_offsets = tokenizer(question, context, return_offsets_mapping=True)
offsets = inputs_with_offsets["offset_mapping"]

# The answer runs from the first character of the start token to the end of the end token
start_char = offsets[start_index][0]
end_char = offsets[end_index][1]
answer = context[start_char:end_char]

In [None]:
# Create the final result in the same format as the pipeline
# Includes answer text, character positions, and confidence score
result = {
    "answer": answer,
    "start": start_char,
    "end": end_char,
    # .item() converts the 0-dim tensor into a plain Python float, the same way
    # the multi-chunk candidates later in this file store their scores
    "score": scores[start_index, end_index].item(),
}
print(result)

In [None]:
# Problem: Long contexts exceed model's maximum sequence length
# Check how many tokens the long context produces (461 tokens)
# No max_length/truncation argument is passed here, so nothing in this call limits the length
inputs = tokenizer(question, long_context)
print(len(inputs["input_ids"]))

In [None]:
# Naive fix: cut the input down to the 384-token limit
# truncation="only_second" trims only the second sequence (the context), never the question
# Decoding the result shows the drawback: the part holding the answer has been truncated away
inputs = tokenizer(
    question,
    long_context,
    truncation="only_second",
    max_length=384,
)
print(tokenizer.decode(inputs["input_ids"]))

In [None]:
# Sliding-window demo on a short sentence
# return_overflowing_tokens=True keeps the truncated remainder as additional chunks
# stride=2 makes each new chunk overlap the previous one by 2 tokens, so text that
# straddles a chunk boundary still appears whole in one of the chunks
sentence = "This sentence is not too long but we are going to split it anyway."
inputs = tokenizer(
    sentence,
    max_length=6,
    stride=2,
    truncation=True,
    return_overflowing_tokens=True,
)

# Decode every chunk to make the overlap visible
for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))

In [None]:
# Check what additional information is returned with overflowing tokens
# overflow_to_sample_mapping tracks which original sample each chunk belongs to
# `inputs` here is the encoding produced with return_overflowing_tokens=True in the previous cell
print(inputs.keys())

In [None]:
# All chunks (7 total) come from the same sample (index 0)
# This mapping becomes important when processing multiple samples at once
# Only one sentence was tokenized above, hence the single sample index
print(inputs["overflow_to_sample_mapping"])

In [None]:
# Same sliding-window settings, now applied to a batch of two sentences
# The mapping tells the chunks apart: the first 7 chunks belong to sample 0,
# the next 4 chunks to sample 1
sentences = [
    "This sentence is not too long but we are going to split it anyway.",
    "This sentence is shorter but will still get split.",
]
inputs = tokenizer(
    sentences,
    max_length=6,
    stride=2,
    truncation=True,
    return_overflowing_tokens=True,
)

print(inputs["overflow_to_sample_mapping"])

In [None]:
# Tokenize the long context for QA with overlapping chunks instead of plain truncation
inputs = tokenizer(
    question,
    long_context,
    # Chunk size limit, applied to the context only so the question is never trimmed
    max_length=384,
    truncation="only_second",
    # Keep the overflow as extra chunks, with a large overlap so no answer is lost at a boundary
    return_overflowing_tokens=True,
    stride=128,
    # Pad shorter chunks up to the longest one
    padding="longest",
    # Needed later to extract the answer text from the original context
    return_offsets_mapping=True,
)

In [None]:
# Get the encoding ready for the model
# Every key left in `inputs` is passed to the model as a keyword argument, so the
# bookkeeping entries are taken out first; the offsets are kept for answer extraction
offsets = inputs.pop("offset_mapping")
inputs.pop("overflow_to_sample_mapping")

# Convert the remaining entries to PyTorch tensors
inputs = inputs.convert_to_tensors("pt")
print(inputs["input_ids"].shape)  # 2 chunks of 384 tokens each

In [None]:
# Run model inference on both chunks simultaneously
# Each chunk gets its own set of start/end logits
# Inference only: disable autograd so no computation graph is recorded and the
# logits are plain tensors that can be edited in place afterwards
with torch.no_grad():
    outputs = model(**inputs)

start_logits = outputs.start_logits
end_logits = outputs.end_logits
print(start_logits.shape, end_logits.shape)  # [2, 384] for 2 chunks

In [None]:
# Enhanced masking for multiple chunks
# Mask question tokens, special tokens AND padding tokens in every chunk
# The mask is built per chunk: chunks can hold different amounts of context, so the
# closing special token of a shorter chunk sits at a position where the first chunk
# still has context tokens. Reusing the first chunk's mask for every row would leave
# that token selectable.
num_chunks = inputs["input_ids"].shape[0]
# Mask everything apart from the tokens of the context (sequence id 1)
mask = torch.tensor(
    [
        [seq_id != 1 for seq_id in inputs.sequence_ids(chunk_index)]
        for chunk_index in range(num_chunks)
    ]
)
# Unmask the [CLS] token at position 0 of every chunk
mask[:, 0] = False
# Also mask all [PAD] tokens (attention_mask == 0)
mask = torch.logical_or(mask, inputs["attention_mask"] == 0)

# Set masked positions to very low values so they won't be selected
start_logits[mask] = -10000
end_logits[mask] = -10000

In [None]:
# Softmax over the last axis turns each chunk's logits into its own probability
# distribution; the batch axis is kept so there is one row per chunk
start_probabilities, end_probabilities = (
    torch.nn.functional.softmax(logits, dim=-1)
    for logits in (start_logits, end_logits)
)

In [None]:
# Collect the best (start, end, score) span of every chunk
# The chunks are compared afterwards to find the overall best answer
candidates = []
for start_probs, end_probs in zip(start_probabilities, end_probabilities):
    # scores[i, j] = P(start = i) * P(end = j) for this chunk
    scores = end_probs[None, :] * start_probs[:, None]
    # Only the upper triangle (start <= end) is searched; argmax indexes the flattened matrix
    idx = scores.triu().argmax().item()

    # Split the flat index back into (row, column) = (start, end)
    start_idx, end_idx = divmod(idx, scores.shape[1])
    score = scores[start_idx, end_idx].item()
    candidates.append((start_idx, end_idx, score))

print(candidates)

In [None]:
# Turn each chunk's best span into answer text using that chunk's offset mapping
# In the original notebook run the second chunk (score 0.97149) contains the correct answer,
# while the first chunk scores low (0.33867) and extracts irrelevant text
for (start_token, end_token, score), offset in zip(candidates, offsets):
    # First character of the start token through the last character of the end token
    start_char = offset[start_token][0]
    end_char = offset[end_token][1]
    answer = long_context[start_char:end_char]
    result = {
        "answer": answer,
        "start": start_char,
        "end": end_char,
        "score": score,
    }
    print(result)