# Handling multiple sequences (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install required libraries for Transformers, datasets, and evaluation
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Common error: wrong tensor dimensions for model input.
# This cell is expected to raise on its last line: the IDs are wrapped in a
# tensor without a batch dimension, while the model expects a 2D input of
# shape [batch_size, seq_len], even for a single sequence.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the model are both loaded from the same checkpoint name.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokenize by hand in two steps: text -> tokens -> vocabulary IDs.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)  # 1D tensor built from a flat list: no batch dimension
# Intentional failure: the 1D tensor is passed straight to the model.
model(input_ids)

In [None]:
# Correct approach: call the tokenizer directly with return_tensors="pt".
# This adds the batch dimension and the special tokens ([CLS], [SEP])
# automatically, so the printed tensor has shape [1, sequence_length]
# rather than just [sequence_length].
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
# Only the "input_ids" entry of the tokenizer output is printed here.
print(tokenized_inputs["input_ids"])

In [None]:
# Manual fix: add the batch dimension by hand.
# Wrapping the ID list in another list gives torch.tensor a nested list, which
# produces a 2D tensor of shape [batch_size=1, sequence_length] that the model
# can process.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same checkpoint for tokenizer and model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Two-step manual tokenization: text -> tokens -> vocabulary IDs.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor([ids])  # Note the extra brackets: they create the batch dimension
print("Input IDs:", input_ids)

# Forward pass on the single-sequence batch; the raw scores are in `.logits`.
output = model(input_ids)
print("Logits:", output.logits)

In [None]:
# Problem: sequences of different lengths cannot form a tensor.
# A nested Python list may have rows of different lengths, but a tensor needs
# uniform dimensions, so this structure is invalid for tensor creation.
# (This cell only builds the list; it does not attempt the conversion.)
batched_ids = [
    [200, 200, 200],   # Length 3
    [200, 200]         # Length 2 - inconsistent with the first row
]

In [None]:
# Solution: padding - make every sequence the same length.
# The shorter sequence is extended with a padding ID until it matches the
# longest one; both rows now have length 3, so the list is rectangular and
# can be turned into a tensor.
padding_id = 100  # hard-coded ID used only for this illustration

batched_ids = [
    [200, 200, 200],           # Original length 3
    [200, 200, padding_id],    # Padded from length 2 to 3
]

In [None]:
# Pitfall: naive padding changes the model's output.
# The same two-token sequence is scored twice - once on its own and once as the
# padded second row of a batch - and the two results disagree, because without
# an attention mask the pad token is treated as real input.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
# Row 0 is sequence 1 unchanged; row 1 is sequence 2 followed by one pad token.
batched_ids = [
    [*sequence1_ids[0]],
    [*sequence2_ids[0], tokenizer.pad_token_id],
]

# Score each sequence alone, then both together as a single batch.
print(model(input_ids=torch.tensor(sequence1_ids)).logits)
print(model(input_ids=torch.tensor(sequence2_ids)).logits)
print(model(input_ids=torch.tensor(batched_ids)).logits)  # Note: second row differs from the sequence2_ids result!

In [None]:
# Fix: pass an attention mask so the model ignores the padding.
# Mask values: 1 = real token, 0 = padding token.
# With the mask supplied, the padded second row scores the same as the
# unpadded sequence did.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Derive the mask from the IDs: every position that is not the pad token is
# real. As long as the pad ID is not 200, this yields [[1, 1, 1], [1, 1, 0]].
attention_mask = [
    [int(token_id != tokenizer.pad_token_id) for token_id in row]
    for row in batched_ids
]

outputs = model(
    input_ids=torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
print(outputs.logits)  # Second row now matches the unpadded sequence!

In [None]:
# Handling sequences that are too long: truncation.
# Models have a maximum sequence length (e.g., 512 for BERT); a longer input is
# cut down by keeping only the first max_sequence_length tokens.
# The limit is counted in tokens, so the text is tokenized first and the ID
# list is sliced - slicing the raw string would drop characters, not tokens.
# NOTE: this manual slice leaves no room for special tokens; calling the
# tokenizer with truncation=True and max_length=... handles that for you.
max_sequence_length = tokenizer.model_max_length  # limit recorded on the loaded tokenizer
sequence_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence))
sequence_ids = sequence_ids[:max_sequence_length]  # a no-op when the input is already short enough