# Debugging the training pipeline

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install required libraries for Transformers, datasets, and evaluation
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Set up a fine-tuning pipeline for MNLI (Multi-Genre Natural Language Inference)
# This example demonstrates common training pipeline issues and their solutions
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Base checkpoint: DistilBERT (uncased); the same name is reused for the model below
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# MNLI configuration of the GLUE benchmark (natural language inference)
raw_datasets = load_dataset(path="glue", name="mnli")


def preprocess_function(examples):
    """
    Tokenize a batch of MNLI premise/hypothesis pairs.

    The two text columns are handed to the tokenizer together so each
    premise is encoded with its hypothesis; truncation=True keeps the
    encoded pair from getting too long.
    """
    premises = examples["premise"]
    hypotheses = examples["hypothesis"]
    return tokenizer(premises, hypotheses, truncation=True)


# Apply preprocessing to every split; batched=True passes batches of examples to preprocess_function
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# Load the checkpoint with a sequence classification head.
# NOTE: num_labels is left at its default here on purpose; the mismatch with
# MNLI's three classes is one of the issues debugged later in this notebook.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# Define training parameters
args = TrainingArguments(
    "distilbert-finetuned-mnli",  # Output directory (plain string: nothing to interpolate)
    eval_strategy="epoch",        # Evaluate after each epoch (`evaluation_strategy` in older transformers releases)
    save_strategy="epoch",        # Save checkpoint after each epoch
    learning_rate=2e-5,           # Learning rate for the optimizer
    num_train_epochs=3,           # Number of training epochs
    weight_decay=0.01,            # Regularization parameter
)

# Load evaluation metric for MNLI
metric = evaluate.load("glue", "mnli")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the GLUE/MNLI metric.

    eval_pred unpacks into (predictions, labels). Both are forwarded to
    metric.compute unchanged; later cells show why that is not enough.
    """
    model_outputs, gold_labels = eval_pred
    return metric.compute(predictions=model_outputs, references=gold_labels)


# Build the trainer on the RAW splits. This is the deliberate mistake of this cell:
# the untokenized data is passed instead of tokenized_datasets, so training fails.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=raw_datasets["train"],  # ERROR: raw split, not the tokenized one
    eval_dataset=raw_datasets["validation_matched"],  # ERROR: raw split, not the tokenized one
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Look at the first example the Trainer actually holds to understand the error.
# The Trainer was built on raw_datasets["train"], so this displays an untokenized
# record rather than the tokenizer outputs the model needs.
trainer.train_dataset[0]

In [None]:
# Fix #1: Use tokenized datasets instead of raw datasets
# The model expects input_ids and attention_mask, not raw text
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Checkpoint shared by the tokenizer and the model
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# GLUE / MNLI splits, still untokenized at this point
raw_datasets = load_dataset(path="glue", name="mnli")


def preprocess_function(examples):
    """Tokenize premise/hypothesis pairs from a batch, truncating long inputs."""
    text_a = examples["premise"]
    text_b = examples["hypothesis"]
    return tokenizer(text_a, text_b, truncation=True)


# Tokenize every split in batches
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# num_labels is still left at its default here; that issue is handled in a later fix
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

args = TrainingArguments(
    "distilbert-finetuned-mnli",  # output directory (plain string: nothing to interpolate)
    eval_strategy="epoch",  # evaluate once per epoch (`evaluation_strategy` in older transformers releases)
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# GLUE metric for the MNLI task
metric = evaluate.load("glue", "mnli")


def compute_metrics(eval_pred):
    """Pass the (predictions, labels) pair straight to the MNLI metric."""
    raw_predictions, references = eval_pred
    return metric.compute(predictions=raw_predictions, references=references)


# FIXED: the trainer now receives the tokenized splits instead of the raw ones.
# No data collator is supplied yet, which the following cells investigate.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],  # FIXED
    eval_dataset=tokenized_datasets["validation_matched"],  # FIXED
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Decode the first training example's input_ids back into text.
# This shows the input the model receives: the premise and hypothesis that
# preprocess_function passed to the tokenizer together.
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

In [None]:
# List the fields present on the first tokenized example.
# Use this to confirm the tokenizer outputs (e.g. input_ids, attention_mask) are there.
trainer.train_dataset[0].keys()

In [None]:
# Display the concrete class of the model held by the trainer,
# to confirm which architecture AutoModelForSequenceClassification resolved to.
type(trainer.model)

In [None]:
# Display the attention mask of the first example; its length is that example's sequence length.
# Examples are tokenized without padding, so lengths vary between examples,
# which is what causes the batching issue investigated below.
trainer.train_dataset[0]["attention_mask"]

In [None]:
# Check that the attention mask and the input ids of the first example have the same length.
# The cell displays True when they match.
len(trainer.train_dataset[0]["attention_mask"]) == len(
    trainer.train_dataset[0]["input_ids"]
)

In [None]:
# Display the label of the first example; it should be 0, 1, or 2 for the 3 MNLI classes
trainer.train_dataset[0]["label"]

In [None]:
# Display the class names attached to the "label" feature of the training set.
# MNLI has 3 classes: entailment, neutral, contradiction
trainer.train_dataset.features["label"].names

In [None]:
# Pull a single batch from the training dataloader to reproduce the failure
# outside of trainer.train().
# This is expected to fail: sequences have different lengths and can't be stacked.
# next(iter(...)) replaces the for/break loop so that an empty dataloader raises
# StopIteration instead of silently leaving `batch` undefined.
batch = next(iter(trainer.get_train_dataloader()))

In [None]:
# Look up the collate function the training dataloader uses.
# No data_collator was passed to the Trainer, so this is the default it picked,
# which does not pad variable-length sequences.
train_loader = trainer.get_train_dataloader()
data_collator = train_loader.collate_fn
data_collator

In [None]:
# Fix #2: Add proper data collator for padding sequences to the same length
# DataCollatorWithPadding handles variable-length sequences in batches
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,  # NEW: Import the padding collator
    TrainingArguments,
    Trainer,
)

# Checkpoint shared by the tokenizer, the model and the padding collator
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# GLUE / MNLI splits (untokenized)
raw_datasets = load_dataset(path="glue", name="mnli")


def preprocess_function(examples):
    """Encode each premise with its hypothesis; no padding is applied here."""
    pair = (examples["premise"], examples["hypothesis"])
    return tokenizer(*pair, truncation=True)


# Tokenize every split in batches
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# num_labels is still left at its default here; the next fix addresses it
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

args = TrainingArguments(
    "distilbert-finetuned-mnli",  # output directory (plain string: nothing to interpolate)
    eval_strategy="epoch",  # evaluate once per epoch (`evaluation_strategy` in older transformers releases)
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# GLUE metric for the MNLI task
metric = evaluate.load("glue", "mnli")


def compute_metrics(eval_pred):
    """Forward the evaluation outputs and labels, unchanged, to the MNLI metric."""
    outputs_from_eval, label_ids = eval_pred
    return metric.compute(predictions=outputs_from_eval, references=label_ids)


# NEW: collator that pads the examples of a batch to the same length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# FIXED: the trainer now gets the padding collator and the tokenizer.
# NOTE(review): recent transformers releases appear to prefer `processing_class`
# over `tokenizer` for the Trainer; confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # NEW: handles variable-length sequences
    tokenizer=tokenizer,  # NEW: provides the tokenizer to the trainer
)
trainer.train()

In [None]:
# Call the trainer's collate function by hand on the first four training examples.
# NOTE(review): trainer.train_dataset is used as-is here, so the examples may still
# carry columns the model does not use (such as the raw text); presumably this call
# is expected to fail for that reason. The next cell removes the unused columns first.
data_collator = trainer.get_train_dataloader().collate_fn
batch = data_collator([trainer.train_dataset[i] for i in range(4)])

In [None]:
# Collate by hand again, this time on the dataset as the trainer prepares it:
# _remove_unused_columns (a private Trainer helper) drops the unused columns first.
data_collator = trainer.get_train_dataloader().collate_fn
actual_train_set = trainer._remove_unused_columns(trainer.train_dataset)
first_examples = [actual_train_set[idx] for idx in range(4)]
batch = data_collator(first_examples)

In [None]:
# Get one batch from the training dataloader.
# This confirms that the padding collator fixed the batching issue.
# next(iter(...)) replaces the for/break loop so that an empty dataloader raises
# StopIteration instead of silently keeping a `batch` left over from an earlier cell.
batch = next(iter(trainer.get_train_dataloader()))

In [None]:
# Run the forward pass on CPU.
# NOTE(review): presumably done so the failure shows up as a readable Python
# exception rather than a GPU-side error; confirm.
# This will reveal the next issue: incorrect number of labels
cpu_model = trainer.model.cpu()
outputs = cpu_model(**batch)

In [None]:
# Display how many labels the model's classification head was configured with.
# The model was loaded without num_labels, so it uses the default (2),
# but MNLI has 3 classes.
trainer.model.config.num_labels

In [None]:
# Fix #3: Specify the correct number of labels (3 for MNLI)
# The model needs to match the number of classes in the dataset
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

# Checkpoint shared by the tokenizer and the model
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# GLUE / MNLI splits (untokenized)
raw_datasets = load_dataset(path="glue", name="mnli")


def preprocess_function(examples):
    """Tokenize premise/hypothesis pairs from a batch, truncating long inputs."""
    first_sentences = examples["premise"]
    second_sentences = examples["hypothesis"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Tokenize every split in batches
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# FIXED: Specify num_labels=3 for the three MNLI classes
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

args = TrainingArguments(
    "distilbert-finetuned-mnli",  # output directory (plain string: nothing to interpolate)
    eval_strategy="epoch",  # evaluate once per epoch (`evaluation_strategy` in older transformers releases)
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# GLUE metric for the MNLI task
metric = evaluate.load("glue", "mnli")


def compute_metrics(eval_pred):
    """Send (predictions, labels) to the MNLI metric without any conversion.

    The missing conversion is deliberate at this stage; it is debugged and
    fixed in later cells.
    """
    unconverted_predictions, expected = eval_pred
    return metric.compute(predictions=unconverted_predictions, references=expected)


# Pad the examples of each batch to a common length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer around the 3-label model; training is not started in this cell
# so the individual steps can be checked by hand below.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Test the forward pass with the corrected model.
# Now the model should work with the 3-class MNLI labels.
# next(iter(...)) replaces the for/break loop so that an empty dataloader raises
# StopIteration instead of silently reusing the `batch` built for the previous trainer.
batch = next(iter(trainer.get_train_dataloader()))

outputs = trainer.model.cpu()(**batch)

In [None]:
# Test GPU training with proper device management
# Move batch to GPU if available, otherwise use CPU
import torch

# Pick the GPU when torch can see one, otherwise stay on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move every tensor of the batch, then the model, to that device before the forward pass
batch = {name: tensor.to(device) for name, tensor in batch.items()}
model_on_device = trainer.model.to(device)
outputs = model_on_device(**batch)

In [None]:
# Test the backward pass (gradient computation).
# Takes the loss from the forward pass of the previous cell and backpropagates it;
# if this runs, the loss is attached to the model's computation graph.
loss = outputs.loss
loss.backward()

In [None]:
# Test the optimizer step.
# create_optimizer() sets up trainer.optimizer; step() then applies the gradients
# computed by the backward pass in the previous cell.
trainer.create_optimizer()
trainer.optimizer.step()

In [None]:
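# Note: the training steps themselves now run, but a full run takes too long for this demonstration.
# With per-epoch evaluation enabled, it would also stop at the first evaluation pass,
# because compute_metrics is not fixed yet (see the following cells).
# trainer.train()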

In [None]:
# Run the evaluation loop with the current (still incorrect) compute_metrics function.
# This will fail: compute_metrics forwards the model outputs to the metric
# without converting logits to class predictions.
trainer.evaluate()

In [None]:
# Reproduce one evaluation step by hand to understand the issue.
# next(iter(...)) replaces the for/break loop so that an empty dataloader raises
# StopIteration instead of silently evaluating the training `batch` from earlier cells.
batch = next(iter(trainer.get_eval_dataloader()))

# `device` was chosen in an earlier cell; inputs must be on the same device as the model
batch = {k: v.to(device) for k, v in batch.items()}

# No gradients are needed to get predictions for debugging
with torch.no_grad():
    outputs = trainer.model(**batch)

In [None]:
# Call the current compute_metrics directly on this batch to see what's wrong.
# The logits are passed through untouched, which is the problem:
# they still need to be converted to class predictions.
predictions = outputs.logits.cpu().numpy()
labels = batch["labels"].cpu().numpy()

eval_pred = (predictions, labels)
compute_metrics(eval_pred)

In [None]:
# Compare the shapes of the predictions and the labels.
# Predictions are logits (raw scores), not class indices, so their shape
# is expected to differ from the shape of the labels.
predictions.shape, labels.shape

In [None]:
# Fix #4: Convert logits to predictions in compute_metrics
# The metric expects class indices, not raw logits
import numpy as np


def compute_metrics(eval_pred):
    """Compute the MNLI metric from logits and reference labels.

    eval_pred unpacks into (logits, labels). The logits are reduced to one
    class index per example with argmax along axis 1 before scoring.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)


# Test the corrected function on the same predictions and labels as before
compute_metrics((predictions, labels))

In [None]:
# Complete working solution with all fixes applied
# This version should work correctly for MNLI fine-tuning
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

# Checkpoint shared by the tokenizer, the model and the padding collator
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# GLUE / MNLI splits (untokenized)
raw_datasets = load_dataset(path="glue", name="mnli")


def preprocess_function(examples):
    """Tokenize each premise together with its hypothesis, truncating long inputs."""
    premise_batch = examples["premise"]
    hypothesis_batch = examples["hypothesis"]
    return tokenizer(premise_batch, hypothesis_batch, truncation=True)


# Tokenize every split in batches
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# FIXED: Correct number of labels for MNLI
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

args = TrainingArguments(
    "distilbert-finetuned-mnli",  # output directory (plain string: nothing to interpolate)
    eval_strategy="epoch",  # evaluate once per epoch (`evaluation_strategy` in older transformers releases)
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# GLUE metric for the MNLI task
metric = evaluate.load("glue", "mnli")


# FIXED: Proper conversion from logits to predictions
def compute_metrics(eval_pred):
    """Turn logits into class indices (argmax over axis 1) and score them with the MNLI metric."""
    logits, gold = eval_pred
    class_ids = np.argmax(logits, axis=1)
    return metric.compute(predictions=class_ids, references=gold)


# FIXED: padding collator so variable-length examples can be batched
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer with every fix from this notebook applied
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],  # FIXED: use tokenized data
    eval_dataset=tokenized_datasets["validation_matched"],  # FIXED: use tokenized data
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # FIXED: handle padding
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Sanity check: run 20 optimization steps on ONE fixed batch.
# The same batch is reused on every iteration, so this only checks that the loss
# can be driven down on that batch; it is not a pass over the dataset.
# next(iter(...)) replaces the for/break loop so that an empty dataloader raises
# StopIteration instead of silently training on a `batch` left over from an earlier cell.
batch = next(iter(trainer.get_train_dataloader()))

# `device` was chosen in an earlier cell
batch = {k: v.to(device) for k, v in batch.items()}
trainer.create_optimizer()

for _ in range(20):
    outputs = trainer.model(**batch)
    loss = outputs.loss
    loss.backward()
    trainer.optimizer.step()
    # Clear the gradients so they do not accumulate across steps
    trainer.optimizer.zero_grad()

In [None]:
# Score the model on the batch it was just trained on, using the fixed compute_metrics.
# Gradients are not needed for this forward pass.
with torch.no_grad():
    outputs = trainer.model(**batch)

preds = outputs.logits
labels = batch["labels"]

# The metric works on NumPy arrays, so both tensors go to CPU first
preds_np = preds.cpu().numpy()
labels_np = labels.cpu().numpy()
compute_metrics((preds_np, labels_np))