# Behind the pipeline (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
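# Install the Datasets, Evaluate, and Transformers (with the sentencepiece extra) libraries
# The command below runs through `uv`, so `uv` must be available in the notebook environment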
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# High-level pipeline demonstration - shows the end result we'll rebuild step by step
# The pipeline bundles tokenization, model inference, and post-processing in a single call
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task's implicit default model:
# the default may change between library versions, and the step-by-step cells in this
# notebook load this exact checkpoint, so pinning it keeps both results comparable
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
# As the last expression in the cell, the predictions for both sentences are displayed
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

In [None]:
# Step 1: Tokenization - convert text into the numeric inputs the model expects
# AutoTokenizer selects the tokenizer that matches the given checkpoint
# The checkpoint used here is DistilBERT fine-tuned for sentiment analysis (SST-2 dataset)
from transformers import AutoTokenizer

# The same checkpoint name is used for the tokenizer and for the models loaded later,
# so the text preprocessing matches what the model was trained with
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# `tokenizer` is reused in the next cell to encode the example sentences
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Tokenize the example sentences as one batch, with these preprocessing options:
# - padding=True: pads shorter sequences to match the longest one in the batch
# - truncation=True: cuts off sequences that are too long for the model
# - return_tensors="pt": returns PyTorch tensors instead of lists
# These are the same two sentences that were passed to the pipeline above
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
# `inputs` is kept around: later cells unpack it into the model with `model(**inputs)`
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

# The output contains:
# - input_ids: numerical representations of tokens
# - attention_mask: marks real tokens the model should attend to (1) vs padding (0)

In [None]:
# Step 2A: Load the base model (without task-specific head)
# AutoModel gives us the core transformer that outputs contextualized embeddings
# This is the same checkpoint but without the classification layer
from transformers import AutoModel

# Same checkpoint name as in the tokenizer cell, repeated here (presumably so the cell
# can be read and run without scrolling back)
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# NOTE: `model` is rebound in Step 2B to the variant with a classification head
model = AutoModel.from_pretrained(checkpoint)

In [None]:
# Pass tokenized inputs through the base model
# `**inputs` unpacks the tokenizer output so that input_ids and attention_mask
# are handed to the model as keyword arguments
# Output shape: [batch_size, sequence_length, hidden_size]
# - batch_size=2 (two input sentences)
# - sequence_length=16 (padded length)
# - hidden_size=768 (DistilBERT's embedding dimension)
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

# These are high-dimensional embeddings (one vector per token), not yet predictions for our task

In [None]:
# Step 2B: Load the model with the classification head
# AutoModelForSequenceClassification adds a classification layer on top of the base model
# This layer converts the embeddings into class predictions (positive/negative sentiment)
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Rebinds `model`: from here on it refers to the classification model, not the base model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Rebinds `outputs` as well: the cells below read `outputs.logits` from this call
outputs = model(**inputs)

In [None]:
# Check the output shape: [batch_size, num_classes]
# - batch_size=2 (two input sentences)
# - num_classes=2 (NEGATIVE and POSITIVE sentiment classes)
# Each row therefore holds one score per class for the corresponding sentence
print(outputs.logits.shape)

In [None]:
# Raw logits (unnormalized scores) from the model
# These numbers are not probabilities yet: they are not confined to [0, 1]
# and the values in a row do not sum to 1
# Higher values indicate a stronger prediction for that class
print(outputs.logits)

In [None]:
# Step 3: Post-processing - Convert logits to probabilities
# Softmax converts the raw scores of each sentence into probabilities that sum to 1
# dim=-1 applies softmax across the last dimension (the classes)
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

# Now we can see (column 0 = NEGATIVE, column 1 = POSITIVE, per model.config.id2label):
# First sentence: ~4% negative, ~96% positive
# Second sentence: ~99.95% negative, ~0.05% positive

In [None]:
# Show which class each output column corresponds to (index -> label name)
# This must stay the last expression in the cell so the notebook displays the mapping
model.config.id2label

# Expected output: {0: 'NEGATIVE', 1: 'POSITIVE'}