# Processing the data (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install required libraries for Transformers, datasets, and evaluation
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Basic PyTorch training setup for sequence classification
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before - load pre-trained BERT model and tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new - add labels for training (1 = positive sentiment for both).
# With a "labels" entry in the batch, the model output carries a loss.
batch["labels"] = torch.tensor([1, 1])

# Set up optimizer and perform one training step.
# transformers.AdamW was deprecated and later removed from the library, so the
# PyTorch implementation is used instead. Its default hyperparameters are not
# identical to the removed class (e.g. weight decay), which does not matter for
# this single demonstration step.
optimizer = torch.optim.AdamW(model.parameters())
loss = model(**batch).loss  # Forward pass and calculate loss
loss.backward()  # Compute gradients
optimizer.step()  # Update model weights

In [None]:
# Download (or reuse from the local cache) the MRPC task of the GLUE benchmark.
# MRPC = Microsoft Research Paraphrase Corpus: pairs of sentences labeled as
# paraphrases of each other or not.
from datasets import load_dataset

raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

In [None]:
# Pull out the training split and display its first example.
# Each example contains two sentences and a label (0=not equivalent, 1=equivalent)
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

In [None]:
# Display the schema of the training split: one entry per column with its data type.
# The label column is a ClassLabel whose names map 0=not_equivalent, 1=equivalent
raw_train_dataset.features

In [None]:
# Initialize the tokenizer and tokenize each sentence column on its own.
# This yields two independent sets of encodings rather than sentence *pairs*,
# so it is shown only as a contrast with the pair-wise approach used afterwards.
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# list(...) materializes the column first: recent Datasets releases return a lazy
# Column object from dataset["name"], while the tokenizer only accepts
# str / list / tuple inputs. On older releases the value is already a list.
tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))

In [None]:
# Preferred: hand both sentences to the tokenizer in a single call so they are
# encoded as one pair, i.e. [CLS] sentence1 [SEP] sentence2 [SEP]
inputs = tokenizer(
    text="This is the first sentence.",
    text_pair="This is the second one.",
)
inputs

In [None]:
# Map the pair's token IDs back to their token strings to make the layout visible.
# Notice [CLS] at the start, and [SEP] both between and after the two sentences
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

In [None]:
# Tokenize the entire training split as sentence pairs in one call.
# padding=True pads every sequence to a common length; truncation=True cuts
# sequences that are too long for the model.
# list(...) materializes each column first: recent Datasets releases return a lazy
# Column object from dataset["name"], while the tokenizer only accepts
# str / list / tuple inputs. On older releases the value is already a list.
tokenized_dataset = tokenizer(
    list(raw_datasets["train"]["sentence1"]),
    list(raw_datasets["train"]["sentence2"]),
    padding=True,
    truncation=True,
)

In [None]:
# Tokenization helper meant to be passed to the dataset's map() method
def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" fields of `example` as sentence pairs.

    `example` is indexed by column name; with batched mapping each field holds
    several sentences at once. Truncation is enabled, padding is not.
    Returns whatever the module-level `tokenizer` produces for the pair(s).
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [None]:
# Run the tokenization helper over every split of the dataset
# (train, validation and test). batched=True hands the helper many examples
# per call instead of one at a time, which is faster.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)
tokenized_datasets

In [None]:
# Set up the data collator that performs dynamic padding.
# It pads sequences to a common length within each batch, which is more
# efficient than padding the whole dataset to one global length.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Build a small sample (first 8 training examples) to demonstrate dynamic padding.
# The "idx", "sentence1" and "sentence2" columns are dropped so that only the
# tokenized features needed for training remain.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ["idx", "sentence1", "sentence2"]
}
[len(token_ids) for token_ids in samples["input_ids"]]  # Sequence lengths differ before padding

In [None]:
# Feed the sample through the data collator to obtain one padded batch.
# Every sequence now has the same length (67 tokens in this case)
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}