
# Sentence Classification (Sentiment) — Linear Probe vs Full Fine-Tuning on **TweetEval: Sentiment** (PyTorch Training Loop)

**Audience:** 4th-year Computer Science students  
**Task:** Multi-class sentiment classification (**negative / neutral / positive**) on the **TweetEval** benchmark (subset: `sentiment`).

This version is identical to the previous notebook **except** the training sections now use a **traditional PyTorch loop** (no `Trainer`).

You'll build and compare two approaches using a Hugging Face encoder:
1. **Linear Probe (Frozen Encoder):** Freeze the transformer encoder and train only a small classification head.
2. **Full Fine-Tuning:** Unfreeze the encoder and fine-tune end-to-end.

We'll evaluate both on the same test set and visualize how their accuracy and macro-F1 compare.



## 0) Setup & Reproducibility


In [None]:

# If needed, uncomment to install:
# %pip install -U transformers datasets accelerate evaluate scikit-learn matplotlib

import os, random, time, json, math
import numpy as np

import evaluate
import torch
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, get_linear_schedule_with_warmup)

from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader

SEED = 42


def set_seed(seed=SEED):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (CPU and all CUDA devices), then sets ``cudnn.deterministic = True`` and
    ``cudnn.benchmark = False``.

    Args:
        seed: Integer seed handed to every generator. Defaults to ``SEED``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    # cuDNN flags are set as a pair so kernel selection does not vary between runs.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed()
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device



## 1) Configuration


In [None]:

# Single source of truth for every knob used by the cells in this notebook.
CONFIG = dict(
    # --- data ---
    dataset_name="tweet_eval",
    dataset_subset="sentiment",  # 3-way: negative(0), neutral(1), positive(2)
    text_col="text",
    label_col="label",
    labels=["negative", "neutral", "positive"],
    # --- encoder / tokenization ---
    model_name="distilbert-base-uncased",
    max_length=128,
    # --- batching ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # --- optimization ---
    epochs_probe=2,               # linear-probe training epochs
    epochs_finetune=3,            # full finetune epochs
    learning_rate_probe=5e-4,     # higher since only head trains
    learning_rate_finetune=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    # --- run size / outputs ---
    subset_fraction=0.3,          # None for full data; use fraction like 0.3 for speed
    output_dir="checkpoints_tweeteval_sentiment_ptloop",
)
print(json.dumps(CONFIG, indent=2))



## 2) Load the **TweetEval: Sentiment** Dataset


In [None]:

raw = load_dataset(CONFIG["dataset_name"], CONFIG["dataset_subset"])

# Optionally downsample for a quick demo run
subset_fraction = CONFIG["subset_fraction"]
if subset_fraction is not None and 0 < subset_fraction < 1:
    def take_fraction(dset, frac):
        """Return a seeded random sample holding roughly ``frac`` of ``dset``.

        At least 30 rows are kept when the split has that many; the count is
        clamped to ``len(dset)`` so small splits are returned whole instead
        of failing on an out-of-range ``select``.
        """
        n = min(len(dset), max(30, int(len(dset) * frac)))
        return dset.shuffle(seed=SEED).select(range(n))
    raw = DatasetDict({
        "train": take_fraction(raw["train"], subset_fraction),
        "validation": take_fraction(raw["validation"], subset_fraction),
        "test": raw["test"]  # keep full test for better generalization measurement
    })

raw



## 3) Tokenization


In [None]:

tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"], use_fast=True)


def tokenize_fn(batch):
    """Tokenize the text column of a batch, truncating to ``CONFIG["max_length"]``.

    No padding is applied here; ``data_collator`` pads each batch when the
    DataLoader assembles it.
    """
    texts = batch[CONFIG["text_col"]]
    return tokenizer(texts, truncation=True, max_length=CONFIG["max_length"])


# Drop every raw column except the text and the label.
remove_cols = [
    name
    for name in raw["train"].column_names
    if name != CONFIG["text_col"] and name != CONFIG["label_col"]
]
tokenized = raw.map(tokenize_fn, batched=True, remove_columns=remove_cols)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

label_names = CONFIG["labels"]
num_labels = len(label_names)

# Set format for PyTorch Dataloaders: only these columns are returned, as torch tensors
columns = ["input_ids", "attention_mask"] + [CONFIG["label_col"]]
tokenized = tokenized.with_format(type="torch", columns=columns)



## 4) Metrics


In [None]:

# Metric objects are loaded once and reused by every evaluation call.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics_from_preds(preds, labels):
    """Compute accuracy and macro-averaged F1 for predicted vs. reference labels.

    Args:
        preds: Predicted class ids.
        labels: Reference class ids, aligned with ``preds``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    acc_result = accuracy.compute(predictions=preds, references=labels)
    f1_result = f1.compute(predictions=preds, references=labels, average="macro")
    return {"accuracy": acc_result["accuracy"], "macro_f1": f1_result["f1"]}



### Helper: Confusion Matrix


In [None]:

def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix", labels=None):
    """Draw an annotated confusion matrix with matplotlib.

    Args:
        y_true: Reference class ids.
        y_pred: Predicted class ids, aligned with ``y_true``.
        title: Title shown above the plot.
        labels: Display names for the classes. When given, class ids are
            taken to be ``0 .. len(labels) - 1`` in that order. When ``None``,
            the classes are the sorted distinct values found in ``y_true`` and
            ``y_pred``, and their string forms are used as tick labels.
    """
    if labels is None:
        # Use the ids that actually occur so the matrix rows/columns match the
        # tick labels even when ids are not the contiguous range 0..k-1.
        class_ids = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)])).tolist()
        labels = [str(c) for c in class_ids]
    else:
        class_ids = list(range(len(labels)))

    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    fig, ax = plt.subplots()
    ax.imshow(cm)  # default colormap (no custom colors)
    ticks = range(len(labels))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(title)

    # Annotate counts
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, cm[i, j], ha="center", va="center")

    plt.show()



## 5) Model Builder (unchanged)


In [None]:

def build_model(freeze_encoder=True):
    """Load the configured checkpoint with a fresh sequence-classification head.

    Args:
        freeze_encoder: When True, set ``requires_grad = False`` on every
            parameter of the encoder submodule so only the classification
            head is trained. When False, all parameters stay trainable.

    Returns:
        The model built from ``CONFIG["model_name"]`` with ``len(label_names)``
        output labels.
    """
    import warnings

    model = AutoModelForSequenceClassification.from_pretrained(
        CONFIG["model_name"], num_labels=len(label_names)
    )
    if not freeze_encoder:
        return model

    # Resolve the encoder through the attribute name the model class declares
    # for its backbone, then fall back to the historically supported names.
    # This lets architectures such as ELECTRA be frozen too.
    candidate_names = [getattr(model, "base_model_prefix", None),
                       "distilbert", "bert", "roberta", "deberta"]
    encoder = None
    for name in candidate_names:
        if not name:
            continue
        encoder = getattr(model, name, None)
        if encoder is not None:
            break

    if encoder is None or encoder is model:
        # Keep the best-effort behavior, but never pretend the encoder is frozen.
        warnings.warn(
            f"Could not locate the encoder of {type(model).__name__}; "
            "freeze_encoder=True had no effect and all parameters remain trainable."
        )
        return model

    for p in encoder.parameters():
        p.requires_grad = False
    return model



## 6) PyTorch Training Utilities (New)


In [None]:

from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader

def make_dataloaders(tokenized, split, batch_size, shuffle, collate_fn):
    """Build a ``DataLoader`` over one split of a split-keyed dataset mapping.

    Args:
        tokenized: Mapping from split name to dataset.
        split: Key of the split to load.
        batch_size: Number of examples per batch.
        shuffle: Whether to reshuffle the data every epoch.
        collate_fn: Callable that merges a list of examples into a batch.

    Returns:
        A ``DataLoader`` over ``tokenized[split]``.
    """
    split_dataset = tokenized[split]
    loader_kwargs = {"batch_size": batch_size, "shuffle": shuffle, "collate_fn": collate_fn}
    return DataLoader(split_dataset, **loader_kwargs)

def evaluate_model(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode and score its predictions.

    Args:
        model: Sequence-classification model that accepts ``labels=`` and
            returns an output with ``.loss`` and ``.logits``.
        dataloader: Iterable of batches holding the model inputs plus a label
            tensor under ``"labels"`` or ``CONFIG["label_col"]``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, metrics, all_preds, all_labels)``: the per-example
        mean loss, the dict from ``compute_metrics_from_preds``, and NumPy
        arrays of predicted and reference class ids.

    Raises:
        ValueError: If the dataloader yields no examples.
    """
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0
    n_examples = 0
    with torch.no_grad():
        for batch in dataloader:
            # DataCollatorWithPadding renames "label" to "labels", so look for
            # the collated key first and fall back to the configured column.
            label_key = "labels" if "labels" in batch else CONFIG["label_col"]
            labels = batch[label_key].to(device)
            inputs = {k: v.to(device) for k, v in batch.items() if k != label_key}
            outputs = model(**inputs, labels=labels)
            batch_size = labels.size(0)
            # outputs.loss is a batch mean; weight it by batch size so the
            # final figure is a true per-example average.
            total_loss += outputs.loss.item() * batch_size
            n_examples += batch_size
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    if n_examples == 0:
        raise ValueError("evaluate_model received a dataloader that yielded no examples")
    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)
    avg_loss = total_loss / n_examples
    metrics = compute_metrics_from_preds(all_preds, all_labels)
    return avg_loss, metrics, all_preds, all_labels

def train_model(model, train_loader, val_loader, device, epochs, lr, weight_decay, warmup_ratio):
    """Train ``model`` with AdamW and a linear warmup/decay schedule.

    After every epoch the model is scored on ``val_loader``; the weights from
    the epoch with the highest validation macro-F1 are restored before
    returning.

    Args:
        model: Sequence-classification model that accepts ``labels=`` and
            returns an output with ``.loss``.
        train_loader: Batches holding the model inputs plus a label tensor
            under ``"labels"`` or ``CONFIG["label_col"]``.
        val_loader: Validation batches passed to ``evaluate_model``.
        device: Device the model and batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
        lr: Peak learning rate.
        weight_decay: AdamW weight decay.
        warmup_ratio: Fraction of all optimizer steps used for warmup
            (at least one step is always used).

    Returns:
        The same model object, loaded with the best validation weights.
    """
    model.to(device)
    # Only parameters that still require gradients are optimized, so a frozen
    # encoder stays untouched.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=lr, weight_decay=weight_decay)
    total_steps = len(train_loader) * epochs
    warmup_steps = max(1, int(warmup_ratio * total_steps))
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    best_state = None
    best_macro_f1 = -1.0

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        n_examples = 0
        for batch in train_loader:
            # DataCollatorWithPadding renames "label" to "labels", so look for
            # the collated key first and fall back to the configured column.
            label_key = "labels" if "labels" in batch else CONFIG["label_col"]
            labels = batch[label_key].to(device)
            inputs = {k: v.to(device) for k, v in batch.items() if k != label_key}
            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()  # the schedule advances once per optimizer step
            batch_size = labels.size(0)
            running_loss += loss.item() * batch_size
            n_examples += batch_size

        # Average over the examples actually seen this epoch.
        train_loss = running_loss / max(n_examples, 1)
        val_loss, val_metrics, _, _ = evaluate_model(model, val_loader, device)

        print(f"Epoch {epoch:02d} | Train Loss {train_loss:.4f} | Val Loss {val_loss:.4f} | "
              f"Val Acc {val_metrics['accuracy']:.4f} | Val Macro-F1 {val_metrics['macro_f1']:.4f}")

        if val_metrics["macro_f1"] > best_macro_f1:
            best_macro_f1 = val_metrics["macro_f1"]
            # Snapshot on the CPU so the copy does not take up device memory.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)
    return model



## 7) Baseline: **Linear Probe** (Frozen Encoder) — PyTorch Loop (Changed)


In [None]:

# Linear probe: the encoder is frozen, so only the classification head is trained.
probe_model = build_model(freeze_encoder=True)

# One loader per split; only the training split is shuffled.
train_loader, val_loader, test_loader = (
    make_dataloaders(tokenized, split_name, CONFIG[batch_key], do_shuffle, data_collator)
    for split_name, batch_key, do_shuffle in (
        ("train", "per_device_train_batch_size", True),
        ("validation", "per_device_eval_batch_size", False),
        ("test", "per_device_eval_batch_size", False),
    )
)

t0 = time.time()
probe_model = train_model(
    model=probe_model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    epochs=CONFIG["epochs_probe"],
    lr=CONFIG["learning_rate_probe"],
    weight_decay=CONFIG["weight_decay"],
    warmup_ratio=CONFIG["warmup_ratio"],
)
probe_train_time = time.time() - t0

# Score the restored best-validation weights on the held-out test split.
probe_test_loss, probe_test_metrics, probe_preds, y_test = evaluate_model(
    probe_model, test_loader, device
)

probe_test = {f"eval_{name}": probe_test_metrics[name] for name in ("accuracy", "macro_f1")}
print("Probe Test:", probe_test, "| Train time (s):", round(probe_train_time, 2))

plot_confusion_matrix(
    y_test,
    probe_preds,
    title="Frozen Encoder (Linear Probe) — Test",
    labels=label_names,
)
print(classification_report(y_test, probe_preds, target_names=label_names))



## 8) **Full Fine-Tuning** (Encoder + Head) — PyTorch Loop (Changed)


In [None]:

# Full fine-tuning: every parameter (encoder and head) stays trainable.
ft_model = build_model(freeze_encoder=False)

# Reuses the loaders built for the probe run; only epochs and learning rate differ.
t0 = time.time()
ft_model = train_model(
    model=ft_model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    epochs=CONFIG["epochs_finetune"],
    lr=CONFIG["learning_rate_finetune"],
    weight_decay=CONFIG["weight_decay"],
    warmup_ratio=CONFIG["warmup_ratio"],
)
ft_train_time = time.time() - t0

# Score the restored best-validation weights on the held-out test split.
ft_test_loss, ft_test_metrics, ft_preds, y_test2 = evaluate_model(
    ft_model, test_loader, device
)

ft_test = {f"eval_{name}": ft_test_metrics[name] for name in ("accuracy", "macro_f1")}
print("Finetune Test:", ft_test, "| Train time (s):", round(ft_train_time, 2))

plot_confusion_matrix(
    y_test2,
    ft_preds,
    title="Full Fine-Tuned — Test",
    labels=label_names,
)
print(classification_report(y_test2, ft_preds, target_names=label_names))



## 9) Compare Results (unchanged logic)


In [None]:

def metric(d, key):
    """Return ``d[key]`` as a float, or NaN when the key is missing."""
    value = d.get(key, "nan")
    return float(value)


def _show_metric_bars(values, title, ylabel):
    """Draw one bar per entry of ``labels_disp`` on a fixed 0-1 y-axis."""
    plt.figure()
    plt.bar(labels_disp, values)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.ylim(0, 1.0)
    plt.show()


probe_acc, probe_f1m = metric(probe_test, "eval_accuracy"), metric(probe_test, "eval_macro_f1")
ft_acc, ft_f1m = metric(ft_test, "eval_accuracy"), metric(ft_test, "eval_macro_f1")

print(f"Probe — Test Accuracy: {probe_acc:.4f} | Macro F1: {probe_f1m:.4f}")
print(f"FT    — Test Accuracy: {ft_acc:.4f} | Macro F1: {ft_f1m:.4f}")
# Signed deltas: a positive value means fine-tuning scored higher than the probe.
print(f"Δ Accuracy: {ft_acc - probe_acc:+.4f}")
print(f"Δ Macro F1: {ft_f1m - probe_f1m:+.4f}")

labels_disp = ["Probe (Frozen)", "Finetuned"]
accs = [probe_acc, ft_acc]
f1s = [probe_f1m, ft_f1m]

_show_metric_bars(accs, "Test Accuracy", "Accuracy")
_show_metric_bars(f1s, "Test Macro-F1", "Macro-F1")



## 10) Discussion & Extensions



- **Try other datasets:** `imdb`, `amazon_polarity`, `yelp_polarity`, or other `tweet_eval` tasks.
- **Try other encoders:** `bert-base-uncased`, `roberta-base`, `google/electra-small-discriminator`.
- **Compute budget:** Adjust `subset_fraction` for CPU demos vs. full GPU runs.
- **PEFT:** Explore LoRA/adapters to approach full-FT accuracy with less compute.
- **Error analysis:** Inspect misclassifications; per-class precision/recall; calibration.
- **Robustness:** Evaluate on different time slices or domains.

> ✍️ **Short write-up prompt:** Explain why full fine-tuning typically outperforms a frozen encoder, and whether your results agree. Relate your answer to representation learning and task/domain adaptation.
