In [1]:
# [Cell 1] Setup + imports + data + tokenize + model (short)
!pip -q install "transformers>=4.44.2" "datasets>=2.19.0" "accelerate>=0.33.0" "evaluate>=0.4.2" "scikit-learn>=1.3.0"

import random, numpy as np, torch, torch.nn.functional as F
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix, classification_report

SEED=42; random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
DEVICE="cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME="distilbert-base-uncased"; ID2LABEL={0:"negative",1:"positive"}; LABEL2ID={"negative":0,"positive":1}

ds = load_dataset("imdb")
train_small = ds["train"].shuffle(seed=SEED).select(range(4000))
test_small  = ds["test"].shuffle(seed=SEED).select(range(1000))

tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
enc = lambda b: tok(b["text"], truncation=True)
train_enc = train_small.map(enc, batched=True, remove_columns=["text"])
test_enc  = test_small.map(enc,  batched=True, remove_columns=["text"])
collator = DataCollatorWithPadding(tokenizer=tok)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, id2label=ID2LABEL, label2id=LABEL2ID
).to(DEVICE)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# [Cell 2] Metrics + Trainer + Train (Corrected)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer
# Note: Assuming SEED, model, train_enc, test_enc, tok, and collator are defined in previous cells

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average="binary", pos_label=1, zero_division=0)
    return {"accuracy":acc, "precision":prec, "recall":rec, "f1":f1}

def build_args():
    # Use the contemporary 'eval_strategy' and 'save_strategy' arguments.
    # This should work for all recent versions of the transformers library.
    # If this still fails, your version is extremely old, and you should update it.
    try:
        return TrainingArguments(
            output_dir="imdb-distilbert-run",
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            learning_rate=5e-5,
            weight_decay=0.01,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            logging_steps=50,
            report_to=[],
            seed=SEED,
        )
    except TypeError as e:
        # Fallback for extremely rare versions, although the above should fix it.
        # Print the error for debugging, but use a stripped-down version.
        print(f"Warning: Modern arguments failed. Falling back to basic args. Error: {e}")
        return TrainingArguments(
             output_dir="imdb-distilbert-run",
             num_train_epochs=3,
             per_device_train_batch_size=16,
             per_device_eval_batch_size=32,
             learning_rate=5e-5,
             weight_decay=0.01,
             logging_steps=50,
             seed=SEED,
             report_to=[]
        )

args = build_args()

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=test_enc,
    processing_class=tok,
    data_collator=collator,
    compute_metrics=compute_metrics
)

trainer.train()
# Always run a final eval to satisfy the rubric
print("Final eval:", trainer.evaluate(test_enc))

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3131,0.389338,0.825,0.745683,0.973361,0.844444
2,0.1315,0.348998,0.888,0.835714,0.959016,0.89313
3,0.049,0.414403,0.901,0.883629,0.918033,0.900503


Final eval: {'eval_loss': 0.41440337896347046, 'eval_accuracy': 0.901, 'eval_precision': 0.883629191321499, 'eval_recall': 0.9180327868852459, 'eval_f1': 0.9005025125628141, 'eval_runtime': 14.6444, 'eval_samples_per_second': 68.286, 'eval_steps_per_second': 2.185, 'epoch': 3.0}


In [3]:
# [Cell 3] Extra Effort: ROC-AUC + confusion matrix + report
import numpy as np
probs_all, labels_all = [], []
model.eval()
for batch in torch.utils.data.DataLoader(test_enc, batch_size=64, collate_fn=collator):
    labels_all.extend(batch["labels"].numpy().tolist())
    batch = {k:v.to(DEVICE) for k,v in batch.items()}
    with torch.no_grad(): logits = model(**{k:v for k,v in batch.items() if k!="labels"}).logits
    probs = torch.softmax(logits, dim=-1)[:,1].cpu().numpy()
    probs_all.append(probs)
probs = np.concatenate(probs_all); labels = np.array(labels_all)
preds = (probs >= 0.5).astype(int)

print("ROC-AUC:", roc_auc_score(labels, probs))
print("Confusion Matrix:\n", confusion_matrix(labels, preds))
print("\nClassification Report:\n", classification_report(labels, preds, target_names=["negative","positive"], digits=4))


ROC-AUC: 0.9703949474897542
Confusion Matrix:
 [[453  59]
 [ 40 448]]

Classification Report:
               precision    recall  f1-score   support

    negative     0.9189    0.8848    0.9015       512
    positive     0.8836    0.9180    0.9005       488

    accuracy                         0.9010      1000
   macro avg     0.9012    0.9014    0.9010      1000
weighted avg     0.9017    0.9010    0.9010      1000



In [4]:
# [Cell 4] Inference (6 provided + 5 custom = 11) + short notes
reviews = [
    "Alien is a masterpiece of sci-fi horror!",
    "Weyland-Yutani Corporation is pure evil.",
    "Prometheus answered nothing, but the special effects were impressive.",
    "The Xenomorph design is pure genius, terrifying and iconic.",
    "Alien Earth hybrids are Peter Pan's Lost Boys.",
    "The sheep eyeball gives me the heebie-jeebies."
]
custom = [
    "This sequel dragged on forever and I almost fell asleep.",           # neg
    "Surprisingly heartfelt with great pacing and sharp dialogue.",       # pos
    "The acting was fine, but the story felt uneven and confusing.",      # mixed
    "A fun popcorn ride—predictable, but I left with a smile.",           # pos-lite
    "Technically brilliant, yet emotionally distant and cold."            # mixed
]
texts = reviews + custom

def predict(text):
    enc = tok(text, return_tensors="pt", truncation=True).to(DEVICE)
    with torch.no_grad(): p = torch.softmax(model(**enc).logits, dim=-1)[0].cpu().numpy()
    i = int(p.argmax()); return ID2LABEL[i], float(p[i])

print("Predictions:")
outs=[]
for t in texts:
    lbl, sc = predict(t); outs.append((t,lbl,sc)); print(f"{lbl.upper():8} {sc:0.4f} | {t}")

print("\nExplanations (short):")
notes = [
 "Strong praise → positive",
 "Word “evil” → negative",
 "Mixed; fx praised → lean positive",
 "Praise for design → positive",
 "Metaphorical/ambiguous → varies",
 "Discomfort/heebie-jeebies → negative",
 "Dragged/slept → negative",
 "Heartfelt/pacing/dialogue → positive",
 "Acting ok, story uneven → mixed/neg",
 "Fun/popcorn/smile → positive",
 "Technical praise vs emotion → mixed"
]
for i,(t,lbl,sc) in enumerate(outs,1):
    print(f"{i:02d}. {lbl} ({sc:0.2f}) — {notes[i-1]}")


Predictions:
POSITIVE 0.9979 | Alien is a masterpiece of sci-fi horror!
NEGATIVE 0.9006 | Weyland-Yutani Corporation is pure evil.
POSITIVE 0.9969 | Prometheus answered nothing, but the special effects were impressive.
POSITIVE 0.9980 | The Xenomorph design is pure genius, terrifying and iconic.
POSITIVE 0.6054 | Alien Earth hybrids are Peter Pan's Lost Boys.
NEGATIVE 0.7587 | The sheep eyeball gives me the heebie-jeebies.
NEGATIVE 0.9855 | This sequel dragged on forever and I almost fell asleep.
POSITIVE 0.9980 | Surprisingly heartfelt with great pacing and sharp dialogue.
NEGATIVE 0.9983 | The acting was fine, but the story felt uneven and confusing.
POSITIVE 0.9733 | A fun popcorn ride—predictable, but I left with a smile.
POSITIVE 0.9969 | Technically brilliant, yet emotionally distant and cold.

Explanations (short):
01. positive (1.00) — Strong praise → positive
02. negative (0.90) — Word “evil” → negative
03. positive (1.00) — Mixed; fx praised → lean positive
04. positive (1.00

In [5]:
# [Cell 5] [Extra Effort #2] LR sweep (1 epoch, small subset) -> quick comparison table
import os, random, numpy as np, torch, inspect
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

# env: no external logging
os.environ["WANDB_DISABLED"]="true"; os.environ["WANDB_MODE"]="offline"
SEED=42; random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
DEVICE="cuda" if torch.cuda.is_available() else "cpu"

# tokenizer/dataset (reuse if exists)
try:
    tok
except NameError:
    tok = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)
try:
    ds
except NameError:
    ds = load_dataset("imdb")

# small, fast slices
small_train = ds["train"].shuffle(seed=SEED).select(range(1000))
small_test  = ds["test"].shuffle(seed=SEED).select(range(500))
enc = lambda b: tok(b["text"], truncation=True)
train_small_enc = small_train.map(enc, batched=True, remove_columns=["text"])
test_small_enc  = small_test.map(enc,  batched=True, remove_columns=["text"])
collator = DataCollatorWithPadding(tokenizer=tok)

# metrics
def _compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average="binary", pos_label=1, zero_division=0)
    return {"accuracy":acc, "precision":prec, "recall":rec, "f1":f1}

# sweep
lrs = [3e-5, 5e-5, 8e-5]
rows = []
for lr in lrs:
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to(DEVICE)
    # version-agnostic TrainingArguments
    ta_kwargs = dict(
        output_dir=f"imdb-probe-lr-{lr}",
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=lr,
        weight_decay=0.01,
        seed=SEED,
        report_to=[],
        logging_steps=50,
    )
    # add evaluation_strategy if supported
    if "evaluation_strategy" in inspect.signature(TrainingArguments.__init__).parameters:
        ta_kwargs["evaluation_strategy"] = "epoch"
    args = TrainingArguments(**ta_kwargs)

    tr_kwargs = dict(
        model=model, args=args,
        train_dataset=train_small_enc, eval_dataset=test_small_enc,
        data_collator=collator, compute_metrics=_compute_metrics
    )
    # processing_class vs tokenizer
    if "processing_class" in inspect.signature(Trainer.__init__).parameters:
        tr_kwargs["processing_class"] = tok
    elif "tokenizer" in inspect.signature(Trainer.__init__).parameters:
        tr_kwargs["tokenizer"] = tok

    trainer = Trainer(**tr_kwargs)
    trainer.train()
    metrics = trainer.evaluate(test_small_enc)
    rows.append({
        "lr": lr,
        "eval_loss": float(metrics.get("eval_loss", float("nan"))),
        "eval_accuracy": float(metrics.get("eval_accuracy", float("nan"))),
        "eval_precision": float(metrics.get("eval_precision", float("nan"))),
        "eval_recall": float(metrics.get("eval_recall", float("nan"))),
        "eval_f1": float(metrics.get("eval_f1", float("nan")))
    })

# print compact table
df = pd.DataFrame(rows, columns=["lr","eval_loss","eval_accuracy","eval_precision","eval_recall","eval_f1"])
print("\n[Extra Effort #2] LR sweep (1 epoch, 1k/500 subset)")
print(df.to_string(index=False))


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.6504


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5551


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5378



[Extra Effort #2] LR sweep (1 epoch, 1k/500 subset)
     lr  eval_loss  eval_accuracy  eval_precision  eval_recall  eval_f1
0.00003   0.506235          0.828        0.841880     0.800813 0.820833
0.00005   0.318167          0.878        0.849057     0.914634 0.880626
0.00008   0.349000          0.854        0.805654     0.926829 0.862004


In [6]:
# [Cell 6] [Extra Effort #3] Confidence calibration: ECE(10 bins) + reliability table
import numpy as np, torch

DEVICE="cuda" if torch.cuda.is_available() else "cpu"

# choose eval split (prefer the small subset from #2 for speed)
try:
    eval_enc = test_small_enc
except NameError:
    eval_enc = test_enc  # fall back to your main test encoding

# choose model (reuse trained model if present; else load baseline)
try:
    model
except NameError:
    from transformers import AutoModelForSequenceClassification
    try:
        model = AutoModelForSequenceClassification.from_pretrained("imdb-distilbert-best").to(DEVICE)
    except Exception:
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2).to(DEVICE)

# collator
try:
    collator
except NameError:
    from transformers import AutoTokenizer, DataCollatorWithPadding
    tok = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)
    collator = DataCollatorWithPadding(tokenizer=tok)

# collect probabilities and labels
model.eval()
probs_all, labels_all = [], []
for batch in torch.utils.data.DataLoader(eval_enc, batch_size=64, collate_fn=collator):
    labels_all.extend(batch["labels"].numpy().tolist())
    batch = {k:v.to(DEVICE) for k,v in batch.items()}
    with torch.no_grad():
        logits = model(**{k:v for k,v in batch.items() if k!="labels"}).logits
    p = torch.softmax(logits, dim=-1)[:,1].cpu().numpy()
    probs_all.append(p)

probs = np.concatenate(probs_all)
labels = np.array(labels_all)
preds = (probs >= 0.5).astype(int)
correct = (preds == labels).astype(int)

# ECE with 10 equal-width bins
bins = np.linspace(0.0, 1.0, 11)
ece = 0.0
rows = []
for b in range(10):
    lo, hi = bins[b], bins[b+1]
    mask = (probs >= lo) & (probs < hi) if b < 9 else (probs >= lo) & (probs <= hi)
    if mask.sum() == 0:
        rows.append({"bin":[float(lo),float(hi)], "count":0, "conf_mean":np.nan, "acc_mean":np.nan, "gap":np.nan})
        continue
    conf_mean = probs[mask].mean()
    acc_mean = correct[mask].mean()
    gap = abs(acc_mean - conf_mean)
    ece += (mask.mean()) * gap
    rows.append({"bin":[float(lo),float(hi)], "count":int(mask.sum()), "conf_mean":float(conf_mean), "acc_mean":float(acc_mean), "gap":float(gap)})

print(f"\n[Extra Effort #3] ECE (10 bins): {ece:.4f}")
print("Reliability table (bin, count, mean confidence, mean accuracy, |gap|):")
for r in rows:
    print(f"{r['bin']}  n={r['count']:4d}  conf={np.nan if np.isnan(r['conf_mean']) else r['conf_mean']:.3f}  "
          f"acc={np.nan if np.isnan(r['acc_mean']) else r['acc_mean']:.3f}  gap={np.nan if np.isnan(r['gap']) else r['gap']:.3f}")



[Extra Effort #3] ECE (10 bins): 0.3983
Reliability table (bin, count, mean confidence, mean accuracy, |gap|):
[0.0, 0.1]  n= 148  conf=0.047  acc=0.980  gap=0.933
[0.1, 0.2]  n=  30  conf=0.161  acc=0.800  gap=0.639
[0.2, 0.30000000000000004]  n=  16  conf=0.252  acc=0.750  gap=0.498
[0.30000000000000004, 0.4]  n=  12  conf=0.344  acc=0.750  gap=0.406
[0.4, 0.5]  n=  11  conf=0.448  acc=0.818  gap=0.370
[0.5, 0.6000000000000001]  n=   8  conf=0.545  acc=0.500  gap=0.045
[0.6000000000000001, 0.7000000000000001]  n=  10  conf=0.654  acc=0.200  gap=0.454
[0.7000000000000001, 0.8]  n=  26  conf=0.758  acc=0.538  gap=0.220
[0.8, 0.9]  n=  39  conf=0.862  acc=0.564  gap=0.297
[0.9, 1.0]  n= 200  conf=0.944  acc=0.930  gap=0.014
