In [1]:
!pip install -U transformers datasets accelerate optuna

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
I

In [4]:
import inspect
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from google.colab import drive
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    EarlyStoppingCallback,  # used for early stopping in the grid search
    Trainer,
    TrainingArguments,
    set_seed,
)

# --- Google Drive ---
# The dataset path below points into Drive, so mount it before anything reads from it.
drive.mount("/content/drive")

# --- Global Configuration ---
MODEL_NAME = "distilbert-base-multilingual-cased"
data_path = "/content/drive/MyDrive/ITC508_data/clickbait_data.csv"
output_dir = "./grid_search_results"
set_seed(42)  # fixed seed so repeated runs are comparable

# --- Device selection ---
# Query CUDA once and reuse the answer for both the device choice and the GPU banner.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if has_cuda else torch.device("cpu")
print("Using device:", device)
if has_cuda:
    print("GPU:", torch.cuda.get_device_name(0))

# --- Load Dataset & Split ---
# Read the CSV and rename its columns to the names used in the rest of the notebook.
df = pd.read_csv(data_path).rename(columns={"headline": "text", "clickbait": "label"})
print(f"Original Data Distribution:\n{df['label'].value_counts()}")

# Two-stage stratified split: hold out 20% as the test set, then take 20% of the
# remaining 80% as validation -> 64% train / 16% validation / 20% test.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df["label"],
    random_state=42,
)

# Wrap each pandas split as a datasets.Dataset (the test split is kept for the final evaluation).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# --- Tokenization ---
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)


def tokenize(examples):
    """Tokenize a batch of examples for the model.

    Args:
        examples: Mapping with a ``"text"`` entry holding the headlines to encode.

    Returns:
        The tokenizer output for ``examples["text"]``, truncated and padded to
        exactly 128 tokens per example.
    """
    headlines = examples["text"]
    return tokenizer(
        headlines,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

def _to_model_inputs(dataset):
    """Tokenize one split and expose it as torch tensors.

    Drops the raw ``text`` column and the pandas index column, renames ``label`` to
    ``labels``, and restricts the torch-formatted output to the three listed columns.
    """
    dataset = dataset.map(tokenize, batched=True, remove_columns=["__index_level_0__", "text"])
    dataset = dataset.rename_column("label", "labels")
    # set_format works in place, so the dataset itself is returned afterwards.
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset


# Same preprocessing for every split, applied in train -> validation -> test order.
train_dataset = _to_model_inputs(train_dataset)
val_dataset = _to_model_inputs(val_dataset)
test_dataset = _to_model_inputs(test_dataset)

# --- Metrics ---
def compute_metrics(p):
    """Score a set of predictions against their reference labels.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the predicted class is
            the argmax over axis 1) and ``label_ids`` (the reference labels).
            Presumably the prediction object the Trainer passes to its metrics hook.

    Returns:
        dict with ``accuracy``, ``f1``, ``precision`` and ``recall``, each computed by the
        corresponding scikit-learn function with its default settings.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    scorers = {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers.items()}

# --- Model Initialization ---
def model_init():
    """Build a fresh DistilBERT sequence classifier with two output labels.

    Intended to be passed as ``Trainer(model_init=...)`` so that every training run
    starts again from the pretrained ``MODEL_NAME`` weights instead of continuing
    from a previously fine-tuned model.

    Returns:
        A ``DistilBertForSequenceClassification`` loaded from ``MODEL_NAME``.
    """
    # No explicit .to(device) here: the model is created where from_pretrained puts it
    # and the Trainer moves it to its own training device, so forcing a device in this
    # factory is redundant and contradicted the original comment.
    return DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# --- Training Arguments (Base Configuration) ---
# Settings shared by every grid-search run.
# "output_dir" is deliberately NOT part of this dict: the grid-search loop passes a per-run
# output_dir keyword to TrainingArguments, and having the key here as well makes that call
# fail with "got multiple values for keyword argument 'output_dir'".
base_kwargs = {
    "save_strategy": "epoch",
    # Keep only the best and the most recent checkpoint of each run; without a limit every
    # epoch of every run leaves a full checkpoint on disk.
    "save_total_limit": 2,
    "load_best_model_at_end": True,  # required by EarlyStoppingCallback
    "metric_for_best_model": "f1",  # monitor F1; accuracy is saturated on this dataset
    "greater_is_better": True,
    "fp16": torch.cuda.is_available(),  # mixed precision only when a GPU is present
    "report_to": "none",
    "num_train_epochs": 10,  # upper bound; early stopping usually ends a run sooner
}

# The per-epoch evaluation argument is called `eval_strategy` in newer transformers releases
# and `evaluation_strategy` in older ones. Prefer the new name so that versions which still
# accept both do not take the deprecated path.
if hasattr(TrainingArguments, "eval_strategy"):
    base_kwargs["eval_strategy"] = "epoch"
else:
    base_kwargs["evaluation_strategy"] = "epoch"

# --- Hyperparameter Grid (Refined for Regularization) ---
# Focus on lower learning rates and higher weight decay to combat overfitting
search_space = [
    {"learning_rate": 5e-5, "batch_size": 8, "weight_decay": 0.0},
    {"learning_rate": 3e-5, "batch_size": 16, "weight_decay": 0.05},
    {"learning_rate": 1e-5, "batch_size": 8, "weight_decay": 0.05},
    {"learning_rate": 1e-5, "batch_size": 16, "weight_decay": 0.1},  # strongest regularization combo
]

# Trackers updated by the grid-search loop.
best_score = 0          # best validation F1 seen so far
best_params = None      # grid entry that produced best_score
best_model_path = None  # directory holding the saved best model
run_index = 0           # 1-based index of the current run

# --- Grid Search Loop ---
# Trainer's `tokenizer` argument was renamed to `processing_class` in newer transformers
# releases; use whichever keyword the installed version actually accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

for run_index, params in enumerate(search_space, start=1):
    print("\n========================================================")
    print(f"🔍 Run {run_index}/{len(search_space)}: Training with: {params}")
    print("========================================================")

    # Merge the shared settings with this run's hyperparameters. Building one dict (instead
    # of mixing **base_kwargs with explicit keywords) lets the per-run output_dir override
    # any base value rather than raising "got multiple values for keyword argument".
    run_kwargs = {
        **base_kwargs,
        "per_device_train_batch_size": params["batch_size"],
        "learning_rate": params["learning_rate"],
        "weight_decay": params["weight_decay"],
        # Checkpoints of each run go to their own folder
        "output_dir": f"{output_dir}/run_{run_index}",
    }
    training_args = TrainingArguments(**run_kwargs)

    trainer = Trainer(
        model_init=model_init,  # fresh pretrained model for every run
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        # Stop a run once the monitored metric has failed to improve for 2 evaluations
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        **{_tokenizer_kwarg: tokenizer},
    )

    trainer.train()
    # With load_best_model_at_end=True the trainer now holds the best checkpoint of this
    # run, so these are the validation metrics of that checkpoint.
    metrics = trainer.evaluate()
    print(metrics)

    # Keep the best run so far. The `best_params is None` guard ensures the first run is
    # always recorded, even if its F1 does not exceed the initial best_score.
    if best_params is None or metrics["eval_f1"] > best_score:
        best_score = metrics["eval_f1"]
        best_params = params
        # Persist the model currently loaded in the trainer as the overall best
        best_model_path = f"{output_dir}/best_model_final"
        trainer.save_model(best_model_path)

# --- Grid Search Summary ---
summary_rule = "=" * 50
print("\n\n" + summary_rule)
print("🏆 BEST HYPERPARAMETERS FOUND:")
print(best_params)
print(f"Best Validation F1: {best_score:.4f}")
print(summary_rule)

# --- Final Evaluation on Test Set ---
# Only runs when the grid search actually saved a model.
if best_model_path:
    print("\n📝 Evaluating Best Model on Held-out Test Set...")

    # Reload the saved best model from disk and place it on the selected device
    final_model = DistilBertForSequenceClassification.from_pretrained(best_model_path)
    final_model = final_model.to(device)

    # Evaluation-only trainer with minimal arguments
    eval_args = TrainingArguments(output_dir="./test_eval", report_to="none")
    final_trainer = Trainer(
        model=final_model,
        args=eval_args,
        compute_metrics=compute_metrics,
    )

    test_metrics = final_trainer.evaluate(test_dataset)

    results_rule = "*" * 50
    print("\n\n" + results_rule)
    print("⭐ FINAL TEST SET RESULTS ⭐")
    print(f"Best Params: {best_params}")
    # (display label, key in the metrics dict returned by evaluate)
    reported = (
        ("Accuracy", "eval_accuracy"),
        ("F1 Score", "eval_f1"),
        ("Precision", "eval_precision"),
        ("Recall", "eval_recall"),
    )
    for label, key in reported:
        print(f"{label}: {test_metrics[key]:.4f}")
    print(results_rule)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
GPU: Tesla T4
Original Data Distribution:
label
0    16001
1    15999
Name: count, dtype: int64


Map:   0%|          | 0/20480 [00:00<?, ? examples/s]

Map:   0%|          | 0/5120 [00:00<?, ? examples/s]

Map:   0%|          | 0/6400 [00:00<?, ? examples/s]


🔍 Run 1/4: Training with: {'learning_rate': 5e-05, 'batch_size': 8, 'weight_decay': 0.0}


  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.04,0.039208,0.993164,0.993187,0.989911,0.996484
2,0.0211,0.042953,0.993359,0.993365,0.99259,0.994141
3,0.0134,0.034579,0.994336,0.994319,0.99725,0.991406
4,0.0081,0.040918,0.995313,0.995322,0.993385,0.997266
5,0.0039,0.037855,0.995508,0.995512,0.994542,0.996484
6,0.0059,0.03596,0.994336,0.994341,0.993372,0.995313
7,0.0018,0.048419,0.994727,0.994717,0.996472,0.992969


{'eval_loss': 0.03785516694188118, 'eval_accuracy': 0.9955078125, 'eval_f1': 0.9955121951219512, 'eval_precision': 0.9945419103313841, 'eval_recall': 0.996484375, 'eval_runtime': 6.4177, 'eval_samples_per_second': 797.794, 'eval_steps_per_second': 99.724, 'epoch': 7.0}

🔍 Run 2/4: Training with: {'learning_rate': 3e-05, 'batch_size': 16, 'weight_decay': 0.05}


  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.028,0.015631,0.995703,0.995712,0.993774,0.997656
2,0.0099,0.017002,0.99707,0.997071,0.996876,0.997266
3,0.0068,0.029113,0.995508,0.995496,0.998037,0.992969
4,0.0043,0.026627,0.995898,0.995896,0.99648,0.995313


{'eval_loss': 0.017002161592245102, 'eval_accuracy': 0.9970703125, 'eval_f1': 0.997070884592853, 'eval_precision': 0.996876220226474, 'eval_recall': 0.997265625, 'eval_runtime': 6.3958, 'eval_samples_per_second': 800.522, 'eval_steps_per_second': 100.065, 'epoch': 4.0}

🔍 Run 3/4: Training with: {'learning_rate': 1e-05, 'batch_size': 8, 'weight_decay': 0.05}


  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0333,0.047743,0.99082,0.990879,0.984574,0.997266
2,0.011,0.022221,0.995313,0.995301,0.997645,0.992969
3,0.0059,0.04503,0.994141,0.994154,0.991835,0.996484
4,0.0081,0.03333,0.995313,0.995296,0.99882,0.991797


{'eval_loss': 0.022220568731427193, 'eval_accuracy': 0.9953125, 'eval_f1': 0.995301487862177, 'eval_precision': 0.9976452119309263, 'eval_recall': 0.99296875, 'eval_runtime': 6.1557, 'eval_samples_per_second': 831.744, 'eval_steps_per_second': 103.968, 'epoch': 4.0}

🔍 Run 4/4: Training with: {'learning_rate': 1e-05, 'batch_size': 16, 'weight_decay': 0.1}


  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0338,0.022945,0.994141,0.994147,0.992985,0.995313
2,0.0186,0.026893,0.994141,0.994161,0.99069,0.997656
3,0.0038,0.049853,0.992773,0.992727,0.999209,0.986328
4,0.0023,0.037623,0.994531,0.994553,0.990698,0.998437
5,0.0026,0.033258,0.994922,0.994924,0.994536,0.995313
6,0.0008,0.02801,0.996289,0.996294,0.994936,0.997656
7,0.0022,0.038234,0.995703,0.995713,0.99339,0.998047
8,0.0,0.039961,0.995313,0.995323,0.993002,0.997656


{'eval_loss': 0.028010064736008644, 'eval_accuracy': 0.9962890625, 'eval_f1': 0.9962941291203433, 'eval_precision': 0.9949357226334242, 'eval_recall': 0.99765625, 'eval_runtime': 6.306, 'eval_samples_per_second': 811.929, 'eval_steps_per_second': 101.491, 'epoch': 8.0}


🏆 BEST HYPERPARAMETERS FOUND:
{'learning_rate': 3e-05, 'batch_size': 16, 'weight_decay': 0.05}
Best Validation F1: 0.9971

📝 Evaluating Best Model on Held-out Test Set...




**************************************************
⭐ FINAL TEST SET RESULTS ⭐
Best Params: {'learning_rate': 3e-05, 'batch_size': 16, 'weight_decay': 0.05}
Accuracy: 0.9934
F1 Score: 0.9935
Precision: 0.9907
Recall: 0.9962
**************************************************
