In [1]:

# 1. SETUP AND INSTALLATION
# Run this command first in your Colab notebook:
!pip install transformers datasets accelerate ray[tune] optuna -U



In [None]:
# This script performs a highly resource-intensive Grid Search for hyperparameter
# optimization on a small subset of the SST-2 sentiment dataset.
# NOTE: Even with limited data and a simple grid, this process will involve multiple
# full training runs and may take several hours to complete on Google Colab GPU.

# 1. SETUP AND INSTALLATION
# Run this command first in your Colab notebook:
# !pip install transformers datasets accelerate ray[tune] optuna -U

import torch
import os
import numpy as np
from datasets import load_dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    TrainingArguments,
    Trainer,
    set_seed
)
from sklearn.metrics import accuracy_score, f1_score

# Set a consistent seed for reproducibility across runs
set_seed(42)

# Ensure GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# --- 2. DATA PREPARATION (LIMITED SUBSET) ---

# Load SST-2 (Stanford Sentiment Treebank)
dataset = load_dataset("sst2")

# We dramatically limit the data size to make Grid Search feasible
# (Simulating a small, labeled corpus of 1000 training & 200 evaluation samples)
train_data = dataset["train"].select(range(1000))
eval_data = dataset["validation"].select(range(200))

# Initialize Tokenizer
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding=True)

# Apply tokenization
tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_eval = eval_data.map(tokenize_function, batched=True)

# Rename 'label' to 'labels' and set format to PyTorch tensors
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_eval = tokenized_eval.rename_column("label", "labels")

tokenized_train.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])
tokenized_eval.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])


# --- 3. MODEL, METRICS, AND HYPERPARAMETER DEFINITION ---

# Function to initialize a fresh model for each grid search run
def model_init():
    # Model must be re-initialized for every run to ensure independence
    return DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

def compute_metrics(p):
    # Uses F1-Score as the primary metric for comparison
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds, average="binary")
    return {"accuracy": acc, "f1": f1}

# --- HYPERPARAMETER GRID DEFINITION ---
def tune_hp(trial):
    """
    This function defines the hyperparameter space to be explored.
    The `trial` object allows us to suggest different values.
    """
    # The grid uses the trial.suggest_categorical and trial.suggest_float methods
    # from the Optuna backend, which is highly efficient.

    # 1. Learning Rate (Critical for performance)
    learning_rate = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 1e-5])

    # 2. Batch Size (Affects VRAM and stability)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16])

    # 3. Weight Decay (Regularization against overfitting)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1, step=0.05)

    # --- EXPANSION SUPPORT: Add more parameters here if needed ---
    # Example: num_train_epochs = trial.suggest_categorical("num_train_epochs", [3, 4, 5])

    return {
        "learning_rate": learning_rate,
        "per_device_train_batch_size": per_device_train_batch_size,
        "weight_decay": weight_decay,
        "num_train_epochs": 3, # Fixed for quick search
    }


# --- 4. TRAINING ARGUMENTS (Fixed for all runs) ---
# Most arguments are fixed, only the three chosen HPs vary per run.
training_args = TrainingArguments(
    output_dir="./grid_search_results",
    # Evaluation settings (fixed)
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1", # Optimize for F1-Score
    fp16=torch.cuda.is_available(),
    report_to="none",
    # Fixed parameters
    num_train_epochs=3, # Will be overridden if specified in tune_hp
    warmup_steps=500,
)

# Initialize the Trainer
trainer = Trainer(
    model_init=model_init, # We pass the function, not the object, for fresh initialization
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)


# --- 5. EXECUTION OF GRID SEARCH ---
# We use Optuna backend for efficient searching. The 'hp_space' provides the search definition.
print("\n--- Starting Grid Search (Total Runs: 18) ---")
print("Optimizing for 'f1' score...")

best_trial = trainer.hyperparameter_search(
    # We use 'Optuna' as the backend for the hyperparameter search
    backend="optuna",
    # Pass the function that defines the search space
    hp_space=tune_hp,
    # Maximize the F1 score (higher is better)
    direction="maximize",
    # Set the total number of experiments to run (3*2*3 = 18 total combinations)
    n_trials=18,
)

print("\n--- Grid Search Complete ---")
print("\nBEST HYPERPARAMETERS FOUND:")

# Extract and print the best configuration
if best_trial:
    print(best_trial)
    best_hps = best_trial.hyperparameters
    print("\nBest Hyperparameters:")
    for key, value in best_hps.items():
        print(f"  {key}: {value}")
else:
    print("Search failed or no best trial found.")

print("\nTo run the final model, use the best_hps found in a new TrainingArguments instance.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-11-05 13:34:03,128] A new study created in memory with name: no-name-279da843-5f34-4490-aa5f-d42b36cc3092



--- Starting Grid Search (Total Runs: 18) ---
Optimizing for 'f1' score...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.691038,0.51,0.659722
2,No log,0.637711,0.545,0.685121
3,No log,0.409328,0.83,0.813187


[I 2025-11-05 13:52:03,949] Trial 0 finished with value: 1.643186813186813 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.0}. Best is trial 0 with value: 1.643186813186813.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
