In [1]:
# ===========================================
# 1. SETUP AND INSTALLATION
# ===========================================
!pip install transformers datasets accelerate ray[tune] optuna -U



Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting ray[tune]
  Downloading ray-2.51.1-cp312-cp312-manylinux2014_x86_64.whl.metadata (21 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting click!=8.3.0,>=7.0 (from ray[tune])
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [2]:
import torch
import os
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    TrainingArguments,
    Trainer,
    set_seed
)
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# ===========================================
# 2. DATA PREPARATION (BALANCED SMALL SUBSET)
# ===========================================
data_path = "/content/drive/MyDrive/ITC508_data/clickbait_data.csv"
df = pd.read_csv(data_path)
print("✅ Dataset loaded successfully:", df.shape)
print(df.head())

# Columns are: 'headline' (text) and 'clickbait' (label)
df = df.rename(columns={"headline": "text", "clickbait": "label"})

# Create a smaller balanced subset for quicker runs
clickbait_df = df[df["label"] == 1].sample(n=1000, random_state=42)
non_clickbait_df = df[df["label"] == 0].sample(n=1000, random_state=42)
balanced_df = pd.concat([clickbait_df, non_clickbait_df]).sample(frac=1, random_state=42).reset_index(drop=True)

print("✅ Using balanced subset of size:", balanced_df.shape)

# Split into train and eval sets
train_df, eval_df = train_test_split(balanced_df, test_size=0.2, random_state=42)

# Convert pandas DataFrames to HuggingFace Datasets
train_data = Dataset.from_pandas(train_df)
eval_data = Dataset.from_pandas(eval_df)

# ===========================================
# 3. TOKENIZER & MODEL
# ===========================================
MODEL_NAME = "distilbert-base-multilingual-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True)

tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_eval = eval_data.map(tokenize_function, batched=True)

# Rename 'label' to 'labels' and format tensors
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_eval = tokenized_eval.rename_column("label", "labels")

tokenized_train.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])
tokenized_eval.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])

# ===========================================
# 4. MODEL INIT & METRICS
# ===========================================
set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def model_init():
    return DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds, average="binary")
    return {"accuracy": acc, "f1": f1}

# ===========================================
# 5. HYPERPARAMETER SEARCH SPACE
# ===========================================
def tune_hp(trial):
    learning_rate = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 1e-5])
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16])
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1, step=0.05)
    return {
        "learning_rate": learning_rate,
        "per_device_train_batch_size": per_device_train_batch_size,
        "weight_decay": weight_decay,
        "num_train_epochs": 3,
    }

# ===========================================
# 6. TRAINING ARGUMENTS
# ===========================================
training_args = TrainingArguments(
    output_dir="./grid_search_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=torch.cuda.is_available(),
    report_to="none",
    num_train_epochs=3,
    warmup_steps=500,
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# ===========================================
# 7. GRID SEARCH EXECUTION
# ===========================================
print("\n🚀 Starting Grid Search (Total Runs: 18) — may take ~30–45 mins on GPU")
print("Optimizing for F1 score...\n")

best_trial = trainer.hyperparameter_search(
    backend="optuna",
    hp_space=tune_hp,
    direction="maximize",
    n_trials=18,
)

print("\n✅ GRID SEARCH COMPLETE ✅")
print("\nBest Hyperparameters Found:")

if best_trial:
    print(best_trial)
    best_hps = best_trial.hyperparameters
    for k, v in best_hps.items():
        print(f"  {k}: {v}")
else:
    print("No best trial found (search failed).")

print("\n💾 Use the best hyperparameters for your final training run.")


Mounted at /content/drive
✅ Dataset loaded successfully: (32000, 2)
                                            headline  clickbait
0                                 Should I Get Bings          1
1      Which TV Female Friend Group Do You Belong In          1
2  The New "Star Wars: The Force Awakens" Trailer...          1
3  This Vine Of New York On "Celebrity Big Brothe...          1
4  A Couple Did A Stunning Photo Shoot With Their...          1
✅ Using balanced subset of size: (2000, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

  trainer = Trainer(


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-11-08 11:20:10,564] A new study created in memory with name: no-name-10e70e6e-1c37-43f3-a8df-4ea325071985



🚀 Starting Grid Search (Total Runs: 18) — may take ~30–45 mins on GPU
Optimizing for F1 score...



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.578644,0.8525,0.868597
2,No log,0.195551,0.96,0.96
3,No log,0.074882,0.9775,0.977556


[I 2025-11-08 11:21:21,955] Trial 0 finished with value: 1.9550561097256858 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.05}. Best is trial 0 with value: 1.9550561097256858.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.149202,0.9625,0.963325
2,No log,0.105636,0.9775,0.977556
3,0.197100,0.061741,0.9875,0.987277


[I 2025-11-08 11:22:59,440] Trial 1 finished with value: 1.9747773536895674 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.05}. Best is trial 1 with value: 1.9747773536895674.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.314658,0.9325,0.935561
2,No log,0.075048,0.9775,0.977556
3,No log,0.084279,0.9825,0.982544


[I 2025-11-08 11:24:30,988] Trial 2 finished with value: 1.9650436408977556 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.1}. Best is trial 1 with value: 1.9747773536895674.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.28154,0.925,0.92891
2,No log,0.070173,0.9775,0.977444
3,0.280600,0.091662,0.9825,0.982456


[I 2025-11-08 11:26:25,220] Trial 3 finished with value: 1.964956140350877 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.05}. Best is trial 1 with value: 1.9747773536895674.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.149443,0.9625,0.963325
2,No log,0.107731,0.9775,0.977556
3,0.196900,0.054461,0.99,0.989848


[I 2025-11-08 11:28:13,864] Trial 4 finished with value: 1.9798477157360406 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.0}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.31463,0.9325,0.935561
2,No log,0.075182,0.9775,0.977556
3,No log,0.085104,0.9825,0.982544


[I 2025-11-08 11:30:00,841] Trial 5 finished with value: 1.9650436408977556 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.05}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.155612,0.9625,0.963325
2,No log,0.103552,0.9775,0.977556
3,0.194800,0.060952,0.99,0.989848


[I 2025-11-08 11:32:56,167] Trial 6 finished with value: 1.9798477157360406 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.31463,0.9325,0.935561
2,No log,0.075182,0.9775,0.977556
3,No log,0.085104,0.9825,0.982544


[I 2025-11-08 11:35:26,356] Trial 7 finished with value: 1.9650436408977556 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.05}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.067252,0.985,0.984925
2,No log,0.060335,0.99,0.989848
3,0.163900,0.047287,0.99,0.989848


[I 2025-11-08 11:37:37,618] Trial 8 finished with value: 1.9798477157360406 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.28154,0.925,0.92891


[I 2025-11-08 11:37:50,931] Trial 9 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.067454,0.985,0.984925
2,No log,0.068438,0.985,0.984848
3,0.163000,0.064567,0.9875,0.987342


[I 2025-11-08 11:39:53,716] Trial 10 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.149443,0.9625,0.963325
2,No log,0.107731,0.9775,0.977556
3,0.196900,0.054461,0.99,0.989848


[I 2025-11-08 11:42:16,001] Trial 11 finished with value: 1.9798477157360406 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.0}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.155612,0.9625,0.963325
2,No log,0.103552,0.9775,0.977556
3,0.194800,0.060952,0.99,0.989848


[I 2025-11-08 11:44:43,415] Trial 12 finished with value: 1.9798477157360406 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.149443,0.9625,0.963325
2,No log,0.107731,0.9775,0.977556
3,0.196900,0.054461,0.99,0.989848


[I 2025-11-08 11:47:25,561] Trial 13 finished with value: 1.9798477157360406 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.0}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.149443,0.9625,0.963325
2,No log,0.107731,0.9775,0.977556
3,0.196900,0.054461,0.99,0.989848


[I 2025-11-08 11:49:43,728] Trial 14 finished with value: 1.9798477157360406 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.0}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.067252,0.985,0.984925
2,No log,0.060335,0.99,0.989848
3,0.163900,0.047287,0.99,0.989848


[I 2025-11-08 11:52:02,409] Trial 15 finished with value: 1.9798477157360406 and parameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.149443,0.9625,0.963325
2,No log,0.107731,0.9775,0.977556
3,0.196900,0.054461,0.99,0.989848


[I 2025-11-08 11:54:36,642] Trial 16 finished with value: 1.9798477157360406 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.0}. Best is trial 4 with value: 1.9798477157360406.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.155612,0.9625,0.963325


[W 2025-11-08 11:55:05,434] Trial 17 failed with parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.1} because of the following error: RuntimeError('[enforce fail at inline_container.cc:664] . unexpected pos 367277312 vs 367277200').
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/serialization.py", line 967, in save
    _save(
  File "/usr/local/lib/python3.12/dist-packages/torch/serialization.py", line 1268, in _save
    zip_file.write_record(name, storage, num_bytes)
RuntimeError: [enforce fail at inline_container.cc:858] . PytorchStreamWriter failed writing file data/2: file write failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/t

RuntimeError: [enforce fail at inline_container.cc:664] . unexpected pos 367277312 vs 367277200