In [1]:
!pip install optuna -q
!pip install transformers[torch] accelerate -q
!pip install pandas scikit-learn seaborn matplotlib -q

In [2]:
import pandas as pd
import torch
import numpy as np
import optuna
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from transformers import (
    DistilBertTokenizer, RobertaTokenizer, XLNetTokenizer,
    DistilBertForSequenceClassification, RobertaForSequenceClassification, XLNetForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
from torch.utils.data import Dataset
from optuna import Trial

MODEL_NAMES = ["distilbert", "roberta", "xlnet"]
N_OPTUNA_TRIALS = 5
N_TRAIN_EPOCHS = 3
OUTPUT_DIR = './comparative_results'

def load_and_preprocess_data():
    print("Loading and preprocessing data...")
    if not os.path.exists('spam_ham_dataset.csv'):
        raise FileNotFoundError("Error: 'spam_ham_dataset.csv' not found. Please ensure you have uploaded it to the Colab session.")

    df = pd.read_csv('spam_ham_dataset.csv')[['text', 'label']].dropna().sample(frac=1, random_state=42)
    X = df['text']
    y = df['label'].map({'ham': 0, 'spam': 1})

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Data loaded and split successfully.")
    return X_train, X_test, y_train, y_test

class SpamDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def get_model_and_tokenizer(model_name):
    model_map = {
        "distilbert": ("distilbert-base-uncased", DistilBertForSequenceClassification, DistilBertTokenizer),
        "roberta": ("roberta-base", RobertaForSequenceClassification, RobertaTokenizer),
        "xlnet": ("xlnet-base-cased", XLNetForSequenceClassification, XLNetTokenizer)
    }
    pretrained_name, model_class, tokenizer_class = model_map[model_name]

    model = model_class.from_pretrained(pretrained_name, num_labels=2)
    tokenizer = tokenizer_class.from_pretrained(pretrained_name)

    return model, tokenizer

def main():
    X_train, X_test, y_train, y_test = load_and_preprocess_data()
    all_results = []

    for model_name in MODEL_NAMES:
        print(f"\n{'='*20} Processing Model: {model_name.upper()} {'='*20}")

        _, tokenizer = get_model_and_tokenizer(model_name)

        print(f"Tokenizing data for {model_name}...")
        train_encodings = tokenizer(list(X_train), truncation=True, padding=False, max_length=128)
        test_encodings = tokenizer(list(X_test), truncation=True, padding=False, max_length=128)

        train_dataset = SpamDataset(train_encodings, y_train.tolist())
        test_dataset = SpamDataset(test_encodings, y_test.tolist())
        data_collator = DataCollatorWithPadding(tokenizer)

        print("Starting hyperparameter tuning...")

        def objective(trial: Trial):
            learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True)
            batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])

            model_for_trial, _ = get_model_and_tokenizer(model_name)

            training_args = TrainingArguments(
                output_dir=f"{OUTPUT_DIR}/{model_name}_optuna",
                num_train_epochs=1,
                per_device_train_batch_size=batch_size,
                learning_rate=learning_rate,
                logging_dir=f'./logs/{model_name}_optuna',
                eval_strategy="steps", # CORRECTED
                eval_steps=200,
                save_steps=200,
                save_strategy="steps",
                load_best_model_at_end=True,
                disable_tqdm=True,
                report_to="none"
            )

            trainer = Trainer(
                model=model_for_trial,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                data_collator=data_collator,
            )

            trainer.train()
            eval_result = trainer.evaluate()
            return eval_result['eval_loss']

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=N_OPTUNA_TRIALS)
        best_params = study.best_trial.params
        print(f"Best hyperparameters found for {model_name}: {best_params}")

        print(f"Starting final training for {model_name} with best parameters...")
        final_model, _ = get_model_and_tokenizer(model_name)

        final_training_args = TrainingArguments(
            output_dir=f"{OUTPUT_DIR}/{model_name}_final",
            num_train_epochs=N_TRAIN_EPOCHS,
            per_device_train_batch_size=best_params['batch_size'],
            learning_rate=best_params['learning_rate'],
            weight_decay=0.01,
            warmup_steps=500,
            fp16=torch.cuda.is_available(),
            logging_dir=f'./logs/{model_name}_final',
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            report_to="none"
        )

        final_trainer = Trainer(
            model=final_model,
            args=final_training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            data_collator=data_collator,
        )

        final_trainer.train()

        print(f"Evaluating final {model_name} model...")
        predictions = final_trainer.predict(test_dataset)
        y_pred = np.argmax(predictions.predictions, axis=-1)
        y_true = y_test.to_numpy()

        report = classification_report(y_true, y_pred, output_dict=True, target_names=['ham', 'spam'])
        roc_auc = roc_auc_score(y_true, y_pred)

        results = {
            "Model": model_name.capitalize(),
            "Accuracy": report['accuracy'],
            "Precision (Spam)": report['spam']['precision'],
            "Recall (Spam)": report['spam']['recall'],
            "F1-Score (Spam)": report['spam']['f1-score'],
            "ROC-AUC Score": roc_auc
        }
        all_results.append(results)

        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
        plt.title(f"Confusion Matrix - {model_name.capitalize()}")
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.tight_layout()
        plt.savefig(f"{OUTPUT_DIR}/{model_name}_confusion_matrix.png")
        plt.close()
        print(f"Confusion matrix for {model_name} saved.")

    print("\n\n" + "="*25 + " FINAL COMPARATIVE RESULTS " + "="*25)
    results_df = pd.DataFrame(all_results)
    print(results_df.to_string(index=False))

if __name__ == "__main__":
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    main()

Loading and preprocessing data...
Data loaded and split successfully.



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing data for distilbert...


[I 2025-06-28 02:25:48,483] A new study created in memory with name: no-name-20d48a16-51b8-46cf-9912-dbc499398398


Starting hyperparameter tuning...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.050686098635196686, 'eval_runtime': 3.2427, 'eval_samples_per_second': 319.182, 'eval_steps_per_second': 40.09, 'epoch': 0.7722007722007722}
{'train_runtime': 85.3982, 'train_samples_per_second': 48.432, 'train_steps_per_second': 3.033, 'train_loss': 0.1416934687198359, 'epoch': 1.0}


[I 2025-06-28 02:27:18,481] Trial 0 finished with value: 0.050686098635196686 and parameters: {'learning_rate': 1.9842435801205965e-05, 'batch_size': 16}. Best is trial 0 with value: 0.050686098635196686.


{'eval_loss': 0.050686098635196686, 'eval_runtime': 3.3006, 'eval_samples_per_second': 313.58, 'eval_steps_per_second': 39.387, 'epoch': 1.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 53.2647, 'train_samples_per_second': 77.65, 'train_steps_per_second': 2.441, 'train_loss': 0.2896953289325421, 'epoch': 1.0}


[I 2025-06-28 02:28:16,776] Trial 1 finished with value: 0.13059240579605103 and parameters: {'learning_rate': 1.0117379797328297e-05, 'batch_size': 32}. Best is trial 0 with value: 0.050686098635196686.


{'eval_loss': 0.13059240579605103, 'eval_runtime': 3.444, 'eval_samples_per_second': 300.522, 'eval_steps_per_second': 37.747, 'epoch': 1.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.15596352517604828, 'eval_runtime': 3.5137, 'eval_samples_per_second': 294.56, 'eval_steps_per_second': 36.998, 'epoch': 0.38684719535783363}
{'eval_loss': 0.030128110200166702, 'eval_runtime': 3.5393, 'eval_samples_per_second': 292.432, 'eval_steps_per_second': 36.731, 'epoch': 0.7736943907156673}
{'loss': 0.1204, 'grad_norm': 0.02742757275700569, 'learning_rate': 1.4500383946760346e-06, 'epoch': 0.9671179883945842}
{'train_runtime': 112.8518, 'train_samples_per_second': 36.65, 'train_steps_per_second': 4.581, 'train_loss': 0.11776376669826545, 'epoch': 1.0}


[I 2025-06-28 02:30:14,316] Trial 2 finished with value: 0.030128110200166702 and parameters: {'learning_rate': 4.164832500263944e-05, 'batch_size': 8}. Best is trial 2 with value: 0.030128110200166702.


{'eval_loss': 0.030128110200166702, 'eval_runtime': 3.5673, 'eval_samples_per_second': 290.134, 'eval_steps_per_second': 36.442, 'epoch': 1.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.04583505168557167, 'eval_runtime': 3.7051, 'eval_samples_per_second': 279.346, 'eval_steps_per_second': 35.087, 'epoch': 0.7722007722007722}
{'train_runtime': 77.2591, 'train_samples_per_second': 53.534, 'train_steps_per_second': 3.352, 'train_loss': 0.11990718031481887, 'epoch': 1.0}


[I 2025-06-28 02:31:36,364] Trial 3 finished with value: 0.04583505168557167 and parameters: {'learning_rate': 9.761479542536941e-05, 'batch_size': 16}. Best is trial 2 with value: 0.030128110200166702.


{'eval_loss': 0.04583505168557167, 'eval_runtime': 3.6517, 'eval_samples_per_second': 283.427, 'eval_steps_per_second': 35.6, 'epoch': 1.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 61.3771, 'train_samples_per_second': 67.387, 'train_steps_per_second': 2.118, 'train_loss': 0.174577390230619, 'epoch': 1.0}


[I 2025-06-28 02:32:42,460] Trial 4 finished with value: 0.05192308500409126 and parameters: {'learning_rate': 2.3857208165568166e-05, 'batch_size': 32}. Best is trial 2 with value: 0.030128110200166702.


{'eval_loss': 0.05192308500409126, 'eval_runtime': 3.5893, 'eval_samples_per_second': 288.357, 'eval_steps_per_second': 36.219, 'epoch': 1.0}
Best hyperparameters found for distilbert: {'learning_rate': 4.164832500263944e-05, 'batch_size': 8}
Starting final training for distilbert with best parameters...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2322,0.0648
2,0.0536,0.051102
3,0.0086,0.040603


Evaluating final distilbert model...


Confusion matrix for distilbert saved.



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizing data for roberta...


[I 2025-06-28 02:36:03,831] A new study created in memory with name: no-name-3347d91a-c0f0-4119-8e9b-8120deb48cd3


Starting hyperparameter tuning...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 76.2815, 'train_samples_per_second': 54.22, 'train_steps_per_second': 1.704, 'train_loss': 0.187915405860314, 'epoch': 1.0}


[I 2025-06-28 02:37:25,425] Trial 0 finished with value: 0.05982988700270653 and parameters: {'learning_rate': 1.630956626615957e-05, 'batch_size': 32}. Best is trial 0 with value: 0.05982988700270653.


{'eval_loss': 0.05982988700270653, 'eval_runtime': 2.5187, 'eval_samples_per_second': 410.923, 'eval_steps_per_second': 51.614, 'epoch': 1.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.12425140291452408, 'eval_runtime': 2.3427, 'eval_samples_per_second': 441.795, 'eval_steps_per_second': 55.491, 'epoch': 0.38684719535783363}
{'eval_loss': 0.08355453610420227, 'eval_runtime': 2.1233, 'eval_samples_per_second': 487.453, 'eval_steps_per_second': 61.226, 'epoch': 0.7736943907156673}
{'loss': 0.1849, 'grad_norm': 0.023859377950429916, 'learning_rate': 5.911073618156222e-07, 'epoch': 0.9671179883945842}
{'train_runtime': 257.8081, 'train_samples_per_second': 16.043, 'train_steps_per_second': 2.005, 'train_loss': 0.18262696427579544, 'epoch': 1.0}


[I 2025-06-28 02:41:46,975] Trial 1 finished with value: 0.08355453610420227 and parameters: {'learning_rate': 1.3287065480812028e-05, 'batch_size': 8}. Best is trial 0 with value: 0.05982988700270653.


{'eval_loss': 0.08355453610420227, 'eval_runtime': 2.1065, 'eval_samples_per_second': 491.331, 'eval_steps_per_second': 61.713, 'epoch': 1.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.04142795130610466, 'eval_runtime': 2.3037, 'eval_samples_per_second': 449.282, 'eval_steps_per_second': 56.432, 'epoch': 0.7722007722007722}
{'train_runtime': 115.7529, 'train_samples_per_second': 35.731, 'train_steps_per_second': 2.238, 'train_loss': 0.17309813333754373, 'epoch': 1.0}


[I 2025-06-28 02:43:46,306] Trial 2 finished with value: 0.04142795130610466 and parameters: {'learning_rate': 5.671796512060776e-05, 'batch_size': 16}. Best is trial 2 with value: 0.04142795130610466.


{'eval_loss': 0.04142795130610466, 'eval_runtime': 2.109, 'eval_samples_per_second': 490.756, 'eval_steps_per_second': 61.641, 'epoch': 1.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 78.0241, 'train_samples_per_second': 53.009, 'train_steps_per_second': 1.666, 'train_loss': 0.16533611004169171, 'epoch': 1.0}


[I 2025-06-28 02:45:07,986] Trial 3 finished with value: 0.044011227786540985 and parameters: {'learning_rate': 2.9804414511554246e-05, 'batch_size': 32}. Best is trial 2 with value: 0.04142795130610466.


{'eval_loss': 0.044011227786540985, 'eval_runtime': 2.1361, 'eval_samples_per_second': 484.525, 'eval_steps_per_second': 60.858, 'epoch': 1.0}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.06191938370466232, 'eval_runtime': 2.2295, 'eval_samples_per_second': 464.235, 'eval_steps_per_second': 58.31, 'epoch': 0.7722007722007722}
{'train_runtime': 144.421, 'train_samples_per_second': 28.638, 'train_steps_per_second': 1.793, 'train_loss': 0.1971076317275353, 'epoch': 1.0}


[I 2025-06-28 02:47:36,001] Trial 4 finished with value: 0.06191938370466232 and parameters: {'learning_rate': 1.1932687629333148e-05, 'batch_size': 16}. Best is trial 2 with value: 0.04142795130610466.


{'eval_loss': 0.06191938370466232, 'eval_runtime': 2.1405, 'eval_samples_per_second': 483.522, 'eval_steps_per_second': 60.732, 'epoch': 1.0}
Best hyperparameters found for roberta: {'learning_rate': 5.671796512060776e-05, 'batch_size': 16}
Starting final training for roberta with best parameters...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.043558
2,0.203800,0.08233
3,0.203800,0.040053


Evaluating final roberta model...


Confusion matrix for roberta saved.



config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Tokenizing data for xlnet...


[I 2025-06-28 02:52:44,041] A new study created in memory with name: no-name-32348f2d-866f-4701-992a-d087afd2879b


Starting hyperparameter tuning...


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.08151022344827652, 'eval_runtime': 4.3773, 'eval_samples_per_second': 236.445, 'eval_steps_per_second': 29.698, 'epoch': 0.7722007722007722}
{'train_runtime': 183.0851, 'train_samples_per_second': 22.591, 'train_steps_per_second': 1.415, 'train_loss': 0.2338169053714708, 'epoch': 1.0}


[I 2025-06-28 02:56:18,382] Trial 0 finished with value: 0.08151022344827652 and parameters: {'learning_rate': 6.024648284722695e-05, 'batch_size': 16}. Best is trial 0 with value: 0.08151022344827652.


{'eval_loss': 0.08151022344827652, 'eval_runtime': 4.6872, 'eval_samples_per_second': 220.816, 'eval_steps_per_second': 27.735, 'epoch': 1.0}


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.605430543422699, 'eval_runtime': 4.3647, 'eval_samples_per_second': 237.13, 'eval_steps_per_second': 29.784, 'epoch': 0.38684719535783363}
{'eval_loss': 0.606025218963623, 'eval_runtime': 4.3806, 'eval_samples_per_second': 236.27, 'eval_steps_per_second': 29.676, 'epoch': 0.7736943907156673}
{'loss': 0.6139, 'grad_norm': 3.411466360092163, 'learning_rate': 1.945130717489581e-06, 'epoch': 0.9671179883945842}
{'train_runtime': 243.1975, 'train_samples_per_second': 17.007, 'train_steps_per_second': 2.126, 'train_loss': 0.6149315013184539, 'epoch': 1.0}


[I 2025-06-28 03:00:28,220] Trial 1 finished with value: 0.605430543422699 and parameters: {'learning_rate': 4.571057186100515e-05, 'batch_size': 8}. Best is trial 0 with value: 0.08151022344827652.


{'eval_loss': 0.605430543422699, 'eval_runtime': 4.3441, 'eval_samples_per_second': 238.253, 'eval_steps_per_second': 29.926, 'epoch': 1.0}


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.06732004135847092, 'eval_runtime': 4.4193, 'eval_samples_per_second': 234.199, 'eval_steps_per_second': 29.416, 'epoch': 0.7722007722007722}
{'train_runtime': 163.6992, 'train_samples_per_second': 25.266, 'train_steps_per_second': 1.582, 'train_loss': 0.18302699497767858, 'epoch': 1.0}


[I 2025-06-28 03:03:18,601] Trial 2 finished with value: 0.06732004135847092 and parameters: {'learning_rate': 1.8599768910442542e-05, 'batch_size': 16}. Best is trial 2 with value: 0.06732004135847092.


{'eval_loss': 0.06732004135847092, 'eval_runtime': 4.5095, 'eval_samples_per_second': 229.518, 'eval_steps_per_second': 28.828, 'epoch': 1.0}


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.06185697019100189, 'eval_runtime': 4.7122, 'eval_samples_per_second': 219.643, 'eval_steps_per_second': 27.588, 'epoch': 0.7722007722007722}
{'train_runtime': 179.1167, 'train_samples_per_second': 23.091, 'train_steps_per_second': 1.446, 'train_loss': 0.22052850318231176, 'epoch': 1.0}


[I 2025-06-28 03:06:24,886] Trial 3 finished with value: 0.06185697019100189 and parameters: {'learning_rate': 4.3312058828247213e-05, 'batch_size': 16}. Best is trial 3 with value: 0.06185697019100189.


{'eval_loss': 0.06185697019100189, 'eval_runtime': 4.4763, 'eval_samples_per_second': 231.22, 'eval_steps_per_second': 29.042, 'epoch': 1.0}


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.11667026579380035, 'eval_runtime': 4.3923, 'eval_samples_per_second': 235.642, 'eval_steps_per_second': 29.598, 'epoch': 0.38684719535783363}
{'eval_loss': 0.058007605373859406, 'eval_runtime': 4.7109, 'eval_samples_per_second': 219.701, 'eval_steps_per_second': 27.595, 'epoch': 0.7736943907156673}
{'loss': 0.1937, 'grad_norm': 0.017740381881594658, 'learning_rate': 1.3192488563754215e-06, 'epoch': 0.9671179883945842}
{'train_runtime': 222.8736, 'train_samples_per_second': 18.558, 'train_steps_per_second': 2.32, 'train_loss': 0.1896294892641285, 'epoch': 1.0}


[I 2025-06-28 03:10:14,413] Trial 4 finished with value: 0.058007605373859406 and parameters: {'learning_rate': 3.1002348124822405e-05, 'batch_size': 8}. Best is trial 4 with value: 0.058007605373859406.


{'eval_loss': 0.058007605373859406, 'eval_runtime': 4.3538, 'eval_samples_per_second': 237.723, 'eval_steps_per_second': 29.859, 'epoch': 1.0}
Best hyperparameters found for xlnet: {'learning_rate': 3.1002348124822405e-05, 'batch_size': 8}
Starting final training for xlnet with best parameters...


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.288,0.138605
2,0.1094,0.086651
3,0.0203,0.110011


Evaluating final xlnet model...


Confusion matrix for xlnet saved.


     Model  Accuracy  Precision (Spam)  Recall (Spam)  F1-Score (Spam)  ROC-AUC Score
Distilbert  0.991304          0.986799       0.983553         0.985173       0.989040
   Roberta  0.989372          0.983498       0.980263         0.981878       0.986712
     Xlnet  0.986473          0.993197       0.960526         0.976589       0.978895


In [3]:
!zip -r /content/results.zip /content/comparative_results

  adding: content/comparative_results/ (stored 0%)
  adding: content/comparative_results/xlnet_optuna/ (stored 0%)
  adding: content/comparative_results/xlnet_optuna/checkpoint-259/ (stored 0%)
  adding: content/comparative_results/xlnet_optuna/checkpoint-259/tokenizer_config.json (deflated 81%)
  adding: content/comparative_results/xlnet_optuna/checkpoint-259/config.json (deflated 52%)
  adding: content/comparative_results/xlnet_optuna/checkpoint-259/training_args.bin (deflated 52%)
  adding: content/comparative_results/xlnet_optuna/checkpoint-259/special_tokens_map.json (deflated 52%)
  adding: content/comparative_results/xlnet_optuna/checkpoint-259/scaler.pt (deflated 60%)
  adding: content/comparative_results/xlnet_optuna/checkpoint-259/spiece.model (deflated 49%)
  adding: content/comparative_results/xlnet_optuna/checkpoint-259/scheduler.pt (deflated 55%)
  adding: content/comparative_results/xlnet_optuna/checkpoint-259/optimizer.pt (deflated 21%)
  adding: content/comparative_res