# Notebook 3: Model Training with HPO (GPU)
Purpose:
1. Load pre-processed original and augmented AG News data.
2. Define functions for HPO (model_init, hp_space).
3. Use Trainer.hyperparameter_search with Optuna to find best params (tuning LR, decay, warmup, etc.).
4. Train the final model using the best hyperparameters.
5. Evaluate, save, visualize, and generate submission.

In [33]:
# --- Essential Setup ---
import os
import time
import pickle
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import traceback
import random
import shutil
import gc

print("Setting up environment and installing Optuna...")
!rm -rf /kaggle/working/*
!pip install -q optuna
import optuna
print("Optuna installed.")

# --- Cache Directory Setup ---
cache_dir = "/kaggle/working/hf_datasets_cache"
os.environ['HF_DATASETS_CACHE'] = cache_dir
os.environ['DATASETS_CACHE'] = cache_dir
os.makedirs(cache_dir, exist_ok=True)
print(f"INFO: Hugging Face datasets cache directory set to: {os.environ.get('HF_DATASETS_CACHE')}")

# --- Imports ---
from datasets import load_dataset, Dataset, ClassLabel, load_from_disk, concatenate_datasets
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    TrainerCallback,
    SchedulerType
)
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.manifold import TSNE

Setting up environment and installing Optuna...
Optuna installed.
INFO: Hugging Face datasets cache directory set to: /kaggle/working/hf_datasets_cache


In [34]:
# --- Configuration ---
base_model_name = 'roberta-base'
dataset_name = 'ag_news'
test_split_name = 'test'

# Paths to pre-processed data from previous steps
cleaned_original_load_path = "/kaggle/input/cleanedorig"
tokenized_augmented_load_path = "/kaggle/input/cleanedaugmenteddata"

# Output directories
hpo_output_dir = "/kaggle/working/results_hpo" 
final_output_dir = "/kaggle/working/results_final"
final_model_save_path = "/kaggle/working/agnews_best_model_final"

LORA_R = 8 # IMPORTANT: Set your fixed rank based on param limit
LORA_TARGET_MODULES = ['query', 'value'] # Or ['query', 'key', 'value'] if you prefer

# Tokenizer settings
TOKENIZER_MAX_LENGTH = 512

# HPO settings
N_HPO_TRIALS = 20 # Number of HPO trials to run (adjust based on time/resources)

In [35]:
# --- GPU Check ---
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU is available. Using device: {device}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("WARNING: GPU not available, using CPU. HPO and Training will be very slow.")

GPU is available. Using device: cuda
GPU Name: Tesla P100-PCIE-16GB


In [36]:
# Label Info (Define explicitly)
num_labels = 4
id2label = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}
label2id = {'World': 0, 'Sports': 1, 'Business': 2, 'Sci/Tech': 3}
class_names = list(id2label.values())

In [37]:
# --- Load Tokenizer ---
print(f"Loading tokenizer: {base_model_name}")
try:
    tokenizer = RobertaTokenizer.from_pretrained(base_model_name)
except Exception as e:
    print(f"ERROR: Failed to load tokenizer: {e}")
    raise e

Loading tokenizer: roberta-base


In [38]:
# Load CLEANED ORIGINAL data
print(f"INFO: Attempting to load CLEANED ORIGINAL data from: {cleaned_original_load_path}")
if os.path.exists(cleaned_original_load_path):
    try:
        loaded_cleaned_original_ds = load_from_disk(cleaned_original_load_path)
        print(f"INFO: Successfully loaded cleaned original data ({len(loaded_cleaned_original_ds)} examples).")
        print(f"INFO: Features: {loaded_cleaned_original_ds.features}")
        # Get the label feature info from the loaded dataset
        if 'labels' in loaded_cleaned_original_ds.features and isinstance(loaded_cleaned_original_ds.features['labels'], ClassLabel):
             original_labels_feature = loaded_cleaned_original_ds.features['labels']
             print(f"INFO: Extracted ClassLabel features: {original_labels_feature}")
        else:
             raise ValueError("Loaded cleaned original dataset missing valid ClassLabel 'labels' feature.")
    except Exception as e:
        print(f"ERROR loading cleaned original dataset from disk: {e}.")
        raise RuntimeError("Failed to load cleaned original dataset")
else:
    raise FileNotFoundError(f"Cleaned original dataset not found at {cleaned_original_load_path}")

# Load TOKENIZED AUGMENTED (from cleaned) data
print(f"INFO: Attempting to load TOKENIZED AUGMENTED data from: {tokenized_augmented_load_path}")
if os.path.exists(tokenized_augmented_load_path):
    try:
        loaded_tokenized_augmented_ds = load_from_disk(tokenized_augmented_load_path)
        print(f"INFO: Successfully loaded tokenized augmented data ({len(loaded_tokenized_augmented_ds)} examples).")
        print(f"INFO: Features: {loaded_tokenized_augmented_ds.features}")
        # Verify/cast labels just in case save/load altered type
        if loaded_tokenized_augmented_ds.features['labels'] != original_labels_feature:
             print("WARNING: Loaded augmented dataset labels feature mismatch! Casting...")
             loaded_tokenized_augmented_ds = loaded_tokenized_augmented_ds.cast_column('labels', original_labels_feature)
    except Exception as e:
        print(f"ERROR loading tokenized augmented dataset from disk: {e}.")
        raise RuntimeError("Failed to load tokenized augmented dataset")
else:
    raise FileNotFoundError(f"Tokenized augmented dataset not found at {tokenized_augmented_load_path}")

INFO: Attempting to load CLEANED ORIGINAL data from: /kaggle/input/cleanedorig
INFO: Successfully loaded cleaned original data (114832 examples).
INFO: Features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}
INFO: Extracted ClassLabel features: ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)
INFO: Attempting to load TOKENIZED AUGMENTED data from: /kaggle/input/cleanedaugmenteddata
INFO: Successfully loaded tokenized augmented data (114832 examples).
INFO: Features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}


In [39]:
print("INFO: Loading and preprocessing original TEST dataset...")
try:
    original_test_dataset = load_dataset(dataset_name, split=test_split_name)
    def preprocess_original_test(examples):
        return tokenizer(examples['text'], truncation=True, padding=False, max_length=TOKENIZER_MAX_LENGTH)
    num_cpus = os.cpu_count()
    num_proc_initial = max(1, num_cpus - 2) if num_cpus > 2 else 1
    tokenized_test_dataset = original_test_dataset.map(
        preprocess_original_test, batched=True, num_proc=num_proc_initial, remove_columns=['text']
    )
    tokenized_test_dataset = tokenized_test_dataset.rename_column("label", "labels")
    # Cast test labels to match train labels feature
    tokenized_test_dataset = tokenized_test_dataset.cast_column('labels', original_labels_feature)
    eval_dataset = tokenized_test_dataset
    print(f"INFO: Original test dataset processed ({len(eval_dataset)} examples).")
except Exception as e:
    print(f"ERROR: Failed to load/process original test dataset: {e}")
    raise e

INFO: Loading and preprocessing original TEST dataset...


Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/7600 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7600 [00:00<?, ? examples/s]

INFO: Original test dataset processed (7600 examples).


In [40]:
import gc
# --- Combine Datasets ---
print("INFO: Combining CLEANED original and CLEANED augmented datasets...")
required_columns = ['input_ids', 'attention_mask', 'labels']
try:
    train_dataset_for_concat = loaded_cleaned_original_ds.select_columns(required_columns)
    tokenized_augmented_dataset_for_concat = loaded_tokenized_augmented_ds.select_columns(required_columns)
except ValueError as e:
    print(f"ERROR: Column selection error: {e}. Check loaded dataset columns.")
    print(f"Cleaned Original columns: {loaded_cleaned_original_ds.column_names}")
    print(f"Cleaned Augmented columns: {loaded_tokenized_augmented_ds.column_names}")
    raise e

combined_train_dataset_unshuffled = concatenate_datasets([train_dataset_for_concat, tokenized_augmented_dataset_for_concat])
print(f"Combined dataset created (unshuffled) with {len(combined_train_dataset_unshuffled)} examples.")

# --- Save temporarily to /kaggle/working ---
# Define a temporary path in the writable directory
temp_save_path = "/kaggle/working/combined_dataset_temp_unshuffled"
print(f"INFO: Saving unshuffled combined dataset temporarily to {temp_save_path}...")
try:
    # Ensure the directory exists if saving nested datasets (though save_to_disk handles top level)
    # os.makedirs(os.path.dirname(temp_save_path), exist_ok=True) # Usually not needed for top-level save
    combined_train_dataset_unshuffled.save_to_disk(temp_save_path)
    # Delete the in-memory object to free up RAM before loading it back
    del combined_train_dataset_unshuffled
    del train_dataset_for_concat # Also delete intermediate objects
    del tokenized_augmented_dataset_for_concat
    gc.collect() # Run garbage collection
    if torch.cuda.is_available(): torch.cuda.empty_cache() # Clear GPU cache if needed
    print("INFO: Temporary save complete, memory cleared.")
except Exception as e:
    print(f"ERROR saving temporary combined dataset: {e}")
    traceback.print_exc()
    raise e

# --- Load back from /kaggle/working ---
# Now the dataset object's path is associated with a writable location
print(f"INFO: Loading combined dataset from writable path: {temp_save_path}")
try:
    combined_train_dataset_reloaded = load_from_disk(temp_save_path)
    print("INFO: Reloaded successfully.")
except Exception as e:
    print(f"ERROR loading temporary combined dataset: {e}")
    raise e

# --- Shuffle the reloaded dataset ---
print("INFO: Shuffling the reloaded dataset...")
try:
    # Shuffle should now use /kaggle/working for any temporary files it needs
    combined_train_dataset = combined_train_dataset_reloaded.shuffle(seed=42)
    print(f"Combined training dataset ready with {len(combined_train_dataset)} examples.")
    print(f"Columns: {combined_train_dataset.column_names}")
except Exception as e:
    print(f"ERROR during shuffle after reload: {e}")
    traceback.print_exc()
    raise e

# --- Clean up ---
del combined_train_dataset_reloaded # Delete the reloaded unshuffled version
# Optional: remove the temporary directory to save space on /kaggle/working
import shutil
try:
    print(f"INFO: Removing temporary save directory: {temp_save_path}")
    shutil.rmtree(temp_save_path)
except Exception as e:
    print(f"WARNING: Could not remove temporary directory {temp_save_path}: {e}")

gc.collect()
if torch.cuda.is_available(): torch.cuda.empty_cache()

# --- Define eval_dataset ---
eval_dataset = tokenized_test_dataset # Make sure this is defined before Trainer

INFO: Combining CLEANED original and CLEANED augmented datasets...
Combined dataset created (unshuffled) with 229664 examples.
INFO: Saving unshuffled combined dataset temporarily to /kaggle/working/combined_dataset_temp_unshuffled...


Saving the dataset (0/1 shards):   0%|          | 0/229664 [00:00<?, ? examples/s]

INFO: Temporary save complete, memory cleared.
INFO: Loading combined dataset from writable path: /kaggle/working/combined_dataset_temp_unshuffled
INFO: Reloaded successfully.
INFO: Shuffling the reloaded dataset...
Combined training dataset ready with 229664 examples.
Columns: ['input_ids', 'attention_mask', 'labels']
INFO: Removing temporary save directory: /kaggle/working/combined_dataset_temp_unshuffled


In [41]:
# --- Adversarial Trainer Implementation ---
from transformers import Trainer
import torch
import torch.nn.functional as F
import traceback

# Define adversarial parameter (fixed for now, can be tuned via hp_space later)
# Tune this epsilon value based on experiments (start small, e.g., 0.5, 1.0)
ADVERSARIAL_EPSILON = 1.0

class AdversarialTrainer(Trainer):
    """
    Custom Trainer that incorporates FGM adversarial training.
    """
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Override compute_loss to add Fast Gradient Method (FGM) adversarial training.
        """
        # Store labels separately and remove from inputs if present
        # to avoid issues when model calculates loss internally
        labels = None
        if "labels" in inputs:
            labels = inputs.pop("labels").clone() # Ensure labels are removed for **inputs

        # --- Original Forward Pass ---
        outputs = model(**inputs) # Pass inputs without labels initially
        # Calculate original loss using model's output and stored labels
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss() # Assuming CrossEntropyLoss for classification
        original_loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        # Default to original loss if not training or if adversarial fails
        loss_to_return = original_loss
        adversarial_loss = None

        # --- Adversarial Perturbation (Only during Training) ---
        if self.is_in_train: # Use self.is_in_train provided by Trainer
            try:
                # 1. Get Embeddings
                input_ids = inputs.get("input_ids")
                if input_ids is None:
                    # If input_ids aren't passed, try getting them from the model maybe? Unlikely.
                    # Or, if inputs_embeds were passed directly, use those.
                    if inputs.get("inputs_embeds") is not None:
                         original_embeddings = inputs.get("inputs_embeds")
                    else:
                         raise ValueError("Cannot perform adversarial training without input_ids or inputs_embeds")
                else:
                     embedding_layer = model.get_input_embeddings()
                     original_embeddings = embedding_layer(input_ids)

                # 2. Calculate Gradients w.r.t. Embeddings
                # Create a detached copy that requires gradients
                embeds_for_grad = original_embeddings.detach().clone()
                embeds_for_grad.requires_grad_(True)

                # Prepare inputs for gradient calculation pass
                grad_inputs = inputs.copy() # Use original inputs dict structure
                grad_inputs["inputs_embeds"] = embeds_for_grad
                grad_inputs["input_ids"] = None # Ensure input_ids is None if embeds are passed

                # Forward pass ONLY to get gradients w.r.t. embeddings
                # We use the already computed logits from the original pass if possible,
                # or recompute if necessary. Recomputing is safer.
                with torch.enable_grad(): # Ensure gradients are computed
                    # Need model outputs corresponding to embeds_for_grad
                    temp_outputs = model(**grad_inputs)
                    temp_logits = temp_outputs.logits
                    temp_loss = loss_fct(temp_logits.view(-1, self.model.config.num_labels), labels.view(-1))

                # Calculate gradients w.r.t. embeds_for_grad
                # Using torch.autograd.grad is generally cleaner if it works
                embed_grads = torch.autograd.grad(temp_loss, embeds_for_grad, retain_graph=False, create_graph=False)[0]

                if embed_grads is None:
                    raise RuntimeError("Gradient w.r.t embeddings is None.")

                # 3. Calculate FGM Perturbation (L2 norm version)
                norm = torch.norm(embed_grads, p=2, dim=-1, keepdim=True) + 1e-8
                delta = embed_grads / norm
                perturbation = ADVERSARIAL_EPSILON * delta

                # 4. Apply Perturbation
                perturbed_embeddings = original_embeddings.detach() + perturbation

                # 5. Adversarial Forward Pass
                adv_inputs = inputs.copy()
                adv_inputs["inputs_embeds"] = perturbed_embeddings
                adv_inputs["input_ids"] = None

                adv_outputs = model(**adv_inputs)
                adv_logits = adv_outputs.logits
                adversarial_loss = loss_fct(adv_logits.view(-1, self.model.config.num_labels), labels.view(-1))

                # 6. Combine Losses (Summing is common)
                loss_to_return = original_loss + adversarial_loss
                # Optional debug print:
                # if self.state.global_step % 50 == 0: # Print periodically
                #    print(f"Step: {self.state.global_step}, Orig Loss: {original_loss.item():.4f}, Adv Loss: {adversarial_loss.item():.4f}, Total: {loss_to_return.item():.4f}")

            except Exception as e:
                print(f"WARNING: Adversarial step failed at step {self.state.global_step if hasattr(self, 'state') else 'N/A'}: {e}. Using original loss.")
                # traceback.print_exc() # Uncomment for detailed debugging
                loss_to_return = original_loss # Fall back to original loss

        # Add labels back for Trainer's potential use (e.g., in evaluation)
        if labels is not None:
            inputs["labels"] = labels

        # Return loss, and potentially outputs if requested by the Trainer's internal logic
        # The Trainer calls .backward() on the returned loss value
        return (loss_to_return, outputs) if return_outputs else loss_to_return

# --- End Adversarial Trainer Class ---

In [42]:
# --- Define Model Initialization Function (Simpler for HPO Search) ---
# Using fixed LORA_R and initial LORA_ALPHA/LORA_DROPOUT during search
INITIAL_LORA_ALPHA = LORA_R * 2 # Example: Start with alpha = 2*r during search
INITIAL_LORA_DROPOUT = 0.1     # Example: Start with 0.1 dropout during search

def model_init_for_hpo(trial=None): # trial argument often not needed here by default
    print(f"HPO Trial: Initializing base model + fixed r + initial alpha/dropout")
    model = RobertaForSequenceClassification.from_pretrained(
        base_model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=LORA_R,                   # Your FIXED rank
        lora_alpha=INITIAL_LORA_ALPHA, # Use initial fixed value for search
        target_modules=LORA_TARGET_MODULES,
        lora_dropout=INITIAL_LORA_DROPOUT # Use initial fixed value for search
    )
    peft_model = get_peft_model(model, lora_config)
    return peft_model

In [43]:
# --- Define Hyperparameter Search Space for Optuna ---
def hp_space(trial: optuna.trial.Trial) -> dict:
    return {
        # TrainingArguments parameters
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 8e-5, log=True),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs", [1]), # More epochs ok here
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.2),
        "label_smoothing_factor": trial.suggest_float("label_smoothing_factor", 0.0, 0.2, step=0.05),
        "gradient_accumulation_steps": trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4]),

        # Model configuration parameters (to be used AFTER search)
        "lora_alpha_multiplier": trial.suggest_categorical("lora_alpha_multiplier", [1, 2, 4]), # Suggest multiplier for fixed R
        "lora_dropout": trial.suggest_float("lora_dropout", 0.0, 0.2, step=0.05),
    }

In [44]:
# --- Define Initial Training Arguments (Some will be overridden by HPO) ---
print("Defining Initial Training Arguments for HPO...")
# Note: output_dir here is temporary for HPO runs
# num_train_epochs might be overridden by hp_space per trial
training_args_for_hpo = TrainingArguments(
    output_dir=hpo_output_dir,
    eval_strategy="steps",
    eval_steps=500,         # Evaluate relatively frequently during HPO
    logging_steps=250,      # Log fairly often
    save_steps=10000,       # Don't need to save checkpoints during HPO search itself
    save_total_limit=1,     # Only keep one checkpoint (if save_steps is reached)

    # These will likely be overridden by hp_space:
    learning_rate=5e-5,
    num_train_epochs=1,     # Low default for HPO trials
    per_device_train_batch_size=16, # Keep this fixed based on memory
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2, # Will be tuned by hp_space
    weight_decay=0.1,
    warmup_ratio=0.06,
    label_smoothing_factor=0.1,
    # lora_alpha / lora_dropout are model config, set via model_init

    # These control the HPO process evaluation:
    load_best_model_at_end=False, # Don't load best during HPO search phase
    metric_for_best_model="accuracy", # Metric to optimize
    greater_is_better=True,

    fp16=torch.cuda.is_available(), # Use FP16 if GPU available

    dataloader_num_workers=2,
    report_to=[], # Disable default reporting like WandB during HPO
    logging_dir=f"{hpo_output_dir}/hpo_logs", # Keep logs separate
)

Defining Initial Training Arguments for HPO...


In [45]:
# --- Define Metrics, Collator, Callbacks ---
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": acc, "f1": f1}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Callbacks
# MetricsCollectorCallback still uses logging internally to store logs,
# but we don't rely on it for immediate cell output.
class MetricsCollectorCallback(TrainerCallback):
    def __init__(self):
        self.logs = []
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            self.logs.append((state.global_step, logs))
            # print(f"Internal Log Collected: Step {state.global_step} - {logs}") # Optional: noisy
        return control

# TrainingMetricsCallback modified to use print
class TrainingMetricsCallback(TrainerCallback):
     def on_train_batch_end(self, args, state, control, model=None, **kwargs):
         if state.global_step % 50 == 0 and "model_outputs" in kwargs and "inputs" in kwargs: # Check periodically
             try:
                 outputs = kwargs["model_outputs"]
                 inputs = kwargs["inputs"]
                 if hasattr(outputs, "logits"):
                     logits = outputs.logits.detach().cpu().numpy()
                 elif isinstance(outputs, dict) and "logits" in outputs:
                     logits = outputs["logits"].detach().cpu().numpy()
                 else: return control
                 if "labels" not in inputs: return control

                 labels = inputs["labels"].detach().cpu().numpy()
                 preds = np.argmax(logits, axis=-1)
                 acc = accuracy_score(labels, preds)
                 f1 = f1_score(labels, preds, average="weighted", zero_division=0)
                 # Use print instead of logging
                 print(f"Train Step {state.global_step} - Batch Metrics: Acc={acc:.4f}, F1={f1:.4f}")
             except Exception as e:
                 print(f"WARNING: Could not compute training batch metrics at step {state.global_step}: {e}")
         return control

metrics_collector = MetricsCollectorCallback()
training_metrics_callback = TrainingMetricsCallback()

In [46]:
# --- Initialize Trainer for HPO ---
print("Initializing Trainer with model_init for HPO...")
# If using AdversarialTrainer, instantiate that class instead:
# trainer_for_hpo = AdversarialTrainer(
trainer_for_hpo = AdversarialTrainer(
    model_init=model_init_for_hpo, # Use the model_init function
    args=training_args_for_hpo,
    data_collator=data_collator,
    train_dataset=combined_train_dataset,
    eval_dataset=eval_dataset,     # Use the AG News test set for evaluating HPO trials
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # No need for metrics_collector callback during HPO search
)
print("Trainer initialized for HPO.")

Initializing Trainer with model_init for HPO...
HPO Trial: Initializing base model + fixed r + initial alpha/dropout


  trainer_for_hpo = AdversarialTrainer(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Trainer initialized for HPO.


In [None]:
from transformers import TrainingArguments, TrainerState, TrainerControl

# --- 1. Create the Pruner Instance ---
print("Creating Optuna Pruner...")
max_reports_per_epoch = (7500 // 500) # Rough estimate based on 1 epoch / 7500 steps
# Choose your pruner (e.g., Hyperband or Median)
pruner = optuna.pruners.HyperbandPruner(
    min_resource=1,
    max_resource=max_reports_per_epoch, 
    reduction_factor=3
)
# pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=6)
print(f"Using Pruner: {type(pruner).__name__}")

# --- 2. Define the Custom Pruning Callback ---
class OptunaPruningCallbackWithManualCheck(TrainerCallback):
    def __init__(self, pruner: optuna.pruners.BasePruner):
        self.pruner = pruner
        print("OptunaPruningCallbackWithManualCheck Initialized.")

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics: dict, **kwargs):
        # The Trainer's HPO integration *should* make the optuna trial available via state
        # We need to access it to report metrics and check for pruning
        trial = getattr(state, "trial", None) # Safely get trial if it exists

        if trial is None:
            # This might happen if not running inside trainer.hyperparameter_search
            # Or if the version doesn't expose it this way.
            # print("Warning: Optuna trial not found in TrainerState during on_evaluate.")
            return control

        # Get the metric value (must match compute_objective/metric_for_best_model)
        metric_value = metrics.get("eval_accuracy")
        if metric_value is None:
            print(f"Warning: eval_accuracy not found in metrics for pruning check at step {state.global_step}.")
            return control

        # Report the intermediate value to the trial
        trial.report(metric_value, state.global_step)
        print(f"Trial {trial.number}: Reported metric {metric_value:.4f} at step {state.global_step}.") # Debug print

        # Ask the pruner (that we hold) if this trial should be pruned
        if self.pruner.prune(study=trial.study, trial=trial): # Use study associated with trial
            message = f"Trial {trial.number} pruned at step {state.global_step}."
            print(message)
            raise optuna.TrialPruned(message)

        return control

# --- 3. Initialize Trainer with the Callback ---
print("Initializing Trainer with model_init and Pruning Callback...")

# Create instance of the callback
optuna_pruning_callback = OptunaPruningCallbackWithManualCheck(pruner)

# Initialize Trainer (use AdversarialTrainer if applicable)
trainer_for_hpo = AdversarialTrainer( # Or AdversarialTrainer(...)
    model_init=model_init_for_hpo, # Your existing model_init
    args=training_args_for_hpo,    # Your existing HPO args
    data_collator=data_collator,
    train_dataset=combined_train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[optuna_pruning_callback], # ADD THE CUSTOM CALLBACK HERE
)
print("Trainer initialized for HPO with Pruning Callback.")


# --- 4. Run HPO Search (WITHOUT passing study) ---
print(f"Starting hyperparameter search with Optuna ({N_HPO_TRIALS} trials)...")
# Let the Trainer create the study internally, the callback will interact with it
start_hpo_time = time.time()

# Define objective computation (still needed for final value)
def compute_objective(metrics: dict) -> float:
    metric_value = metrics.get("eval_accuracy")
    if metric_value is None: return 0.0
    return metric_value

best_run = trainer_for_hpo.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    n_trials=N_HPO_TRIALS,
    compute_objective=compute_objective,
    # study=study, # DO NOT PASS STUDY HERE
)

end_hpo_time = time.time()
print("\n--- Hyperparameter Search Finished ---")
print(f"HPO Duration: {end_hpo_time - start_hpo_time:.2f} seconds")
print(f"Best Run Results:")
print(f"  Trial ID (Optuna): {best_run.run_id}") # run_id is Optuna's trial number
print(f"  Objective Value (eval_accuracy): {best_run.objective}")
print(f"  Best Hyperparameters: {best_run.hyperparameters}")

Creating Optuna Pruner...
Using Pruner: HyperbandPruner
Initializing Trainer with model_init and Pruning Callback...
OptunaPruningCallbackWithManualCheck Initialized.
HPO Trial: Initializing base model + fixed r + initial alpha/dropout


  trainer_for_hpo = AdversarialTrainer( # Or AdversarialTrainer(...)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[I 2025-04-15 04:53:52,828] A new study created in memory with name: no-name-8b7e273b-4876-4815-97d4-b588ecec4bed
Trying to set lora_alpha_multiplier in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_dropout in the hyperparameter search but

Trainer initialized for HPO with Pruning Callback.
Starting hyperparameter search with Optuna (20 trials)...
HPO Trial: Initializing base model + fixed r + initial alpha/dropout


Step,Training Loss,Validation Loss,Accuracy,F1
500,2.2374,1.658672,0.885658,0.88524
1000,1.179,1.43883,0.895395,0.895257
1500,1.0945,1.380149,0.894474,0.894201
2000,1.0782,1.341694,0.896316,0.895955
2500,1.0457,1.305114,0.8975,0.897317
3000,1.0281,1.296962,0.898947,0.898658
3500,0.9975,1.298097,0.899211,0.898922


[I 2025-04-15 06:20:00,138] Trial 0 finished with value: 0.8992105263157895 and parameters: {'learning_rate': 2.3993486539569273e-05, 'num_train_epochs': 1, 'weight_decay': 0.16964184665842208, 'warmup_ratio': 0.14611084220497558, 'label_smoothing_factor': 0.2, 'gradient_accumulation_steps': 4, 'lora_alpha_multiplier': 1, 'lora_dropout': 0.2}. Best is trial 0 with value: 0.8992105263157895.
Trying to set lora_alpha_multiplier in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_dropout in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HPO Trial: Initializing base model + fixed r + initial alpha/dropout


Step,Training Loss,Validation Loss,Accuracy,F1
500,2.7876,2.784236,0.276316,0.14987
1000,2.6053,2.174863,0.855263,0.852934
1500,1.4387,1.589296,0.883026,0.882467
2000,1.2375,1.509378,0.892632,0.892414
2500,1.1524,1.44905,0.889737,0.889269
3000,1.1197,1.384844,0.894868,0.894635
3500,1.0887,1.355681,0.895789,0.895604
4000,1.0413,1.341303,0.895,0.894566
4500,1.0889,1.306831,0.898421,0.898123
5000,0.9895,1.3107,0.898289,0.898012


[I 2025-04-15 08:22:25,672] Trial 1 finished with value: 0.9056578947368421 and parameters: {'learning_rate': 2.7646451679018978e-05, 'num_train_epochs': 1, 'weight_decay': 0.06717801573840793, 'warmup_ratio': 0.18706404288326958, 'label_smoothing_factor': 0.0, 'gradient_accumulation_steps': 1, 'lora_alpha_multiplier': 1, 'lora_dropout': 0.1}. Best is trial 1 with value: 0.9056578947368421.
Trying to set lora_alpha_multiplier in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_dropout in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HPO Trial: Initializing base model + fixed r + initial alpha/dropout


Step,Training Loss,Validation Loss,Accuracy,F1
500,2.7579,2.729067,0.574737,0.55649
1000,1.392,1.582008,0.885526,0.8851
1500,1.2655,1.506019,0.891447,0.891064
2000,1.1666,1.47664,0.891974,0.891761
2500,1.1257,1.438041,0.893026,0.892635
3000,1.1231,1.399004,0.895526,0.895321
3500,1.0989,1.385061,0.895,0.89483
4000,1.0599,1.370346,0.895,0.894675
4500,1.1193,1.341063,0.895263,0.894962
5000,1.0231,1.354651,0.895395,0.895126


[I 2025-04-15 10:24:49,488] Trial 2 finished with value: 0.9010526315789473 and parameters: {'learning_rate': 1.7492051078303416e-05, 'num_train_epochs': 1, 'weight_decay': 0.29650518751275967, 'warmup_ratio': 0.041357842290825425, 'label_smoothing_factor': 0.15000000000000002, 'gradient_accumulation_steps': 1, 'lora_alpha_multiplier': 4, 'lora_dropout': 0.15000000000000002}. Best is trial 1 with value: 0.9056578947368421.
Trying to set lora_alpha_multiplier in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_dropout in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


HPO Trial: Initializing base model + fixed r + initial alpha/dropout


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
500,1.4254,1.499351,0.889737,0.889322
1000,1.0931,1.377672,0.895789,0.89556
1500,1.0246,1.278351,0.899605,0.899468
2000,0.9352,1.22784,0.902368,0.901877
2500,0.8559,1.16784,0.904474,0.904271
3000,0.7702,1.054429,0.905526,0.905242
3500,0.6613,1.022271,0.906842,0.906656
4000,0.6227,1.02788,0.907105,0.906811
4500,0.58,1.099203,0.904211,0.903785
5000,0.5918,0.976683,0.908684,0.908489


[I 2025-04-15 12:02:57,732] Trial 3 finished with value: 0.9068421052631579 and parameters: {'learning_rate': 6.2426198959672e-05, 'num_train_epochs': 1, 'weight_decay': 0.018224213058098713, 'warmup_ratio': 0.06090936556667781, 'label_smoothing_factor': 0.05, 'gradient_accumulation_steps': 2, 'lora_alpha_multiplier': 1, 'lora_dropout': 0.1}. Best is trial 3 with value: 0.9068421052631579.
Trying to set lora_alpha_multiplier in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_dropout in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HPO Trial: Initializing base model + fixed r + initial alpha/dropout


Step,Training Loss,Validation Loss,Accuracy,F1
500,2.7152,2.477549,0.809342,0.797798
1000,1.2928,1.5237,0.886316,0.885991
1500,1.2046,1.464739,0.892632,0.892228
2000,1.1106,1.425586,0.894737,0.894533
2500,1.0796,1.377716,0.894605,0.894228
3000,1.0668,1.343527,0.896184,0.896004
3500,1.0529,1.33446,0.897368,0.897138
4000,1.0124,1.31719,0.898158,0.897774
4500,1.0744,1.296613,0.899868,0.899593
5000,0.9726,1.302864,0.898684,0.898422


[I 2025-04-15 14:04:49,983] Trial 4 finished with value: 0.9063157894736842 and parameters: {'learning_rate': 2.5217218638599337e-05, 'num_train_epochs': 1, 'weight_decay': 0.034093832896985476, 'warmup_ratio': 0.04459190513152816, 'label_smoothing_factor': 0.0, 'gradient_accumulation_steps': 1, 'lora_alpha_multiplier': 2, 'lora_dropout': 0.05}. Best is trial 3 with value: 0.9068421052631579.
Trying to set lora_alpha_multiplier in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_dropout in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HPO Trial: Initializing base model + fixed r + initial alpha/dropout


Step,Training Loss,Validation Loss,Accuracy,F1
500,2.4209,1.872746,0.872105,0.871466
1000,1.3768,1.583091,0.889474,0.889066
1500,1.2769,1.499515,0.891974,0.891711


[I 2025-04-15 14:25:18,548] Trial 5 pruned. 
Trying to set lora_alpha_multiplier in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_dropout in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


HPO Trial: Initializing base model + fixed r + initial alpha/dropout


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
500,2.1528,1.755479,0.878289,0.877604
1000,1.3027,1.552196,0.886579,0.886261
1500,1.2453,1.500809,0.891447,0.891084


[I 2025-04-15 14:38:06,272] Trial 6 pruned. 
Trying to set lora_alpha_multiplier in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_dropout in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


HPO Trial: Initializing base model + fixed r + initial alpha/dropout


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
500,2.7361,2.63032,0.81,0.806305
1000,1.3102,1.519846,0.892368,0.892119
1500,1.1537,1.379949,0.896974,0.896826
2000,1.0654,1.343815,0.896447,0.89606
2500,1.0151,1.301609,0.897105,0.896853
3000,0.997,1.274174,0.899079,0.898789
3500,0.9662,1.253011,0.900132,0.899946
4000,0.9446,1.212001,0.902632,0.902321
4500,0.9065,1.22117,0.902763,0.902504
5000,0.9068,1.166971,0.9025,0.902336
