# Stage 3: Hyperparameter Optimization

Optuna를 사용한 자동 하이퍼파라미터 최적화

## 목표
1. Optuna 베이지안 최적화 설정
2. WandB 로깅 통합 (각 trial의 학습 과정을 WandB에 기록)
3. 20+ trials 실행
4. 최적 하이퍼파라미터 발견 및 저장
5. 최적 설정으로 재학습 (저장된 best config를 사용해 다음 단계에서 수행)

## 📋 Base Config

In [None]:
# Settings shared by every trial; values that Optuna samples are not listed here.
BASE_CONFIG = dict(
    # --- experiment bookkeeping ---
    exp_name="kobart-optuna",
    seed=42,
    n_trials=20,
    # --- pretrained checkpoint and token-length limits ---
    model_name="gogamza/kobart-base-v2",
    max_input_length=512,
    max_target_length=128,
    # --- training settings held constant across trials ---
    fp16=True,
    gradient_checkpointing=True,
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    # --- filesystem locations (relative to the notebook) ---
    data_dir="../data/processed",
    config_dir="../configs",
    checkpoint_dir="../checkpoints/optuna",
)

## 🔧 Setup

In [None]:
import os
import json
from pathlib import Path

import torch
import pandas as pd
import numpy as np
import optuna
from optuna.integration.wandb import WeightsAndBiasesCallback
from tqdm.auto import tqdm

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Custom utilities
from utils import (
    set_seed,
    compute_rouge,
    auto_git_backup
)

# Seed through the project helper, using the single seed from the base config
set_seed(BASE_CONFIG["seed"])

# Make sure the root folder for per-trial checkpoints exists
os.makedirs(BASE_CONFIG["checkpoint_dir"], exist_ok=True)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🖥️ Using device: {device}")

## 📊 Data Preparation

In [None]:
# Read the processed training CSV, keep the id column plus the cleaned text
# columns, and drop the "_clean" suffix from the text column names
column_aliases = {'dialogue_clean': 'dialogue', 'summary_clean': 'summary'}
train_df = pd.read_csv(os.path.join(BASE_CONFIG["data_dir"], "train_processed.csv"))
train_df = train_df[['fname', 'dialogue_clean', 'summary_clean']]
train_df = train_df.rename(columns=column_aliases)

# Hold out 10% of the rows for validation, seeded from the base config
train_data, val_data = train_test_split(
    train_df,
    random_state=BASE_CONFIG["seed"],
    test_size=0.1,
)

print(f"📊 Train: {len(train_data):,}, Val: {len(val_data):,}")

In [None]:
# A single tokenizer instance is shared by preprocessing and by every trial
tokenizer = AutoTokenizer.from_pretrained(BASE_CONFIG["model_name"])

# Register extra special tokens when the optional JSON file is present
special_tokens_path = os.path.join(BASE_CONFIG["config_dir"], "special_tokens.json")
if os.path.exists(special_tokens_path):
    with open(special_tokens_path, 'r', encoding='utf-8') as f:
        special_tokens_config = json.load(f)
    extra_tokens = special_tokens_config['additional_special_tokens']
    tokenizer.add_special_tokens({'additional_special_tokens': extra_tokens})

print(f"✅ Tokenizer loaded with {len(tokenizer)} tokens")

In [None]:
# Preprocess datasets once
def preprocess_function(examples):
    """Tokenize a batch of dialogue/summary pairs.

    Args:
        examples: Batch mapping with ``'dialogue'`` and ``'summary'`` entries.

    Returns:
        The tokenizer output for the dialogues, with the summaries' token ids
        stored under ``'labels'``.
    """
    # Both sides are truncated to their configured limit and left unpadded.
    shared_kwargs = {"truncation": True, "padding": False}

    model_inputs = tokenizer(
        examples['dialogue'],
        max_length=BASE_CONFIG["max_input_length"],
        **shared_kwargs,
    )
    target_encoding = tokenizer(
        examples['summary'],
        max_length=BASE_CONFIG["max_target_length"],
        **shared_kwargs,
    )

    model_inputs['labels'] = target_encoding['input_ids']
    return model_inputs

# Convert each split to a Dataset (with a fresh 0..n-1 index) and tokenize it
# in batches, dropping the raw text columns once they have been encoded
train_dataset = Dataset.from_pandas(train_data.reset_index(drop=True)).map(
    preprocess_function,
    batched=True,
    remove_columns=['dialogue', 'summary'],
)
val_dataset = Dataset.from_pandas(val_data.reset_index(drop=True)).map(
    preprocess_function,
    batched=True,
    remove_columns=['dialogue', 'summary'],
)

print("✅ Datasets preprocessed")

## 🔬 Optuna Objective Function

In [None]:
def objective(trial):
    """Train and evaluate one hyperparameter configuration for an Optuna trial.

    Samples learning rate, batch size, gradient accumulation steps, warmup
    ratio, epoch count, beam width and length penalty from ``trial``,
    fine-tunes a freshly loaded ``BASE_CONFIG["model_name"]`` on the
    module-level ``train_dataset`` and scores it on ``val_dataset``.

    Args:
        trial: The Optuna trial object that supplies hyperparameter suggestions.

    Returns:
        The ``eval_rouge_sum`` value from the final evaluation.

    Raises:
        KeyError: If the evaluation metrics do not contain ``eval_rouge_sum``.
    """
    import gc

    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    gradient_accumulation_steps = trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4])
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.2)
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 10)
    num_beams = trial.suggest_int("num_beams", 3, 6)
    length_penalty = trial.suggest_float("length_penalty", 0.5, 2.0)

    # Load a fresh model for this trial; embeddings are resized because the
    # shared tokenizer may have gained extra special tokens.
    model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CONFIG["model_name"])
    model.resize_token_embeddings(len(tokenizer))

    # Apply the sampled length penalty to generation; previously it was
    # suggested but never used, so it had no influence on the score.
    # num_beams is set alongside it so the generation config stays consistent
    # (length_penalty only applies to beam-based decoding).
    model.generation_config.num_beams = num_beams
    model.generation_config.length_penalty = length_penalty
    model = model.to(device)

    # Mixed precision is only requested when running on CUDA.
    use_fp16 = bool(BASE_CONFIG["fp16"]) and device == "cuda"

    # Pre-declare so the cleanup in ``finally`` works even after an early failure.
    data_collator = None
    trainer = None
    try:
        # Pads inputs and labels per batch (the datasets are stored unpadded).
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=tokenizer,
            model=model,
            padding=True
        )

        training_args = Seq2SeqTrainingArguments(
            output_dir=os.path.join(BASE_CONFIG["checkpoint_dir"], f"trial_{trial.number}"),

            # Hyperparameters from trial
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size * 2,
            gradient_accumulation_steps=gradient_accumulation_steps,
            num_train_epochs=num_train_epochs,
            warmup_ratio=warmup_ratio,

            # Fixed params
            fp16=use_fp16,
            gradient_checkpointing=BASE_CONFIG["gradient_checkpointing"],

            # Evaluation / checkpoint cadence
            eval_strategy="steps",
            eval_steps=BASE_CONFIG["eval_steps"],
            save_strategy="steps",
            save_steps=BASE_CONFIG["save_steps"],
            logging_steps=BASE_CONFIG["logging_steps"],

            # Prediction
            predict_with_generate=True,
            generation_max_length=BASE_CONFIG["max_target_length"],
            generation_num_beams=num_beams,

            # Saving: keep one checkpoint and restore the best one by ROUGE sum
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="eval_rouge_sum",
            greater_is_better=True,

            # WandB
            # NOTE(review): the active WandB run is not closed between trials
            # here, so later trials may log into the first trial's run - confirm.
            report_to="wandb",
            run_name=f"{BASE_CONFIG['exp_name']}-trial{trial.number}",

            # Misc
            seed=BASE_CONFIG["seed"],
        )

        def compute_metrics(eval_pred):
            """Decode generated and reference ids and score them with compute_rouge."""
            predictions, labels = eval_pred
            if isinstance(predictions, tuple):
                predictions = predictions[0]
            pad_id = tokenizer.pad_token_id
            # -100 is a padding/ignore marker, not a vocabulary id. It can appear
            # in gathered predictions as well as labels, so both are replaced
            # with the pad token before decoding.
            predictions = np.where(predictions != -100, predictions, pad_id)
            labels = np.where(labels != -100, labels, pad_id)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            # Presumably returns a dict containing "rouge_sum" (the trainer
            # reports it as "eval_rouge_sum") - verify against utils.compute_rouge.
            return compute_rouge(
                predictions=decoded_preds,
                references=decoded_labels,
                use_korean_tokenizer=True
            )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        # Train, then score the restored best checkpoint
        trainer.train()
        eval_results = trainer.evaluate()
        rouge_sum = eval_results['eval_rouge_sum']
    finally:
        # Release every reference to the model (the collator holds one too)
        # before emptying the CUDA cache, including when the trial failed.
        del trainer
        del data_collator
        del model
        gc.collect()
        torch.cuda.empty_cache()

    return rouge_sum

print("✅ Objective function defined")

## 🚀 Run Optuna Study

In [None]:
# Build a maximizing study named after the experiment, with a seeded TPE sampler
study = optuna.create_study(
    study_name=BASE_CONFIG["exp_name"],
    sampler=optuna.samplers.TPESampler(seed=BASE_CONFIG["seed"]),
    direction="maximize",
)

print(f"🔬 Starting Optuna study with {BASE_CONFIG['n_trials']} trials...")

In [None]:
# Execute the configured number of trials, showing Optuna's progress bar
study.optimize(
    func=objective,
    show_progress_bar=True,
    n_trials=BASE_CONFIG["n_trials"],
)

print("\n✅ Optimization complete!")

## 📊 Results Analysis

In [None]:
# Pull the best-scoring trial and print its score and sampled hyperparameters
best_trial = study.best_trial

print("\n🏆 Best Trial:")
print(f"  Value (ROUGE sum): {best_trial.value:.4f}")
print("  Params:")
for param_name, param_value in best_trial.params.items():
    print(f"    {param_name}: {param_value}")

In [None]:
# Two side-by-side panels: objective value per trial, and parameter importances
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
history_ax, importance_ax = axes

# Left panel: value of each trial in order, with the best value as a dashed line
all_trials = study.trials
history_ax.plot(
    [t.number for t in all_trials],
    [t.value for t in all_trials],
    'bo-',
)
history_ax.axhline(best_trial.value, color='r', linestyle='--', label='Best')
history_ax.set_xlabel('Trial')
history_ax.set_ylabel('ROUGE Sum')
history_ax.set_title('Optimization History')
history_ax.legend()
history_ax.grid(True)

# Right panel: horizontal bars of Optuna's per-parameter importance scores
importance = optuna.importance.get_param_importances(study)
importance_ax.barh(list(importance.keys()), list(importance.values()))
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Parameter Importance')
importance_ax.grid(True, axis='x')

plt.tight_layout()
plt.show()

## 💾 Save Best Config

In [None]:
# Merge the fixed settings with the best trial's sampled hyperparameters
# (trial values win on key clashes) and record the score and trial number
best_config = {
    **BASE_CONFIG,
    **best_trial.params,
    "best_rouge_sum": best_trial.value,
    "trial_number": best_trial.number
}

# The config directory is never created elsewhere in this notebook, so create
# it here; otherwise the write could fail after the whole study has run
os.makedirs(BASE_CONFIG["config_dir"], exist_ok=True)

# Save to file (UTF-8, non-ASCII characters kept as-is)
best_config_path = os.path.join(BASE_CONFIG["config_dir"], "best_config_stage3.json")
with open(best_config_path, 'w', encoding='utf-8') as f:
    json.dump(best_config, f, ensure_ascii=False, indent=2)

print(f"✅ Best config saved to {best_config_path}")

## 📋 Summary

**완료된 작업**:
- ✅ Optuna 베이지안 최적화 설정
- ✅ 20+ trials 실행
- ✅ 최적 하이퍼파라미터 발견
- ✅ best_config_stage3.json 저장
- ✅ 최적화 결과 시각화

**다음 단계**: 
- best_config로 전체 데이터셋 재학습
- Stage 4 - Advanced Models