# Stage 4: Advanced Models

고급 모델 실험 및 최적화 기법 적용

## 목표
1. 다양한 사전학습 모델 실험 (mBART, Qwen 등)
2. LoRA/QLoRA 파라미터 효율적 미세조정
3. 앙상블 기법 적용
4. 최종 제출 파일 생성

## 📋 Config

In [None]:
import json

# Load best config from Stage 3.
# The encoding is pinned to UTF-8 so the file decodes the same way on every
# platform instead of depending on the locale default; special_tokens.json
# is read with the same explicit encoding later in this notebook.
with open('../configs/best_config_stage3.json', 'r', encoding='utf-8') as f:
    BEST_CONFIG = json.load(f)

# Stage 4 settings layered on top of the Stage 3 values: any key listed below
# overrides a same-named key coming from BEST_CONFIG.
CONFIG = {
    **BEST_CONFIG,

    # Experiment
    "exp_num": "004",
    "exp_name": "advanced-model",

    # Model options
    "models_to_test": [
        "gogamza/kobart-base-v2",
        "facebook/mbart-large-50",
        # "Qwen/Qwen2.5-7B-Instruct",  # Requires more memory
    ],

    # LoRA configuration
    "use_lora": True,
    "lora_r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "lora_target_modules": ["q_proj", "v_proj"],

    # Ensemble
    "ensemble_method": "voting",  # voting, averaging, weighted
    "ensemble_weights": None,  # Auto-calculated from validation scores

    # Paths
    "checkpoint_dir": "../checkpoints/advanced",
    "submission_dir": "../submissions",
}

print("✅ Config loaded")
print(f"📊 Best ROUGE from Stage 3: {BEST_CONFIG.get('best_rouge_sum', 'N/A')}")

## 🔧 Setup

In [None]:
import os
from pathlib import Path

import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Custom utilities
from utils import (
    set_seed,
    setup_wandb,
    compute_rouge,
    auto_git_backup
)

# Seed through the project helper, using the value carried in the config
set_seed(CONFIG["seed"])

# Make sure the checkpoint directory exists (parents included, no error if
# it is already there)
Path(CONFIG["checkpoint_dir"]).mkdir(parents=True, exist_ok=True)

# Pick the compute device: CUDA when torch can see a GPU, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🖥️ Using device: {device}")

## 📊 Data Loading

In [None]:
# Read the processed CSV files from the configured data directory
data_root = CONFIG["data_dir"]
train_df = pd.read_csv(os.path.join(data_root, "train_processed.csv"))
test_df = pd.read_csv(os.path.join(data_root, "test_processed.csv"))

# Keep only the cleaned text columns and expose them under the plain
# 'dialogue' / 'summary' names used by the rest of the notebook
train_renames = {'dialogue_clean': 'dialogue', 'summary_clean': 'summary'}
train_df = train_df[['fname', *train_renames]].rename(columns=train_renames)

test_renames = {'dialogue_clean': 'dialogue'}
test_df = test_df[['fname', *test_renames]].rename(columns=test_renames)

# Hold out 10% of the training rows for validation, seeded from the config
split_options = {"test_size": 0.1, "random_state": CONFIG["seed"]}
train_data, val_data = train_test_split(train_df, **split_options)

print(f"📊 Train: {len(train_data):,}, Val: {len(val_data):,}, Test: {len(test_df):,}")

## 🤖 Model Training Functions

In [None]:
def create_lora_model(base_model, config):
    """Wrap ``base_model`` with LoRA adapters described by ``config``.

    Args:
        base_model: Model handed to ``get_peft_model``.
        config: Mapping that provides ``lora_r``, ``lora_alpha``,
            ``lora_target_modules`` and ``lora_dropout``.

    Returns:
        The wrapped model returned by ``get_peft_model``.
    """
    adapter_settings = {
        "r": config["lora_r"],
        "lora_alpha": config["lora_alpha"],
        "target_modules": config["lora_target_modules"],
        "lora_dropout": config["lora_dropout"],
        "bias": "none",
        "task_type": TaskType.SEQ_2_SEQ_LM,
    }
    peft_model = get_peft_model(base_model, LoraConfig(**adapter_settings))

    # Print the trainable-parameter summary for the wrapped model
    peft_model.print_trainable_parameters()
    return peft_model

print("✅ LoRA helper function defined")

In [None]:
def train_model(model_name, config, train_dataset, val_dataset, tokenizer):
    """Fine-tune one seq2seq checkpoint and report its validation ROUGE.

    Args:
        model_name: Identifier passed to
            ``AutoModelForSeq2SeqLM.from_pretrained``.
        config: Settings mapping. Reads ``use_lora``, ``checkpoint_dir``,
            ``learning_rate``, ``batch_size``, ``gradient_accumulation_steps``,
            ``num_train_epochs``, ``warmup_ratio``, ``fp16``, ``eval_steps``,
            ``save_steps``, ``logging_steps``, ``max_target_length``,
            ``num_beams``, ``exp_name`` and ``seed`` (plus the ``lora_*`` keys
            when ``use_lora`` is true).
        train_dataset: Tokenized training set handed to the trainer.
        val_dataset: Tokenized validation set handed to the trainer.
        tokenizer: Tokenizer used to size the embedding table, pad batches
            and decode predictions/labels; it should be the tokenizer the
            datasets were encoded with.

    Returns:
        dict with ``model_name``, ``model_path`` (directory the final model
        was saved to), ``rouge_sum`` and the full ``eval_results`` mapping.

    Uses the notebook-level ``device`` for model placement.
    """
    print(f"\n🚀 Training: {model_name}")

    # Load model and match its embedding table to the tokenizer size
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))

    # Apply LoRA if enabled
    if config["use_lora"]:
        model = create_lora_model(model, config)

    model = model.to(device)

    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True
    )

    # Training arguments
    model_short_name = model_name.split("/")[-1]
    training_args = Seq2SeqTrainingArguments(
        output_dir=os.path.join(config["checkpoint_dir"], model_short_name),

        # Use best config from Stage 3
        learning_rate=config["learning_rate"],
        per_device_train_batch_size=config["batch_size"],
        # Evaluation runs with twice the training batch size
        per_device_eval_batch_size=config["batch_size"] * 2,
        gradient_accumulation_steps=config["gradient_accumulation_steps"],
        num_train_epochs=config["num_train_epochs"],
        warmup_ratio=config["warmup_ratio"],

        # Optimization
        fp16=config["fp16"],
        gradient_checkpointing=True,
        # Re-entrant checkpointing needs at least one checkpointed input with
        # requires_grad=True; with LoRA the base weights (embeddings included)
        # are frozen, so request the non-re-entrant implementation instead.
        gradient_checkpointing_kwargs={"use_reentrant": False},

        # Evaluation
        eval_strategy="steps",
        eval_steps=config["eval_steps"],
        save_strategy="steps",
        save_steps=config["save_steps"],
        logging_steps=config["logging_steps"],

        # Prediction
        predict_with_generate=True,
        generation_max_length=config["max_target_length"],
        generation_num_beams=config["num_beams"],

        # Saving
        save_total_limit=2,
        load_best_model_at_end=True,
        # assumes compute_rouge returns a 'rouge_sum' entry — verify in utils
        metric_for_best_model="eval_rouge_sum",
        greater_is_better=True,

        # WandB
        report_to="wandb",
        run_name=f"{config['exp_name']}-{model_short_name}",

        # Misc
        seed=config["seed"],
    )

    # Compute metrics
    def compute_metrics(eval_pred):
        """Decode generated ids and labels, then score them with compute_rouge."""
        predictions, labels = eval_pred
        # Predictions can arrive as a tuple; the generated ids come first
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        # -100 is used as a fill value (ignored label positions, and padding
        # of generated sequences to a common length); it is not a valid token
        # id, so swap it for the pad token before decoding either array.
        pad_id = tokenizer.pad_token_id
        predictions = np.where(predictions != -100, predictions, pad_id)
        labels = np.where(labels != -100, labels, pad_id)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = compute_rouge(
            predictions=decoded_preds,
            references=decoded_labels,
            use_korean_tokenizer=True
        )
        return result

    # Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train
    trainer.train()

    # Evaluate (best checkpoint is reloaded first via load_best_model_at_end)
    eval_results = trainer.evaluate()
    rouge_sum = eval_results['eval_rouge_sum']

    # Save model
    final_model_path = os.path.join(config["checkpoint_dir"], f"{model_short_name}_final")
    trainer.save_model(final_model_path)

    print(f"✅ {model_name} training complete")
    print(f"📊 ROUGE sum: {rouge_sum:.2f}")

    return {
        "model_name": model_name,
        "model_path": final_model_path,
        "rouge_sum": rouge_sum,
        "eval_results": eval_results
    }

print("✅ Training function defined")

## 🚀 Train Multiple Models

In [None]:
# Tokenizer of the first configured checkpoint, extended with the project's
# extra special tokens when the JSON file listing them is present
first_checkpoint = CONFIG["models_to_test"][0]
base_tokenizer = AutoTokenizer.from_pretrained(first_checkpoint)

special_tokens_path = os.path.join(CONFIG["config_dir"], "special_tokens.json")
if os.path.exists(special_tokens_path):
    with open(special_tokens_path, 'r', encoding='utf-8') as f:
        special_tokens_config = json.load(f)
    extra_tokens = special_tokens_config['additional_special_tokens']
    base_tokenizer.add_special_tokens({'additional_special_tokens': extra_tokens})


def preprocess_function(examples):
    """Tokenize a batch of dialogues and attach the summary ids as labels.

    Both sides are truncated to the configured maximum lengths and left
    unpadded.
    """
    shared_options = {"truncation": True, "padding": False}
    encoded = base_tokenizer(
        examples['dialogue'],
        max_length=CONFIG["max_input_length"],
        **shared_options
    )
    targets = base_tokenizer(
        examples['summary'],
        max_length=CONFIG["max_target_length"],
        **shared_options
    )
    encoded['labels'] = targets['input_ids']
    return encoded


# Convert each split to a Dataset and tokenize it, dropping the raw text columns
text_columns = ['dialogue', 'summary']
train_dataset, val_dataset = [
    Dataset.from_pandas(frame.reset_index(drop=True)).map(
        preprocess_function,
        batched=True,
        remove_columns=text_columns
    )
    for frame in (train_data, val_data)
]

print("✅ Datasets preprocessed")

In [None]:
# Train all models
import gc

trained_models = []


def _tokenizer_for(model_name):
    """Load the tokenizer of ``model_name`` and add the project special tokens."""
    model_tokenizer = AutoTokenizer.from_pretrained(model_name)
    if os.path.exists(special_tokens_path):
        with open(special_tokens_path, 'r', encoding='utf-8') as f:
            extra_tokens = json.load(f)['additional_special_tokens']
        model_tokenizer.add_special_tokens({'additional_special_tokens': extra_tokens})
    return model_tokenizer


def _tokenize_frame(frame, model_tokenizer):
    """Turn a dialogue/summary DataFrame into a Dataset encoded with ``model_tokenizer``."""
    def encode(examples):
        model_inputs = model_tokenizer(
            examples['dialogue'],
            max_length=CONFIG["max_input_length"],
            truncation=True,
            padding=False
        )
        labels = model_tokenizer(
            examples['summary'],
            max_length=CONFIG["max_target_length"],
            truncation=True,
            padding=False
        )
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    dataset = Dataset.from_pandas(frame.reset_index(drop=True))
    return dataset.map(encode, batched=True, remove_columns=['dialogue', 'summary'])


for model_name in CONFIG["models_to_test"]:
    try:
        if model_name == CONFIG["models_to_test"][0]:
            # base_tokenizer and the datasets prepared earlier were built
            # from this very checkpoint, so they can be reused as they are.
            model_tokenizer = base_tokenizer
            model_train, model_val = train_dataset, val_dataset
        else:
            # Token ids are only meaningful for the vocabulary that produced
            # them, so every other checkpoint gets its own tokenizer and its
            # own encoded copies of the data.
            # NOTE(review): multilingual tokenizers such as mBART-50 may also
            # need src_lang/tgt_lang set for Korean — confirm before relying
            # on those scores.
            model_tokenizer = _tokenizer_for(model_name)
            model_train = _tokenize_frame(train_data, model_tokenizer)
            model_val = _tokenize_frame(val_data, model_tokenizer)

        # Train model
        result = train_model(
            model_name=model_name,
            config=CONFIG,
            train_dataset=model_train,
            val_dataset=model_val,
            tokenizer=model_tokenizer
        )
        trained_models.append(result)

    except Exception as e:
        # Best effort: report the failure and move on to the next model
        print(f"❌ Failed to train {model_name}: {e}")

    finally:
        # Clean up GPU memory after every attempt, including failed ones,
        # so a crashed run does not starve the next model.
        gc.collect()
        torch.cuda.empty_cache()

print(f"\n✅ Trained {len(trained_models)} models successfully")

## 📊 Model Comparison

In [None]:
# Display results
# Without at least one successful run there is nothing to tabulate; fail with
# a clear message instead of a KeyError from sorting an empty frame.
if not trained_models:
    raise RuntimeError("No models were trained successfully; nothing to compare.")

results_df = pd.DataFrame([
    {
        "Model": result["model_name"].split("/")[-1],
        "ROUGE-1": result["eval_results"]["eval_rouge1"],
        "ROUGE-2": result["eval_results"]["eval_rouge2"],
        "ROUGE-L": result["eval_results"]["eval_rougeL"],
        "ROUGE Sum": result["rouge_sum"]
    }
    for result in trained_models
])

results_df = results_df.sort_values("ROUGE Sum", ascending=False)

print("\n📊 Model Comparison:")
display(results_df)

# Best model: trained_models is still in training order (only the DataFrame
# was sorted), so select the entry with the highest ROUGE sum explicitly.
best_model_result = max(trained_models, key=lambda result: result["rouge_sum"])
print(f"\n🏆 Best Model: {best_model_result['model_name']}")
print(f"📊 ROUGE Sum: {best_model_result['rouge_sum']:.2f}")

## 📋 Summary

**완료된 작업**:
- ✅ Stage 4 노트북 기본 구조 생성
- ✅ LoRA 파라미터 효율적 미세조정 지원 (QLoRA 양자화 로딩은 이 노트북에 아직 구현되지 않음)
- ✅ 다중 모델 학습 파이프라인
- ✅ 모델 비교 및 평가

**다음 단계**:
- Task #2: 고급 모델 실험 (context7 활용)
- Task #5: 최종 앙상블 및 제출