# Stage 2: Baseline Training (KoBART)

KoBART 모델을 사용한 대화 요약 베이스라인 모델 학습

## 목표
1. KoBART 모델 로딩 및 설정
2. 데이터셋 준비 (Hugging Face Dataset)
3. 모델 학습 (Seq2SeqTrainer)
4. ROUGE 평가
5. 첫 제출 파일 생성

## 📋 Config

In [None]:
CONFIG = dict(
    # Experiment identity and reproducibility
    exp_num="001",
    exp_name="kobart-baseline",
    seed=42,

    # Pretrained checkpoint and tokenized sequence limits
    model_name="gogamza/kobart-base-v2",
    max_input_length=512,
    max_target_length=128,

    # Optimisation hyperparameters
    learning_rate=5e-5,
    batch_size=8,
    gradient_accumulation_steps=2,  # 8 * 2 = 16 samples per optimizer step
    num_train_epochs=3,
    warmup_ratio=0.1,
    fp16=True,  # request mixed-precision training

    # Step intervals for evaluation, checkpointing and logging
    eval_steps=500,
    save_steps=500,
    logging_steps=100,

    # Decoding settings used when generating summaries
    num_beams=4,
    length_penalty=1.0,
    no_repeat_ngram_size=3,

    # Input / output locations (relative paths)
    data_dir="../data/processed",
    config_dir="../configs",
    checkpoint_dir="../checkpoints/baseline",
    submission_dir="../submissions",
)

## 🔧 Setup

In [None]:
import os
import json
from pathlib import Path

import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from datasets import Dataset

# Custom utilities
from utils import (
    set_seed,
    setup_wandb,
    compute_rouge,
    auto_git_backup
)

# Apply the configured seed through the project helper
set_seed(CONFIG["seed"])

# Make sure both output folders exist before anything is written to them
for dir_key in ("checkpoint_dir", "submission_dir"):
    os.makedirs(CONFIG[dir_key], exist_ok=True)

# Use the GPU when PyTorch reports one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🖥️ Using device: {device}")

## 🔬 WandB Setup

In [None]:
# Hand the experiment config to the project's WandB helper.
# The run name is "<exp_name>-exp<exp_num>", filled in from CONFIG.
run = setup_wandb(
    project_name="dialogue-summarization-competition",
    config_dict=CONFIG,
    run_name="{exp_name}-exp{exp_num}".format(**CONFIG),
    tags=["baseline", "kobart", "stage2"],
)

## 📊 Data Loading

In [None]:
from sklearn.model_selection import train_test_split

# Read the processed train / test CSV files
processed_dir = CONFIG["data_dir"]
train_df = pd.read_csv(os.path.join(processed_dir, "train_processed.csv"))
test_df = pd.read_csv(os.path.join(processed_dir, "test_processed.csv"))

# Keep only the cleaned text columns and expose them under the plain names.
# rename() ignores mapping keys that are absent, so one mapping serves both frames.
cleaned_names = {'dialogue_clean': 'dialogue', 'summary_clean': 'summary'}
train_df = train_df[['fname', *cleaned_names]].rename(columns=cleaned_names)
test_df = test_df[['fname', 'dialogue_clean']].rename(columns=cleaned_names)

print(f"📊 Train: {len(train_df):,} samples")
print(f"📊 Test: {len(test_df):,} samples")

# Hold out 10% of the training rows for validation, seeded for repeatability
train_data, val_data = train_test_split(
    train_df,
    test_size=0.1,
    random_state=CONFIG["seed"],
)

print(f"📊 Split - Train: {len(train_data):,}, Val: {len(val_data):,}")

## 🤖 Model & Tokenizer

In [None]:
# Fetch the tokenizer that belongs to the configured checkpoint
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])
print(f"✅ Tokenizer loaded: {CONFIG['model_name']}")

# Optionally extend the vocabulary with project-specific special tokens.
# num_added is read afterwards to decide whether embeddings need resizing.
special_tokens_path = os.path.join(CONFIG["config_dir"], "special_tokens.json")
if not os.path.exists(special_tokens_path):
    print("⚠️ No special tokens file found, skipping...")
    num_added = 0
else:
    with open(special_tokens_path, 'r', encoding='utf-8') as f:
        special_tokens_config = json.load(f)

    # add_special_tokens returns how many tokens were new to the vocabulary
    extra_tokens = special_tokens_config['additional_special_tokens']
    num_added = tokenizer.add_special_tokens(
        {'additional_special_tokens': extra_tokens}
    )
    print(f"✅ Added {num_added} special tokens")

In [None]:
# Instantiate the pretrained seq2seq checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG["model_name"])

# Newly added special tokens enlarge the vocabulary, so the embedding
# matrix has to grow to match the tokenizer size
if num_added > 0:
    vocab_size = len(tokenizer)
    model.resize_token_embeddings(vocab_size)
    print(f"✅ Resized token embeddings to {vocab_size}")

# Place the weights on the selected device (after any resize)
model = model.to(device)

print(f"✅ Model loaded: {CONFIG['model_name']}")
print(f"📊 Model parameters: {model.num_parameters():,}")

## 🔄 Dataset Preparation

In [None]:
# Wrap each pandas frame in a Hugging Face Dataset; the index is reset first
# so the shuffled split indices are not carried along
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_data, val_data, test_df)
)

print("✅ Datasets created")

In [None]:
# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of dialogues and, when present, their summaries.

    Args:
        examples: Batch mapping with a 'dialogue' entry and optionally a
            'summary' entry.

    Returns:
        The tokenizer output for the dialogues, with a 'labels' entry holding
        the summary token ids when summaries were provided.
    """
    # No padding here: sequences stay ragged and are padded per batch later
    encoded = tokenizer(
        examples['dialogue'],
        max_length=CONFIG["max_input_length"],
        truncation=True,
        padding=False,
    )

    # Unlabelled batches (e.g. the test split) carry no targets
    if 'summary' not in examples:
        return encoded

    # NOTE(review): labels contain whatever special tokens the tokenizer adds
    # on its own; confirm this tokenizer appends an end-of-sequence token.
    target_encoding = tokenizer(
        examples['summary'],
        max_length=CONFIG["max_target_length"],
        truncation=True,
        padding=False,
    )
    encoded['labels'] = target_encoding['input_ids']
    return encoded

# Tokenize every split in batches and drop the raw text columns afterwards
print("🔄 Tokenizing datasets...")
labelled_text_columns = ['dialogue', 'summary']
train_dataset = train_dataset.map(
    preprocess_function, batched=True, remove_columns=labelled_text_columns
)
val_dataset = val_dataset.map(
    preprocess_function, batched=True, remove_columns=labelled_text_columns
)
# The test split has no summaries, so only the dialogue column is removed
test_dataset = test_dataset.map(
    preprocess_function, batched=True, remove_columns=['dialogue']
)

print("✅ Tokenization complete")

In [None]:
# Collator that pads each batch to its own longest sequence; the model is
# passed along so the collator can use it when assembling batches
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

print("✅ Data collator created")

## 📈 Training Setup

In [None]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=CONFIG["checkpoint_dir"],

    # Training
    num_train_epochs=CONFIG["num_train_epochs"],
    per_device_train_batch_size=CONFIG["batch_size"],
    # Evaluation runs without gradients, so a larger batch is used
    per_device_eval_batch_size=CONFIG["batch_size"] * 2,
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
    learning_rate=CONFIG["learning_rate"],
    warmup_ratio=CONFIG["warmup_ratio"],

    # Optimization
    # Mixed precision needs a CUDA device; without one the argument
    # validation fails, so fall back to full precision on CPU-only machines.
    fp16=bool(CONFIG["fp16"]) and torch.cuda.is_available(),
    gradient_checkpointing=True,

    # Evaluation & Logging
    # save_steps equals eval_steps so every saved checkpoint has a matching
    # evaluation, as load_best_model_at_end requires
    eval_strategy="steps",
    eval_steps=CONFIG["eval_steps"],
    save_strategy="steps",
    save_steps=CONFIG["save_steps"],
    logging_steps=CONFIG["logging_steps"],

    # Prediction
    # NOTE: only max length and beam count are set for evaluation-time
    # generation; CONFIG's length_penalty / no_repeat_ngram_size are not
    # passed here.
    predict_with_generate=True,
    generation_max_length=CONFIG["max_target_length"],
    generation_num_beams=CONFIG["num_beams"],

    # Saving
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # WandB
    report_to="wandb",
    run_name=f"{CONFIG['exp_name']}-exp{CONFIG['exp_num']}",

    # Misc
    seed=CONFIG["seed"],
    dataloader_num_workers=4,
)

print("✅ Training arguments configured")

In [None]:
# Compute metrics function
def compute_metrics(eval_pred):
    """Compute ROUGE between generated summaries and reference summaries.

    Args:
        eval_pred: ``(predictions, labels)`` pair supplied by the trainer.
            Predictions are generated token ids (the trainer is configured
            with ``predict_with_generate``); they may arrive wrapped in a
            tuple. Both arrays may contain ``-100`` as padding.

    Returns:
        Whatever ``compute_rouge`` returns for the decoded texts.
    """
    predictions, labels = eval_pred

    # Some model outputs come back as a tuple; the token ids are the first item
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id and cannot be decoded, so swap it for the
    # pad token in BOTH arrays (generated batches are padded with -100 too)
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE
    result = compute_rouge(
        predictions=decoded_preds,
        references=decoded_labels,
        use_korean_tokenizer=True
    )

    return result

print("✅ Metrics function defined")

In [None]:
# Stop training once the tracked metric has failed to improve for
# three consecutive evaluations
stop_early = EarlyStoppingCallback(early_stopping_patience=3)

# Assemble the trainer from the pieces prepared above
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[stop_early],
)

print("✅ Trainer created")

## 🚀 Training

In [None]:
# Run the full training loop and keep its result for reporting
print("🚀 Starting training...")
train_result = trainer.train()

print("✅ Training complete!")
final_train_loss = train_result.metrics['train_loss']
print(f"📊 Final train loss: {final_train_loss:.4f}")

## 📊 Evaluation

In [None]:
# Score the model on the validation split
print("📊 Evaluating on validation set...")
eval_results = trainer.evaluate()

print("\n✅ Evaluation Results:")
for metric_name, metric_value in eval_results.items():
    print(f"  {metric_name}: {metric_value:.4f}")

# Combined score: ROUGE-1 + ROUGE-2 + ROUGE-L (reused by the git backup step)
rouge_sum = sum(
    eval_results[metric_key]
    for metric_key in ('eval_rouge1', 'eval_rouge2', 'eval_rougeL')
)
print(f"\n📈 Total ROUGE Score: {rouge_sum:.2f}")

In [None]:
# Sample predictions: print a few validation dialogues with their reference
# and generated summaries as a qualitative sanity check
print("\n📝 Sample Predictions:")

# Sampling without replacement fails when asked for more rows than exist,
# so cap the request at the size of the validation split
num_samples = min(3, len(val_data))
sample_indices = np.random.choice(len(val_data), num_samples, replace=False)

# Generation should run with dropout disabled
model.eval()

for idx in sample_indices:
    sample = val_data.iloc[idx]

    # Tokenize input
    inputs = tokenizer(
        sample['dialogue'],
        return_tensors="pt",
        max_length=CONFIG["max_input_length"],
        truncation=True
    ).to(device)

    # Generate. Only the ids and attention mask are forwarded: generate()
    # validates its keyword arguments, so splatting every tokenizer output
    # (e.g. token_type_ids) can be rejected as an unused model argument.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=CONFIG["max_target_length"],
        num_beams=CONFIG["num_beams"],
        length_penalty=CONFIG["length_penalty"],
        no_repeat_ngram_size=CONFIG["no_repeat_ngram_size"],
        early_stopping=True
    )

    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\n--- Sample {idx} ---")
    print(f"Dialogue: {sample['dialogue'][:100]}...")
    print(f"Reference: {sample['summary']}")
    print(f"Prediction: {prediction}")

## 💾 Save Model

In [None]:
# Write the model weights and the tokenizer side by side in one folder
final_model_path = os.path.join(CONFIG["checkpoint_dir"], "final_model")
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(final_model_path)

print(f"✅ Model saved to {final_model_path}")

## 🎯 Test Predictions & Submission

In [None]:
# Generate predictions for test set, one dialogue at a time, in file order
print("🎯 Generating test predictions...")

test_predictions = []
model.eval()  # disable dropout for inference

with torch.no_grad():
    # Iterate over the dialogue column directly instead of indexing by position
    for dialogue in tqdm(test_df['dialogue'], total=len(test_df)):
        # Tokenize
        inputs = tokenizer(
            dialogue,
            return_tensors="pt",
            max_length=CONFIG["max_input_length"],
            truncation=True
        ).to(device)

        # Generate. Only the ids and attention mask are forwarded: generate()
        # validates its keyword arguments, so splatting every tokenizer output
        # (e.g. token_type_ids) can be rejected as an unused model argument.
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=CONFIG["max_target_length"],
            num_beams=CONFIG["num_beams"],
            length_penalty=CONFIG["length_penalty"],
            no_repeat_ngram_size=CONFIG["no_repeat_ngram_size"],
            early_stopping=True
        )

        # Decode the top beam back to text
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        test_predictions.append(prediction)

print(f"✅ Generated {len(test_predictions)} predictions")

In [None]:
# Pair every test file name with its generated summary
submission = test_df[['fname']].assign(summary=test_predictions)

# Write the CSV without the index column
submission_path = os.path.join(CONFIG["submission_dir"], "submission_baseline.csv")
submission.to_csv(submission_path, index=False)

print(f"✅ Submission file created: {submission_path}")
print(f"📊 Shape: {submission.shape}")
display(submission.head())

## 🔄 Git Auto-Backup

In [None]:
# Record this experiment through the project's git backup helper.
# Only the part of the model id after the last "/" is passed as the model name.
backup_success = auto_git_backup(
    exp_num=CONFIG["exp_num"],
    model_name=CONFIG["model_name"].rsplit("/", 1)[-1],
    rouge_score=rouge_sum,
    config=CONFIG,
)

print(
    "✅ Git backup successful!"
    if backup_success
    else "⚠️ Git backup failed (see logs above)"
)

## 📋 Summary

**완료된 작업**:
- ✅ KoBART 모델 로딩 및 설정
- ✅ 데이터셋 준비 (Hugging Face Dataset)
- ✅ 모델 학습 (3 epochs)
- ✅ ROUGE 평가 완료
- ✅ 첫 제출 파일 생성
- ✅ Git 자동 백업

**다음 단계**: Stage 3 - Hyperparameter Optimization