# 🗨️ Dialogue Summarization - Training Demo

> baseline.ipynb 재현을 위한 데모 노트북

이 노트북은 모듈화된 코드를 사용하여 baseline.ipynb와 동일한 학습을 수행합니다.

## 1. 환경 설정

In [1]:
import sys
import os

# Absolute location of the project checkout; the `src.*` imports and the
# relative config path used later in this notebook are resolved against it.
project_root = '/Competition/NLP/dialogue-summarization'

# Put the project root at the front of the import path exactly once, so
# re-running this cell does not add duplicate entries.
already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)

# Make the project root the working directory and report where we ended up.
os.chdir(project_root)
current_dir = os.getcwd()
print(f"Working directory: {current_dir}")

Working directory: /Competition/NLP/dialogue-summarization


In [2]:
# 필요한 라이브러리 확인
!pip list | grep -E "torch|transformers|rouge|pandas|pyyaml"

pandas                    2.1.4
pytorch-lightning         2.1.2
rouge                     1.0.1
rouge-score               0.1.2
torch                     2.7.1+cu118
torchaudio                2.7.1+cu118
torchelastic              0.2.2
torchmetrics              1.2.1
torchvision               0.22.1+cu118
transformers              4.56.2


## 2. 모듈 Import

In [3]:
import yaml
import torch
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

# 우리가 만든 모듈들
from src.data.preprocessor import Preprocess
from src.data.dataset import DatasetForTrain, DatasetForVal
from src.models.model_loader import load_tokenizer_and_model
from src.evaluation.metrics import compute_metrics_for_trainer
from src.utils.seed import set_seed

print("✅ All modules imported successfully!")

✅ All modules imported successfully!


## 3. 설정 로드

In [4]:
# Load the training configuration (path is relative to the project root,
# which an earlier cell made the working directory).
# The file is read explicitly as UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open('configs/train_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Print the key settings so the run can be sanity-checked before training.
print("📋 Configuration:")
print(f"  Model: {config['general']['model_name']}")
print(f"  Epochs: {config['training']['num_train_epochs']}")
print(f"  Batch size: {config['training']['per_device_train_batch_size']}")
print(f"  Learning rate: {config['training']['learning_rate']}")
print(f"  Output: {config['general']['output_dir']}")

📋 Configuration:
  Model: digit82/kobart-summarization
  Epochs: 20
  Batch size: 50
  Learning rate: 1e-05
  Output: /Competition/NLP/dialogue-summarization/checkpoints/baseline_run


In [None]:
# Fix the random seed using the value from the `training` config section.
seed_value = config['training']['seed']
set_seed(seed_value)
print(f"✅ Random seed set to {seed_value}")

## 4. 모델 & 토크나이저 로드

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"🖥️  Device: {device}")

# Values read from the config: the checkpoint name and the extra tokens
# handed to the loader.
model_name = config['general']['model_name']
special_tokens = config['tokenizer']['special_tokens']

# The project loader returns the pair in (model, tokenizer) order.
model, tokenizer = load_tokenizer_and_model(
    model_name=model_name,
    special_tokens=special_tokens,
    device=device,
)

print(f"✅ Model loaded: {model_name}")
print(f"   Vocab size: {len(tokenizer)}")
print(f"   Special tokens: {len(special_tokens)}")

## 5. 데이터 준비

In [None]:
# Build the preprocessor with the tokenizer's BOS/EOS tokens.
preprocessor = Preprocess(
    bos_token=tokenizer.bos_token,
    eos_token=tokenizer.eos_token,
)

# Locate the train / dev CSV files under the configured data directory.
data_path = config['general']['data_path']
train_csv = f"{data_path}/train.csv"
dev_csv = f"{data_path}/dev.csv"

# Both splits are loaded with is_train=True.
# NOTE(review): presumably this keeps the reference `summary` column, which
# later cells read from both frames — confirm in Preprocess.make_set_as_df.
train_data = preprocessor.make_set_as_df(train_csv, is_train=True)
val_data = preprocessor.make_set_as_df(dev_csv, is_train=True)

train_count = len(train_data)
val_count = len(val_data)
print(f"📊 Data loaded:")
print(f"   Train: {train_count} samples")
print(f"   Val: {val_count} samples")

In [None]:
# Peek at the first training row to confirm the data looks as expected.
print("\n🔍 Sample data:")
sample = train_data.iloc[0]

# Only the first 100 characters of the dialogue are shown to keep output short.
dialogue_preview = sample['dialogue'][:100]
reference_summary = sample['summary']
print(f"Dialogue: {dialogue_preview}...")
print(f"Summary: {reference_summary}")

In [None]:
# Turn each data frame into the three text sequences used for training:
# encoder input, decoder input and decoder output (tokenization happens in
# the following cells).
train_sequences = preprocessor.make_input(train_data, is_test=False)
encoder_input_train, decoder_input_train, decoder_output_train = train_sequences

val_sequences = preprocessor.make_input(val_data, is_test=False)
encoder_input_val, decoder_input_val, decoder_output_val = val_sequences

print("✅ Preprocessing completed")

In [None]:
# Tokenize the training sequences.
tokenizer_config = config['tokenizer']

# Options shared by all three tokenizer calls; only the text and the
# maximum length differ between them.
_train_tokenize_kwargs = {
    'return_tensors': "pt",
    'padding': True,
    'add_special_tokens': True,
    'truncation': True,
    'return_token_type_ids': False,
}

# Encoder side is truncated to the configured encoder length.
tokenized_encoder_inputs_train = tokenizer(
    encoder_input_train,
    max_length=tokenizer_config['encoder_max_len'],
    **_train_tokenize_kwargs,
)

# Decoder input and decoder output both use the configured decoder length.
tokenized_decoder_inputs_train = tokenizer(
    decoder_input_train,
    max_length=tokenizer_config['decoder_max_len'],
    **_train_tokenize_kwargs,
)

tokenized_decoder_outputs_train = tokenizer(
    decoder_output_train,
    max_length=tokenizer_config['decoder_max_len'],
    **_train_tokenize_kwargs,
)

encoder_shape = tokenized_encoder_inputs_train['input_ids'].shape
decoder_shape = tokenized_decoder_inputs_train['input_ids'].shape
print(f"✅ Tokenization completed")
print(f"   Encoder shape: {encoder_shape}")
print(f"   Decoder shape: {decoder_shape}")

In [None]:
# Tokenize the validation sequences with the same options as the training set.
_val_tokenize_kwargs = {
    'return_tensors': "pt",
    'padding': True,
    'add_special_tokens': True,
    'truncation': True,
    'return_token_type_ids': False,
}

# Encoder side is truncated to the configured encoder length.
tokenized_encoder_inputs_val = tokenizer(
    encoder_input_val,
    max_length=tokenizer_config['encoder_max_len'],
    **_val_tokenize_kwargs,
)

# Decoder input and decoder output both use the configured decoder length.
tokenized_decoder_inputs_val = tokenizer(
    decoder_input_val,
    max_length=tokenizer_config['decoder_max_len'],
    **_val_tokenize_kwargs,
)

tokenized_decoder_outputs_val = tokenizer(
    decoder_output_val,
    max_length=tokenizer_config['decoder_max_len'],
    **_val_tokenize_kwargs,
)

print("✅ Validation data tokenized")

In [None]:
# Wrap the tokenized tensors in the project's dataset classes. Each one
# receives the encoder inputs, decoder inputs, decoder outputs and the number
# of rows in the source data frame, in that order.
train_parts = (
    tokenized_encoder_inputs_train,
    tokenized_decoder_inputs_train,
    tokenized_decoder_outputs_train,
)
train_dataset = DatasetForTrain(*train_parts, len(train_data))

val_parts = (
    tokenized_encoder_inputs_val,
    tokenized_decoder_inputs_val,
    tokenized_decoder_outputs_val,
)
val_dataset = DatasetForVal(*val_parts, len(val_data))

print(f"✅ Datasets created")
print(f"   Train dataset: {len(train_dataset)} samples")
print(f"   Val dataset: {len(val_dataset)} samples")

## 6. 학습 설정

In [None]:
# Training arguments.
# Every key listed here is copied unchanged from the `training` section of
# the config into the Seq2SeqTrainingArguments keyword of the same name.
training_cfg = config['training']
passthrough_keys = (
    'num_train_epochs',
    'per_device_train_batch_size',
    'per_device_eval_batch_size',
    'learning_rate',
    'warmup_ratio',
    'lr_scheduler_type',
    'optim',
    'eval_strategy',
    'save_strategy',
    'save_total_limit',
    'load_best_model_at_end',
    'seed',
    'logging_dir',
)
passthrough_args = {key: training_cfg[key] for key in passthrough_keys}

training_args = Seq2SeqTrainingArguments(
    output_dir=config['general']['output_dir'],
    overwrite_output_dir=True,
    # Evaluate with generated sequences, capped at the decoder length.
    predict_with_generate=True,
    generation_max_length=tokenizer_config['decoder_max_len'],
    # Mixed precision is enabled only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    **passthrough_args,
)

print("✅ Training arguments configured")

In [None]:
# Batch collator for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Tokens handed to the project's metric factory as `remove_tokens`.
metric_remove_tokens = [
    '<usr>',
    tokenizer.bos_token,
    tokenizer.eos_token,
    tokenizer.pad_token,
]

# The factory's return value is later passed to the trainer as `compute_metrics`.
compute_metrics = compute_metrics_for_trainer(
    tokenizer=tokenizer,
    remove_tokens=metric_remove_tokens,
)

print("✅ Data collator and metrics configured")

In [None]:
# Assemble the trainer from the pieces prepared in the previous cells.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': val_dataset,
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

print("✅ Trainer initialized")

## 7. 학습 실행

⚠️ **주의**: 전체 학습은 시간이 오래 걸립니다 (수 시간 ~ 하루).

테스트용으로는 `configs/train_config.yaml`의 `training.num_train_epochs`를 1로 줄인 뒤, "3. 설정 로드" 셀부터 다시 실행하세요.

In [None]:
# Run training and keep the result object for the summary below.
print("🚀 Training started...")
train_result = trainer.train()

print("\n✅ Training completed!")

# Report the final training loss and the wall-clock runtime from the metrics.
final_loss = train_result.training_loss
print(f"   Train loss: {final_loss:.4f}")
runtime_seconds = train_result.metrics['train_runtime']
print(f"   Train time: {runtime_seconds:.2f}s")

## 8. 평가

In [None]:
# Evaluate on the validation set and print the ROUGE scores.
print("📊 Evaluating on validation set...")
eval_result = trainer.evaluate()

print("\n✅ Evaluation completed!")

# Display label -> key in the evaluation result dictionary.
rouge_rows = (
    ("ROUGE-1", 'eval_rouge-1'),
    ("ROUGE-2", 'eval_rouge-2'),
    ("ROUGE-L", 'eval_rouge-l'),
)
for label, metric_key in rouge_rows:
    print(f"   {label}: {eval_result[metric_key]:.2f}")

# Sum of the three ROUGE scores, added in the same order as listed above.
rouge_sum = eval_result['eval_rouge-1'] + eval_result['eval_rouge-2'] + eval_result['eval_rouge-l']
print(f"   ROUGE Sum: {rouge_sum:.2f}")

## 9. 모델 저장

In [None]:
# Save the final model and its tokenizer side by side in a `final_model`
# directory under the configured output directory.
output_dir = config['general']['output_dir']
final_model_path = f"{output_dir}/final_model"

trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"✅ Model saved to: {final_model_path}")

## 10. 샘플 예측 확인

In [None]:
# Generate summaries for a few validation samples to eyeball output quality.
import random
random.seed(42)  # fixed seed so the same samples are shown on every run

# Pick up to three distinct row positions from the validation frame.
sample_indices = random.sample(range(len(val_data)), min(3, len(val_data)))

# generate() does not switch the module mode itself. Put the model in eval
# mode explicitly so dropout is off even when the evaluation cell was skipped.
model.eval()

# Same token list that is passed as `remove_tokens` when the metrics are
# configured; unset tokenizer attributes (None) are dropped.
# NOTE(review): decoding below keeps special tokens and strips only these, so
# that dialogue markers from config['tokenizer']['special_tokens'] stay visible
# in the prediction — this assumes load_tokenizer_and_model registers them as
# special tokens; confirm.
display_remove_tokens = [
    token
    for token in ('<usr>', tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)
    if token
]

for idx in sample_indices:
    sample = val_data.iloc[idx]

    # Tokenize the raw dialogue, truncated to the encoder length, and move
    # every returned tensor to the model's device.
    encoded = tokenizer(
        sample['dialogue'],
        return_tensors='pt',
        max_length=tokenizer_config['encoder_max_len'],
        truncation=True,
        return_token_type_ids=False
    ).to(device)

    # Beam-search generation; the attention mask is passed explicitly instead
    # of leaving generate() to infer it from the input ids.
    output_ids = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=tokenizer_config['decoder_max_len'],
        num_beams=5,
        early_stopping=True
    )

    # Decode without dropping special tokens, then remove only the listed
    # tokens and collapse the whitespace left behind.
    predicted_summary = tokenizer.decode(output_ids[0], skip_special_tokens=False)
    for token in display_remove_tokens:
        predicted_summary = predicted_summary.replace(token, ' ')
    predicted_summary = ' '.join(predicted_summary.split())

    print(f"\n{'='*80}")
    print(f"Sample {idx + 1}")
    print(f"{'='*80}")
    print(f"\n[Dialogue]\n{sample['dialogue'][:200]}...")
    print(f"\n[Ground Truth]\n{sample['summary']}")
    print(f"\n[Predicted]\n{predicted_summary}")

## 완료!

### 다음 단계:
1. **추론**: `generate_predictions.py` 실행하여 test 셋 예측
2. **제출**: 생성된 CSV 파일을 경진대회 플랫폼에 제출
3. **개선**: 하이퍼파라미터 튜닝, 다른 모델 시도