# 🔥 Full Pipeline - 모든 기법 통합
> PRD 계획에 따른 전체 파이프라인 통합 실행

**목표 성능**: ROUGE-F1 85+

In [1]:
# 환경 설정
import sys
import os
from pathlib import Path

# Resolve the repository root: exactly three `.parent` hops above the
# notebook's working directory.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent.parent.parent

# Drop any sys.path entry that mentions the other (computer-vision) project,
# then put this project's root at the front of the import search path.
sys.path = list(filter(lambda entry: 'computer-vision-competition' not in entry, sys.path))
if sys.path.count(str(project_root)) == 0:
    sys.path.insert(0, str(project_root))

print(f"Project Root: {project_root}")
print(f"Current Dir: {notebook_dir}")

# 필요한 라이브러리 임포트
import yaml
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from typing import List, Dict
import matplotlib.pyplot as plt
import optuna
import wandb

# 커스텀 모듈 임포트 - 04_multi_model_ensemble.ipynb에서 참고
from src.logging.notebook_logger import NotebookLogger
from src.utils.gpu_optimization.team_gpu_check import check_gpu_tier
from src.utils.visualizations.training_viz import TrainingVisualizer

print("Libraries imported successfully!")

Project Root: /home/ieyeppo/AI_Lab/natural-language-processing-competition
Current Dir: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH
✅ 나눔고딕 폰트 로드 성공
Libraries imported successfully!


In [2]:
# Load the full-pipeline YAML configuration stored under ./configs.
config_path = notebook_dir.joinpath('configs', 'config_full_pipeline.yaml')

with config_path.open('r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Print a banner, the number of configured stages, then one line per stage.
print("=" * 50, "FULL PIPELINE CONFIGURATION", "=" * 50, sep="\n")
print(f"Pipeline Stages: {len(config['pipeline']['stages'])}")
for stage in config['pipeline']['stages']:
    print(f"  ✓ {stage}")

FULL PIPELINE CONFIGURATION
Pipeline Stages: 9
  ✓ data_quality_check
  ✓ data_preprocessing
  ✓ data_augmentation
  ✓ model_training
  ✓ cross_validation
  ✓ ensemble
  ✓ hyperparameter_optimization
  ✓ inference_optimization
  ✓ final_prediction


In [3]:
# Create the log directory named in the config (including missing parents).
log_dir = Path(config['paths']['log_dir'])
print(f"Log Directory: {log_dir}")
log_dir.mkdir(exist_ok=True, parents=True)

# Second-resolution timestamp, used in the log file name below.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# Initialise the notebook logger with a per-run log file.
log_file = log_dir.joinpath(f'full_pipeline_{timestamp}.log')
logger = NotebookLogger(log_path=str(log_file), print_also=True)

# Header recording when the run started and which config file it used.
logger.write('=' * 50)
logger.write('FULL PIPELINE EXECUTION STARTED')
logger.write(f'Timestamp: {timestamp}')
logger.write(f'Config: {config_path}')
logger.write('=' * 50)

Log Directory: logs/full_pipeline
FULL PIPELINE EXECUTION STARTED
Timestamp: 20251010_092701
Config: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH/configs/config_full_pipeline.yaml


In [4]:
# GPU optimisation check.
# Import the project's GPU helper if an earlier cell has not already done so;
# when it cannot be imported, install a stub so the rest of the cell still runs.
if 'check_gpu_tier' not in globals():
    try:
        from src.utils.gpu_optimization.team_gpu_check import check_gpu_tier
    except ImportError:
        print("Warning: Could not import check_gpu_tier")

        def check_gpu_tier():
            """Fallback used when the project GPU helper is unavailable."""
            return "UNKNOWN"

# The config dictionary is created by the config-loading cell (cell 2).
if 'config' not in globals():
    print("Warning: config not loaded. Please run cell 2 first.")
elif config['gpu']['auto_optimization']['enabled']:
    gpu_tier = check_gpu_tier()

    # Use one output sink for every message: the notebook logger when it
    # exists, plain print otherwise. Previously the print fallback skipped
    # the batch-size message, so the two paths reported different things.
    _emit = logger.write if 'logger' in globals() else print

    _emit(f"GPU Tier: {gpu_tier}")
    _emit("Auto-optimization enabled")

    # .get() keeps the print fallback from failing on a missing optional key
    # that it never used to read.
    if config['gpu']['auto_optimization'].get('find_optimal_batch_size'):
        _emit("Finding optimal batch size...")
        # TODO: the optimal batch-size search itself is not implemented here;
        # only the message above is emitted.

GPU Tier: LOW
Auto-optimization enabled
Finding optimal batch size...


In [5]:
# Show the performance targets defined in the config.
print(f"\n{'=' * 50}")
print("PERFORMANCE TARGETS")
print("=" * 50)
_targets = config['performance_targets']
print(f"ROUGE-1: {_targets['rouge_1']}")
print(f"ROUGE-2: {_targets['rouge_2']}")
print(f"ROUGE-L: {_targets['rouge_l']}")
print(f"Overall Target: {_targets['overall']}")
print("=" * 50)


PERFORMANCE TARGETS
ROUGE-1: 0.45
ROUGE-2: 0.3
ROUGE-L: 0.4
Overall Target: 0.85


In [6]:
# Pipeline execution status tracking.
# Requires the config dictionary created by the config-loading cell (cell 2).
if 'config' in globals():
    # Every configured stage starts out as 'pending'.
    pipeline_status = dict.fromkeys(config['pipeline']['stages'], 'pending')

    def update_status(stage, status):
        """Record `status` for `stage` and report the change.

        The message goes to the notebook logger when one exists in the
        notebook namespace, otherwise to stdout.
        """
        pipeline_status[stage] = status
        report = logger.write if 'logger' in globals() else print
        report(f"[{stage}] Status: {status}")

    # Show the initial status table.
    for stage, status in pipeline_status.items():
        print(f"{stage:30s}: {status}")
else:
    print("Error: config not loaded. Please run cell 2 first.")

data_quality_check            : pending
data_preprocessing            : pending
data_augmentation             : pending
model_training                : pending
cross_validation              : pending
ensemble                      : pending
hyperparameter_optimization   : pending
inference_optimization        : pending
final_prediction              : pending


In [7]:
# Stage 1: data quality check on the training split.
if 'data_quality_check' in config['pipeline']['stages']:
    update_status('data_quality_check', 'running')
    logger.write("\n=== Data Quality Check ===")

    # Read the training CSV from the path given in the config.
    train_df = pd.read_csv(config['paths']['train_file'])
    logger.write(f"Loaded {train_df.shape[0]} training samples")

    # Structural check: count missing cells across the whole frame, but only
    # when quality checks and the null check are both switched on.
    _quality_cfg = config['data_quality']
    if _quality_cfg['enabled'] and _quality_cfg['checks']['structural']['check_nulls']:
        null_count = train_df.isna().sum().sum()
        logger.write(f"Null values: {null_count}")

    update_status('data_quality_check', 'completed')

[data_quality_check] Status: running

=== Data Quality Check ===
Loaded 12457 training samples
Null values: 0
[data_quality_check] Status: completed


In [8]:
# Stage 1: data quality check and loading of the train/dev/test splits.
# Imports are repeated so this cell can run without the setup cell.
import pandas as pd
from pathlib import Path

# Fall back to the current working directory when the setup cell was skipped.
if 'notebook_dir' not in globals():
    notebook_dir = Path.cwd()

# The cell needs `config` (cell 2) and `update_status` (cell 6).
if 'config' not in globals():
    print("Error: config not loaded. Please run cell 2 first.")
elif 'update_status' not in globals():
    print("Error: update_status function not defined. Please run cell 6 first.")
elif 'data_quality_check' in config['pipeline']['stages']:

    def _emit(message):
        """Send `message` to the notebook logger if present, else print it."""
        if 'logger' in globals():
            logger.write(message)
        else:
            print(message)

    def get_data_path(path_str):
        """Return `path_str` as a Path, anchoring relative paths at notebook_dir."""
        candidate = Path(path_str)
        return candidate if candidate.is_absolute() else notebook_dir / candidate

    update_status('data_quality_check', 'running')
    _emit("\n=== Data Quality Check ===")

    # Resolve the three dataset locations listed in the config.
    train_path = get_data_path(config['paths']['train_file'])
    dev_path = get_data_path(config['paths']['dev_file'])
    test_path = get_data_path(config['paths']['test_file'])
    _emit(f"Loading data from config paths:\n  - Train: {train_path}\n  - Dev: {dev_path}\n  - Test: {test_path}")

    # Load each split into its own DataFrame.
    train_df = pd.read_csv(train_path)
    dev_df = pd.read_csv(dev_path)
    test_df = pd.read_csv(test_path)
    _emit(f"Loaded {len(train_df)} training samples\nLoaded {len(dev_df)} dev samples\nLoaded {len(test_df)} test samples")

    # Structural checks run on the training split only.
    if config['data_quality']['enabled']:
        _structural = config['data_quality']['checks']['structural']

        if _structural['check_nulls']:
            null_count = train_df.isna().sum().sum()
            _emit(f"Null values: {null_count}")

        if _structural['check_duplicates']:
            dup_count = train_df.duplicated().sum()
            _emit(f"Duplicate rows: {dup_count}")

    update_status('data_quality_check', 'completed')

[data_quality_check] Status: running

=== Data Quality Check ===
Loading data from config paths:
  - Train: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH/../../../data/raw/train.csv
  - Dev: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH/../../../data/raw/dev.csv
  - Test: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH/../../../data/raw/test.csv
Loaded 12457 training samples
Loaded 499 dev samples
Loaded 499 test samples
Null values: 0
Duplicate rows: 0
[data_quality_check] Status: completed


In [9]:
# Visualization setup: create the visualizer and its output directory when
# visualization is switched on in the config.
_viz_cfg = config['visualization']
if _viz_cfg['enabled']:
    viz = TrainingVisualizer()
    viz_dir = Path(_viz_cfg['save_path'])
    viz_dir.mkdir(exist_ok=True, parents=True)
    logger.write("Visualizations will be saved to: " + f"{viz_dir}")

Visualizations will be saved to: ../logs/full_pipeline/visualizations


In [10]:
# Initialise Weights & Biases tracking for the whole pipeline, unless the
# config sets wandb.mode to 'disabled'.
_wandb_mode = config['wandb']['mode']
if _wandb_mode != 'disabled':
    wandb.init(
        project=config['wandb']['project'],
        entity=config['wandb']['entity'],
        name=config['wandb']['name'],
        tags=config['wandb']['tags'],
        # Forward the configured mode so an 'offline' setting is honoured;
        # before, the value was only compared against 'disabled' and then
        # dropped. Unrecognised values fall back to wandb's own default so
        # configs that worked before keep working.
        mode=_wandb_mode if _wandb_mode in ('online', 'offline') else None,
        config=config,
    )
    logger.write("WandB initialized for full pipeline tracking")

[34m[1mwandb[0m: Currently logged in as: [33mieyeppo-job[0m ([33mkimsunmin0227-hufs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


WandB initialized for full pipeline tracking


## 전체 파이프라인 실행 코드는 config 파일 설정에 따라 구현

### 실행 단계:
1. 데이터 품질 검증
2. 데이터 전처리 및 증강
3. 모델 학습 (Multi-model)
4. K-Fold 교차 검증
5. Optuna 최적화
6. 앙상블 + TTA
7. 추론 최적화
8. 최종 예측 및 제출