# 📊 K-Fold 교차 검증 - 5-Fold Cross Validation
> PRD 계획에 따른 K-Fold 교차 검증으로 모델 안정성 평가

**목표 성능**: ROUGE-F1 72-75

In [1]:
# 환경 설정
import sys
import os
from pathlib import Path

# Locate the project root: it sits three directory levels above the notebook's
# working directory.
notebook_dir = Path.cwd()
project_root = notebook_dir
for _ in range(3):
    project_root = project_root.parent

# Drop sys.path entries that mention the other (computer-vision) project, then
# put this project's root at the front unless it is already listed.
sys.path = list(filter(lambda entry: 'computer-vision-competition' not in entry, sys.path))
if sys.path.count(str(project_root)) == 0:
    sys.path.insert(0, str(project_root))

print(f"Project Root: {project_root}")
print(f"Current Dir: {notebook_dir}")

# 필요한 라이브러리 임포트
import yaml
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import torch
from datetime import datetime
import wandb

# 커스텀 모듈 임포트
from src.logging.notebook_logger import NotebookLogger
from src.utils.gpu_optimization.team_gpu_check import check_gpu_tier

print("Libraries imported successfully!")

Project Root: /home/ieyeppo/AI_Lab/natural-language-processing-competition
Current Dir: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH
Libraries imported successfully!


In [2]:
# Load the K-Fold experiment configuration (YAML) from <notebook_dir>/configs.
config_path = notebook_dir / 'configs' / 'config_kfold.yaml'

with config_path.open('r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Echo the settings that define this experiment.
for label, section, key in (
    ("K-Fold Splits", 'kfold', 'n_splits'),
    ("Model", 'model', 'name'),
    ("Ensemble Method", 'kfold', 'ensemble_method'),
    ("Save Each Fold", 'kfold', 'save_each_fold'),
):
    print(f"{label}: {config[section][key]}")

K-Fold Splits: 5
Model: upstage/SOLAR-10.7B-Instruct-v1.0
Ensemble Method: weighted_average
Save Each Fold: True


In [3]:
# 로그 디렉토리 생성
# config의 로그 경로 사용
def get_path(path_str):
    """Turn a path from the config into a usable Path.

    Absolute paths are returned unchanged; relative paths are anchored at
    the module-level ``notebook_dir``.
    """
    candidate = Path(path_str)
    return candidate if candidate.is_absolute() else notebook_dir / candidate

# Use `paths.log_dir` from the config when it is defined; otherwise fall back
# to <notebook_dir>/logs/kfold.
log_dir = (
    get_path(config['paths']['log_dir'])
    if 'log_dir' in config['paths']
    else notebook_dir / 'logs' / 'kfold'
)
log_dir.mkdir(parents=True, exist_ok=True)

# Timestamp that makes the log file name unique per run.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Create the experiment logger.
log_file = log_dir / f'kfold_{config["kfold"]["n_splits"]}fold_{timestamp}.log'
logger = NotebookLogger(log_path=str(log_file), print_also=True)

# Write the experiment banner.
for banner_line in (
    '='*50,
    'K-Fold Cross Validation Experiment',
    f'Timestamp: {timestamp}',
    f'Folds: {config["kfold"]["n_splits"]}',
    '='*50,
):
    logger.write(banner_line)

K-Fold Cross Validation Experiment
Timestamp: 20251010_090402
Folds: 5


In [4]:
# Build the splitter from the `kfold` section of the config.
kfold = KFold(
    **{option: config['kfold'][option] for option in ('n_splits', 'shuffle', 'random_state')}
)

# Record the splitter settings in the experiment log.
logger.write("KFold configured:")
for label, option in (("Splits", 'n_splits'), ("Shuffle", 'shuffle'), ("Random State", 'random_state')):
    logger.write(f"  - {label}: {config['kfold'][option]}")

KFold configured:
  - Splits: 5
  - Shuffle: True
  - Random State: 42


In [5]:
# 데이터 로드
# config 파일의 경로 사용
def get_data_path(path_str):
    """Resolve a data path taken from the config.

    An absolute path is handed back as-is; anything else is interpreted
    relative to the module-level ``notebook_dir``.
    """
    resolved = Path(path_str)
    if resolved.is_absolute():
        return resolved
    return notebook_dir / resolved

# Resolve the training CSV location from the config.
train_path = get_data_path(config['paths']['train_file'])

logger.write("Loading data from config path:")
logger.write(f"  - Train: {train_path}")

# Load the training data.
train_df = pd.read_csv(train_path)

logger.write(f"\nData loaded: {len(train_df)} samples")

# Report the train/validation size of every fold. The logger is created with
# print_also=True, so an additional print() here would emit each line twice.
for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(train_df), 1):
    logger.write(f"Fold {fold_idx}: Train={len(train_idx)}, Val={len(val_idx)}")

Loading data from config path:
  - Train: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH/../../../data/raw/train.csv

Data loaded: 12457 samples
Fold 1: Train=9965, Val=2492
Fold 1: Train=9965, Val=2492
Fold 2: Train=9965, Val=2492
Fold 2: Train=9965, Val=2492
Fold 3: Train=9966, Val=2491
Fold 3: Train=9966, Val=2491
Fold 4: Train=9966, Val=2491
Fold 4: Train=9966, Val=2491
Fold 5: Train=9966, Val=2491
Fold 5: Train=9966, Val=2491


In [None]:
# 데이터 품질 검증 시스템 (PRD 16_데이터_품질_검증_시스템.md)
import numpy as np
from typing import Dict, List

class KFoldDataValidator:
    """Data-quality checks for a K-Fold split.

    validate_fold_distribution() compares the train and validation parts of
    every fold (topic mix, dialogue length, index overlap) and stores the
    result in ``validation_results``; recommend_stratification() turns that
    result into human-readable advice.
    """

    def __init__(self):
        # A dict (not a list) so recommend_stratification() can safely use
        # ``in`` / ``.get`` even before any validation has run.
        self.validation_results = {}

    def validate_fold_distribution(self, train_df: pd.DataFrame, kfold) -> Dict:
        """Validate how evenly the folds split the data.

        Args:
            train_df: Full training frame. Must have a ``dialogue`` column;
                a ``topic`` column is analysed when present.
            kfold: Any object whose ``split(train_df)`` yields
                ``(train_idx, val_idx)`` pairs of positional indices.

        Returns:
            Dict with ``length_distributions`` and ``data_leakage`` and,
            when a ``topic`` column exists, ``topic_distributions``.
        """
        results = {}

        # Materialise the splits once so that every check below looks at the
        # same folds (a splitter that shuffles without a fixed seed would
        # otherwise produce different folds on each split() call).
        splits = list(kfold.split(train_df))

        # Topic distribution per fold.
        if 'topic' in train_df.columns:
            fold_topic_distributions = []
            for fold_no, (train_idx, val_idx) in enumerate(splits, 1):
                train_topics = train_df.iloc[train_idx]['topic'].value_counts(normalize=True)
                val_topics = train_df.iloc[val_idx]['topic'].value_counts(normalize=True)

                # L1 distance between the two topic distributions. Topics
                # missing from one side count with proportion 0 there, so a
                # topic that appears only in train (or only in validation)
                # contributes its full proportion to the difference.
                topic_diff = float(train_topics.sub(val_topics, fill_value=0).abs().sum())

                fold_topic_distributions.append({
                    'fold': fold_no,
                    'distribution_diff': topic_diff,
                    'train_unique_topics': len(train_topics),
                    'val_unique_topics': len(val_topics)
                })

            results['topic_distributions'] = fold_topic_distributions

        # Dialogue length (in characters) per fold.
        dialogue_lengths = train_df['dialogue'].str.len()

        fold_length_stats = []
        for fold_no, (train_idx, val_idx) in enumerate(splits, 1):
            train_mean = dialogue_lengths.iloc[train_idx].mean()
            val_mean = dialogue_lengths.iloc[val_idx].mean()

            fold_length_stats.append({
                'fold': fold_no,
                'train_mean_length': train_mean,
                'val_mean_length': val_mean,
                'length_diff': abs(train_mean - val_mean)
            })

        results['length_distributions'] = fold_length_stats

        # Data-leakage check: an index must never be in both train and
        # validation. The flag stays True once ANY fold overlaps, instead of
        # reflecting only the last fold inspected.
        leakage_found = False
        for fold_no, (train_idx, val_idx) in enumerate(splits, 1):
            overlap = set(train_idx).intersection(val_idx)
            if overlap:
                logger.write(f"⚠️ WARNING: Data leakage detected in fold {fold_no}: {len(overlap)} overlapping indices")
                leakage_found = True
        results['data_leakage'] = leakage_found

        self.validation_results = results
        return results

    def recommend_stratification(self) -> List[str]:
        """Return stratification advice based on the last validation run.

        Returns an empty list when nothing was validated yet or no threshold
        is exceeded.
        """
        recommendations = []

        topic_stats = self.validation_results.get('topic_distributions')
        if topic_stats is not None:
            max_diff = max((f['distribution_diff'] for f in topic_stats), default=0)
            if max_diff > 0.2:
                recommendations.append("Consider stratified K-Fold based on topic distribution")

        length_stats = self.validation_results.get('length_distributions')
        if length_stats is not None:
            max_length_diff = max((f['length_diff'] for f in length_stats), default=0)
            if max_length_diff > 500:
                recommendations.append("Consider stratification based on text length")

        if self.validation_results.get('data_leakage', False):
            recommendations.append("CRITICAL: Fix data leakage issue immediately")

        return recommendations

# Run the K-Fold data validation and log a short report.
kfold_validator = KFoldDataValidator()
validation_results = kfold_validator.validate_fold_distribution(train_df, kfold)

logger.write("\n=== K-Fold Data Validation ===")

topic_stats = validation_results.get('topic_distributions')
if topic_stats is not None:
    logger.write("\nTopic Distribution Analysis:")
    # Only the first two folds are reported to keep the log short.
    for fold_stat in topic_stats[:2]:
        logger.write(f"  Fold {fold_stat['fold']}: Distribution diff={fold_stat['distribution_diff']:.3f}")

length_stats = validation_results.get('length_distributions')
if length_stats is not None:
    logger.write("\nText Length Distribution:")
    # Only the first two folds are reported to keep the log short.
    for fold_stat in length_stats[:2]:
        logger.write(f"  Fold {fold_stat['fold']}: Mean length diff={fold_stat['length_diff']:.1f}")

# Log any stratification advice derived from the validation results.
recommendations = kfold_validator.recommend_stratification()
if len(recommendations) > 0:
    logger.write("\n📋 Stratification Recommendations:")
    for rec in recommendations:
        logger.write(f"  • {rec}")

In [None]:
# Solar API 교차 검증 시스템 (PRD 09_Solar_API_최적화.md, 10_교차_검증_시스템.md)
import requests
import json
from typing import Optional, Dict

class KFoldSolarValidator:
    """Compare per-fold model summaries with summaries from the Solar chat API.

    Every validate_fold_predictions() call appends one result dict to
    ``fold_comparisons``; get_fold_consensus() aggregates them.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.upstage.ai/v1/solar"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        # One entry per validate_fold_predictions() call.
        self.fold_comparisons = []

    def validate_fold_predictions(self, fold_idx: int, model_predictions: List[str],
                                 test_dialogues: List[str], sample_size: int = 5) -> Dict:
        """Compare a random sample of model summaries with Solar API summaries.

        Args:
            fold_idx: Fold number stored in the result.
            model_predictions: Summaries produced by the model.
            test_dialogues: Source dialogues, aligned with ``model_predictions``.
            sample_size: Maximum number of pairs to send to the API.

        Returns:
            Dict with ``fold``, ``comparisons`` (one entry per successful API
            call) and the average summary lengths. Both averages are 0.0 when
            no comparison could be made.
        """
        comparisons = []

        # Sample only positions that exist in BOTH lists, and skip sampling
        # entirely when there is nothing to draw from.
        n_available = min(len(model_predictions), len(test_dialogues))
        n_samples = max(0, min(sample_size, n_available))
        if n_samples:
            sample_indices = np.random.choice(n_available, n_samples, replace=False)
        else:
            sample_indices = []

        for idx in sample_indices:
            dialogue = test_dialogues[idx]
            model_pred = model_predictions[idx]

            # Summary from the Solar API (None/empty on failure).
            api_pred = self.generate_with_solar(dialogue)

            if api_pred:
                comparisons.append({
                    'model_length': len(model_pred),
                    'api_length': len(api_pred),
                    'model_summary': model_pred[:100],
                    'api_summary': api_pred[:100]
                })

        # np.mean([]) would yield NaN (plus a RuntimeWarning) when every API
        # call failed, so fall back to 0.0 explicitly.
        if comparisons:
            avg_model_length = np.mean([c['model_length'] for c in comparisons])
            avg_api_length = np.mean([c['api_length'] for c in comparisons])
        else:
            avg_model_length = 0.0
            avg_api_length = 0.0

        result = {
            'fold': fold_idx,
            'comparisons': comparisons,
            'avg_model_length': avg_model_length,
            'avg_api_length': avg_api_length
        }

        self.fold_comparisons.append(result)
        return result

    def generate_with_solar(self, dialogue: str, max_tokens: int = 150) -> Optional[str]:
        """Request a summary of ``dialogue`` from the Solar API.

        Returns the generated text, or None when the request fails or the API
        answers with a non-200 status (the failure is written to ``logger``).
        """
        try:
            # Cap the input at 2000 characters to save tokens.
            if len(dialogue) > 2000:
                dialogue = dialogue[:2000] + "..."

            prompt = f"""다음 대화를 3-5문장으로 간결하게 요약하세요:

{dialogue}

요약:"""

            payload = {
                "model": "solar-1-mini-chat",
                "messages": [
                    {"role": "system", "content": "당신은 전문적인 대화 요약 AI입니다."},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": max_tokens,
                "temperature": 0.3,
                "top_p": 0.9
            }

            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=self.headers,
                json=payload,
                timeout=30
            )

            if response.status_code == 200:
                result = response.json()
                return result['choices'][0]['message']['content']

            logger.write(f"Solar API error: {response.status_code}")
            return None

        except Exception as e:
            # Deliberately broad: this comparison is best-effort and must
            # never abort a training run.
            logger.write(f"Solar API exception: {e}")
            return None

    def get_fold_consensus(self) -> Dict:
        """Summarise how consistent summary lengths are across validations.

        Entries without any successful comparison are ignored so their
        placeholder averages cannot skew the statistics. Returns an empty
        dict when there is nothing to aggregate.
        """
        scored = [fold_comp for fold_comp in self.fold_comparisons if fold_comp['comparisons']]
        if not scored:
            return {}

        all_model_lengths = [fold_comp['avg_model_length'] for fold_comp in scored]
        all_api_lengths = [fold_comp['avg_api_length'] for fold_comp in scored]
        mean_api_length = np.mean(all_api_lengths)

        return {
            'model_length_consistency': np.std(all_model_lengths),
            'api_length_consistency': np.std(all_api_lengths),
            'avg_model_vs_api_ratio': np.mean(all_model_lengths) / mean_api_length if mean_api_length > 0 else 0
        }

# 리스크 관리 시스템 (PRD 05_리스크_관리.md)
class KFoldRiskManager:
    """Track training risks across K-Fold runs.

    Every assess_fold_risk() call appends one summary to ``fold_risks``.
    Risks with severity ``'critical'`` are also collected in
    ``critical_risks`` and written to the module-level ``logger``.
    """

    def __init__(self):
        self.fold_risks = []
        self.critical_risks = []

    def assess_fold_risk(self, fold_idx: int, fold_result: Dict) -> Dict:
        """Assess the risks of one training result.

        Args:
            fold_idx: Fold number the result belongs to.
            fold_result: May contain ``best_rouge_l``, ``train_loss`` and
                ``val_loss``; checks whose keys are missing are skipped.

        Returns:
            Summary dict with ``fold``, ``risks``, ``risk_count`` and
            ``rouge_score`` (also appended to ``fold_risks``).
        """
        risks = []

        # Performance risks.
        if 'best_rouge_l' in fold_result:
            rouge_score = fold_result['best_rouge_l']

            if rouge_score < 0.3:
                risks.append({
                    'type': 'poor_performance',
                    'severity': 'high',
                    'fold': fold_idx,
                    'metric': f'ROUGE-L: {rouge_score:.4f}',
                    'mitigation': 'Check data quality, increase epochs, or adjust hyperparameters'
                })

            # Compare against the average of all previously recorded scores.
            if self.fold_risks:
                prev_scores = [f.get('rouge_score', 0) for f in self.fold_risks]
                avg_prev = np.mean(prev_scores)
                deviation = abs(rouge_score - avg_prev)
                if avg_prev > 0 and deviation > 0.1:
                    risks.append({
                        'type': 'inconsistent_performance',
                        'severity': 'medium',
                        'fold': fold_idx,
                        'metric': f'Deviation from avg: {deviation:.4f}',
                        'mitigation': 'Review fold data distribution and training process'
                    })

        # Training-stability risk: validation loss more than twice the train loss.
        if 'train_loss' in fold_result and 'val_loss' in fold_result:
            train_loss = fold_result['train_loss']
            val_loss = fold_result['val_loss']

            if val_loss > train_loss * 2:
                # A train loss of 0 would divide by zero; report the ratio as
                # infinite instead of crashing.
                loss_ratio = val_loss / train_loss if train_loss > 0 else float('inf')
                risks.append({
                    'type': 'severe_overfitting',
                    'severity': 'critical',
                    'fold': fold_idx,
                    'metric': f'Val/Train ratio: {loss_ratio:.2f}',
                    'mitigation': 'Apply regularization, reduce model complexity, or use early stopping'
                })

        # Memory risk: share of the device's total memory currently allocated.
        # (Dividing by max_memory_allocated() would compare against this
        # process's own peak, and divide by zero before any allocation.)
        if torch.cuda.is_available():
            device_index = torch.cuda.current_device()
            total_memory = torch.cuda.get_device_properties(device_index).total_memory
            memory_used = torch.cuda.memory_allocated(device_index) / total_memory
            if memory_used > 0.85:
                risks.append({
                    'type': 'memory_pressure',
                    'severity': 'medium',
                    'fold': fold_idx,
                    'metric': f'Memory usage: {memory_used:.1%}',
                    'mitigation': 'Reduce batch size or enable gradient accumulation'
                })

        # Record the assessment.
        fold_risk_summary = {
            'fold': fold_idx,
            'risks': risks,
            'risk_count': len(risks),
            'rouge_score': fold_result.get('best_rouge_l', 0)
        }

        self.fold_risks.append(fold_risk_summary)

        # Track and announce critical risks.
        for risk in risks:
            if risk['severity'] == 'critical':
                self.critical_risks.append(risk)
                logger.write(f"⚠️ CRITICAL RISK in Fold {fold_idx}: {risk['type']} - {risk['metric']}")

        return fold_risk_summary

    def get_overall_risk_assessment(self) -> Dict:
        """Aggregate all recorded assessments.

        Returns ``{'status': 'no_data'}`` when nothing was recorded. Note that
        ``avg_risks_per_fold`` averages over recorded assessments (one per
        assess_fold_risk() call).
        """
        if not self.fold_risks:
            return {'status': 'no_data'}

        total_risks = sum(f['risk_count'] for f in self.fold_risks)
        avg_risk_per_fold = total_risks / len(self.fold_risks)

        # Variance of the positive ROUGE scores across assessments.
        rouge_scores = [f['rouge_score'] for f in self.fold_risks if f['rouge_score'] > 0]
        performance_variance = np.var(rouge_scores) if rouge_scores else 0

        assessment = {
            'total_risks': total_risks,
            'critical_risks': len(self.critical_risks),
            'avg_risks_per_fold': avg_risk_per_fold,
            'performance_variance': performance_variance,
            'risk_level': 'low' if avg_risk_per_fold < 1 else 'medium' if avg_risk_per_fold < 2 else 'high'
        }

        return assessment

    def suggest_improvements(self) -> List[str]:
        """Return improvement suggestions derived from the recorded risks."""
        suggestions = []

        assessment = self.get_overall_risk_assessment()

        if assessment.get('performance_variance', 0) > 0.01:
            suggestions.append("High variance between folds - consider stratified sampling")

        if assessment.get('critical_risks', 0) > 0:
            suggestions.append("Critical risks detected - immediate attention required")

        if assessment.get('risk_level') == 'high':
            suggestions.append("Overall risk level is high - review training configuration")

        # Suggestions keyed on which risk types were seen at least once.
        seen_types = {
            risk['type']
            for fold_risk in self.fold_risks
            for risk in fold_risk['risks']
        }

        if 'severe_overfitting' in seen_types:
            suggestions.append("Overfitting detected in multiple folds - increase regularization")

        if 'memory_pressure' in seen_types:
            suggestions.append("Memory issues detected - optimize batch size or use gradient accumulation")

        return suggestions

# Create the Solar API validator only when the config carries an API key;
# otherwise leave it as None so later steps skip API validation.
if 'solar_api' not in config or 'api_key' not in config['solar_api']:
    solar_validator = None
    logger.write("Solar API key not found - skipping API validation")
else:
    solar_validator = KFoldSolarValidator(config['solar_api']['api_key'])
    logger.write("Solar API validator initialized for K-Fold cross-validation")

# Create the risk manager used during training.
risk_manager = KFoldRiskManager()
logger.write("K-Fold risk management system initialized")

In [6]:
# Report the GPU in use (when CUDA is available) and whether the cache will be
# cleared between folds.
if torch.cuda.is_available():
    gpu_tier = check_gpu_tier()
    for gpu_message in (f"\nGPU: {torch.cuda.get_device_name(0)}", f"GPU Tier: {gpu_tier}"):
        logger.write(gpu_message)

    if config['gpu']['empty_cache_between_folds']:
        logger.write("Will clear GPU cache between folds")


GPU: NVIDIA GeForce RTX 4090
GPU Tier: LOW
Will clear GPU cache between folds


In [None]:
# 데이터셋 클래스 정의
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BartForConditionalGeneration

class DialogueSummaryDataset(Dataset):
    """Dialogue-summarisation dataset backed by a DataFrame.

    Each item is a dict of 1-D tensors: ``input_ids`` and ``attention_mask``
    for the dialogue and, unless ``is_test`` is set, ``labels`` for the
    summary with padding positions replaced by -100.
    """

    def __init__(self, dataframe, tokenizer, max_input_len=512, max_target_len=150, is_test=False):
        # Positional access via iloc below requires a clean 0..n-1 index.
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def _encode(self, text, max_length):
        """Tokenize one text, padded/truncated to exactly ``max_length``."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Tokenize the dialogue. squeeze(0) removes only the batch dimension;
        # a bare squeeze() would also drop the sequence dimension whenever
        # max_length is 1 and break batch collation.
        inputs = self._encode(row['dialogue'], self.max_input_len)
        item = {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0)
        }

        if self.is_test:
            # Test mode: no reference summary available.
            return item

        # Train/validation mode: tokenize the reference summary.
        targets = self._encode(row['summary'], self.max_target_len)
        labels = targets['input_ids'].squeeze(0)

        # Mask padding with -100 so the loss ignores those positions.
        # Skipped when the tokenizer defines no pad token.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is not None:
            labels[labels == pad_token_id] = -100

        item['labels'] = labels
        return item

# ROUGE 스코어 계산 함수
from rouge import Rouge

def compute_rouge_scores(predictions, references):
    """Compute average ROUGE-1/2/L F-scores.

    Args:
        predictions: Generated summaries.
        references: Reference summaries, aligned with ``predictions``.

    Returns:
        Dict with keys ``rouge-1``, ``rouge-2`` and ``rouge-l``. All three
        are 0.0 when either input is empty or the scorer raises.
    """
    zero_scores = {
        'rouge-1': 0.0,
        'rouge-2': 0.0,
        'rouge-l': 0.0
    }

    # Nothing to score: answer directly instead of relying on the scorer
    # raising for empty input.
    if not predictions or not references:
        return zero_scores

    # Replace empty (or whitespace-only) texts with a placeholder so they are
    # never handed to the scorer as empty strings.
    placeholder = "요약 없음"
    predictions = [p if p and p.strip() else placeholder for p in predictions]
    references = [r if r and r.strip() else placeholder for r in references]

    try:
        scores = Rouge().get_scores(predictions, references, avg=True)
    except Exception as e:
        # Best-effort metric: a scoring failure must not abort training.
        logger.write(f"ROUGE calculation error: {e}")
        return zero_scores

    return {
        'rouge-1': scores['rouge-1']['f'],
        'rouge-2': scores['rouge-2']['f'],
        'rouge-l': scores['rouge-l']['f']
    }

logger.write("Dataset class and ROUGE scorer defined")

In [None]:
# K-Fold 학습 함수 (PRD 전략 통합)
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import gc

def train_fold_with_validation(model, train_loader, val_loader, tokenizer, fold_idx, config,
                              solar_validator=None, risk_manager=None):
    """Train and validate one fold, with optional Solar API checks and risk monitoring.

    Args:
        model: Model to train; called with ``input_ids``, ``attention_mask``
            and ``labels`` and expected to expose ``.loss`` and ``generate``.
        train_loader: Loader yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        val_loader: Loader with the same batch layout for validation.
        tokenizer: Tokenizer used to decode predictions, labels and inputs.
        fold_idx: Fold number (used in logs and the checkpoint file name).
        config: Experiment config; reads the ``training``, ``model``,
            ``evaluation``, ``kfold`` and ``paths`` sections.
        solar_validator: Optional validator; when given, a few predictions
            are compared with the Solar API on every second epoch.
        risk_manager: Optional risk manager; when given, each epoch is
            assessed and the learning rate is halved on severe overfitting.

    Returns:
        Tuple ``(best_rouge_l, fold_history)`` where ``fold_history`` holds
        the per-epoch ``train_loss``, ``val_loss``, ``rouge_l`` and ``risks``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # The configured learning rate may arrive as a string; coerce it.
    learning_rate = config['training']['learning_rate']
    if isinstance(learning_rate, str):
        learning_rate = float(learning_rate)

    # Optimizer.
    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=config['training']['weight_decay']
    )

    num_epochs = config['training']['num_epochs']
    num_training_steps = num_epochs * len(train_loader)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * config['training']['warmup_ratio']),
        num_training_steps=num_training_steps
    )

    # Labels carry -100 at padded positions (see DialogueSummaryDataset); that
    # is not a valid token id, so it is swapped back to a real id before decoding.
    decode_pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    best_rouge_l = 0
    best_epoch = 0
    fold_history = {'train_loss': [], 'val_loss': [], 'rouge_l': [], 'risks': []}

    for epoch in range(num_epochs):
        # ---- Training ----
        model.train()
        total_loss = 0

        progress_bar = tqdm(train_loader, desc=f'Fold {fold_idx} Epoch {epoch+1}')
        for step, batch in enumerate(progress_bar, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix({'loss': loss.item()})

            # Memory management: release cached blocks every 50 steps. (The
            # batch dict has no '__index' key, so the previous check on it was
            # always true and cleared the cache on every single step.)
            if step % 50 == 0:
                torch.cuda.empty_cache()

        avg_train_loss = total_loss / len(train_loader)
        fold_history['train_loss'].append(avg_train_loss)

        # ---- Validation ----
        model.eval()
        val_loss = 0
        predictions = []
        references = []
        val_dialogues = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validating'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                # Generate predictions.
                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=config['model']['max_target_length'],
                    num_beams=config['evaluation']['num_beams'],
                    early_stopping=True,
                    no_repeat_ngram_size=config['evaluation'].get('no_repeat_ngram_size', 2),
                    temperature=config['evaluation'].get('temperature', 1.0)
                )

                # Restore a decodable id at the masked (-100) label positions.
                decodable_labels = labels.masked_fill(labels == -100, decode_pad_id)

                preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
                refs = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

                predictions.extend(preds)
                references.extend(refs)

                # Keep the decoded source dialogues for the Solar API comparison.
                dialogues = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
                val_dialogues.extend(dialogues)

        avg_val_loss = val_loss / len(val_loader)
        fold_history['val_loss'].append(avg_val_loss)

        # ROUGE on the first 100 samples only, to keep evaluation fast.
        rouge_scores = compute_rouge_scores(predictions[:100], references[:100])
        fold_history['rouge_l'].append(rouge_scores['rouge-l'])

        logger.write(f"\nFold {fold_idx} Epoch {epoch+1}:")
        logger.write(f"  Train Loss: {avg_train_loss:.4f}")
        logger.write(f"  Val Loss: {avg_val_loss:.4f}")
        logger.write(f"  ROUGE-L: {rouge_scores['rouge-l']:.4f}")

        # Solar API validation on every second epoch.
        if solar_validator and epoch % 2 == 0 and len(predictions) > 0:
            logger.write(f"  🔄 Solar API validation...")
            solar_result = solar_validator.validate_fold_predictions(
                fold_idx=fold_idx,
                model_predictions=predictions[:10],
                test_dialogues=val_dialogues[:10],
                sample_size=3
            )

            if solar_result and 'comparisons' in solar_result:
                logger.write(f"    Model avg length: {solar_result['avg_model_length']:.1f}")
                logger.write(f"    API avg length: {solar_result['avg_api_length']:.1f}")

        # Risk monitoring.
        if risk_manager:
            fold_result = {
                'best_rouge_l': rouge_scores['rouge-l'],
                'train_loss': avg_train_loss,
                'val_loss': avg_val_loss
            }

            risk_assessment = risk_manager.assess_fold_risk(fold_idx, fold_result)
            fold_history['risks'].append(risk_assessment)

            if risk_assessment['risk_count'] > 0:
                logger.write(f"  ⚠️ {risk_assessment['risk_count']} risks detected")

                # Automatic mitigation.
                for risk in risk_assessment['risks']:
                    if risk['severity'] == 'critical':
                        logger.write(f"    Critical: {risk['type']}")

                        # Halve the learning rate on severe overfitting.
                        if risk['type'] == 'severe_overfitting':
                            for param_group in optimizer.param_groups:
                                param_group['lr'] *= 0.5
                            # The scheduler recomputes lr from its base_lrs on
                            # every step, so scale those too; otherwise the
                            # reduction is undone by the next scheduler.step().
                            scheduler.base_lrs = [base_lr * 0.5 for base_lr in scheduler.base_lrs]
                            logger.write(f"    → Learning rate reduced")

        # Track the best model.
        if rouge_scores['rouge-l'] > best_rouge_l:
            best_rouge_l = rouge_scores['rouge-l']
            best_epoch = epoch

            # Save a checkpoint when configured.
            if config['kfold']['save_each_fold']:
                output_dir = get_path(config['paths']['output_dir'])
                output_dir.mkdir(parents=True, exist_ok=True)
                model_path = output_dir / f'fold_{fold_idx}_best_model.pt'

                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'rouge_scores': rouge_scores,
                    'epoch': epoch,
                    'fold_history': fold_history
                }, model_path)

                logger.write(f"  ✓ Saved best model for fold {fold_idx}")

        # Early stopping: stop after `early_stopping_patience` epochs without
        # a new best ROUGE-L.
        if config['training'].get('early_stopping_patience'):
            if epoch - best_epoch >= config['training']['early_stopping_patience']:
                logger.write(f"  Early stopping triggered at epoch {epoch+1}")
                break

    return best_rouge_l, fold_history

logger.write("Enhanced train fold function defined with PRD strategies")

In [None]:
# Start a WandB run unless tracking is disabled in the config.
if config['wandb']['mode'] != 'disabled':
    wandb.init(
        config=config,
        **{option: config['wandb'][option] for option in ('project', 'entity', 'name', 'tags')}
    )
    logger.write("WandB initialized for K-Fold experiment")

# Load the tokenizer once; it is shared by every fold.
logger.write(f"\nInitializing model: {config['model']['name']}")
tokenizer = AutoTokenizer.from_pretrained(config['model']['name'])
logger.write("Tokenizer loaded")

# Containers for the per-fold results and training histories.
all_fold_results, all_fold_histories = [], []

logger.write(f"\n{'='*50}")
logger.write("Starting K-Fold Cross Validation with PRD Strategies")
logger.write('=' * 50)

# Main K-Fold loop: for every fold, build the datasets and loaders, train a
# fresh model, record the results, and release memory before the next fold.
for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(train_df), 1):
    logger.write(f"\n{'='*30}")
    logger.write(f"FOLD {fold_idx}/{config['kfold']['n_splits']}")
    logger.write(f"{'='*30}")

    # Clear the GPU cache before the fold starts (when enabled in the config).
    if config['gpu']['empty_cache_between_folds'] and torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        logger.write("GPU cache cleared")

    # Split the data for this fold.
    fold_train_df = train_df.iloc[train_idx].reset_index(drop=True)
    fold_val_df = train_df.iloc[val_idx].reset_index(drop=True)

    logger.write(f"Fold {fold_idx} data split: Train={len(fold_train_df)}, Val={len(fold_val_df)}")

    # Build the datasets.
    train_dataset = DialogueSummaryDataset(
        fold_train_df, 
        tokenizer,
        max_input_len=config['model']['max_input_length'],
        max_target_len=config['model']['max_target_length']
    )

    val_dataset = DialogueSummaryDataset(
        fold_val_df,
        tokenizer,
        max_input_len=config['model']['max_input_length'],
        max_target_len=config['model']['max_target_length']
    )

    # Build the DataLoaders (only the training loader is shuffled).
    train_loader = DataLoader(
        train_dataset,
        batch_size=config['training']['batch_size'],
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=config['training']['batch_size'],
        shuffle=False,
        num_workers=2,
        pin_memory=True
    )

    # Initialise a fresh model for every fold.
    # NOTE(review): the config echoed earlier in this notebook names
    # upstage/SOLAR-10.7B-Instruct-v1.0 as the model; confirm that checkpoint
    # can actually be loaded as BartForConditionalGeneration.
    model = BartForConditionalGeneration.from_pretrained(config['model']['name'])
    logger.write(f"Model initialized for fold {fold_idx}")

    # Train the fold (enhanced version).
    best_rouge_l, fold_history = train_fold_with_validation(
        model, train_loader, val_loader, tokenizer, fold_idx, config,
        solar_validator=solar_validator,  # Solar API validation
        risk_manager=risk_manager  # risk management
    )

    # Record the fold result; the losses are those of the last epoch run.
    fold_result = {
        'fold': fold_idx,
        'best_rouge_l': best_rouge_l,
        'train_size': len(fold_train_df),
        'val_size': len(fold_val_df),
        'final_train_loss': fold_history['train_loss'][-1] if fold_history['train_loss'] else 0,
        'final_val_loss': fold_history['val_loss'][-1] if fold_history['val_loss'] else 0,
        'total_risks': sum([r['risk_count'] for r in fold_history.get('risks', [])])
    }
    all_fold_results.append(fold_result)
    all_fold_histories.append(fold_history)

    logger.write(f"\nFold {fold_idx} completed:")
    logger.write(f"  Best ROUGE-L: {best_rouge_l:.4f}")
    logger.write(f"  Total risks encountered: {fold_result['total_risks']}")

    # Log the fold metrics to WandB (unless disabled).
    if config['wandb']['mode'] != 'disabled':
        wandb.log({
            f'fold_{fold_idx}_rouge_l': best_rouge_l,
            f'fold_{fold_idx}_train_loss': fold_result['final_train_loss'],
            f'fold_{fold_idx}_val_loss': fold_result['final_val_loss'],
            f'fold_{fold_idx}_risks': fold_result['total_risks']
        })

    # Release the fold's objects and cached GPU memory.
    del model, train_dataset, val_dataset, train_loader, val_loader
    torch.cuda.empty_cache()
    gc.collect()

# Solar API consistency analysis across all recorded validations.
if solar_validator:
    consensus = solar_validator.get_fold_consensus()
    if consensus:
        logger.write("\n=== Solar API Consensus Analysis ===")
        for label, key, fallback in (
            ("Model length consistency (std)", 'model_length_consistency', 0),
            ("API length consistency (std)", 'api_length_consistency', 0),
            ("Model vs API ratio", 'avg_model_vs_api_ratio', 1),
        ):
            logger.write(f"{label}: {consensus.get(key, fallback):.2f}")

# Final risk assessment and improvement suggestions.
if risk_manager:
    overall_risk = risk_manager.get_overall_risk_assessment()
    logger.write("\n=== Overall Risk Assessment ===")
    for label, key, fallback in (
        ("Total risks", 'total_risks', 0),
        ("Critical risks", 'critical_risks', 0),
        ("Risk level", 'risk_level', 'unknown'),
    ):
        logger.write(f"{label}: {overall_risk.get(key, fallback)}")

    improvements = risk_manager.suggest_improvements()
    if len(improvements) > 0:
        logger.write("\n📋 Improvement Suggestions:")
        for suggestion in improvements:
            logger.write(f"  • {suggestion}")

logger.write(f"\n{'='*50}")
logger.write("K-Fold Cross Validation with PRD Strategies Completed")
logger.write('=' * 50)

In [None]:
# K-Fold 학습 함수
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import gc

def train_fold(model, train_loader, val_loader, tokenizer, fold_idx, config):
    """Train and validate a single cross-validation fold.

    Runs ``config['training']['num_epochs']`` epochs of AdamW training with a
    linear warmup/decay schedule. After every epoch the validation loss is
    computed, summaries are generated with beam search, and ROUGE is scored
    on the first 100 validation examples through ``compute_rouge_scores``.
    Whenever ROUGE-L improves and ``config['kfold']['save_each_fold']`` is
    truthy, the model's ``state_dict`` is written to
    ``<output_dir>/fold_<fold_idx>_best_model.pt``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and the model must
            provide ``.generate``.
        train_loader: Iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: Same batch layout as ``train_loader``, used for validation.
        tokenizer: Tokenizer used to decode generated ids and label ids.
        fold_idx: Fold number; used in progress bars, log lines and the
            checkpoint file name.
        config: Nested configuration dict. Reads ``training`` (learning_rate,
            weight_decay, num_epochs, warmup_ratio, optional max_grad_norm),
            ``model.max_target_length``, ``evaluation.num_beams``,
            ``kfold.save_each_fold`` and ``paths.output_dir``.

    Returns:
        Tuple ``(best_rouge_l, fold_history)`` where ``fold_history`` maps
        ``'train_loss'``, ``'val_loss'`` and ``'rouge_l'`` to per-epoch lists.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    train_cfg = config['training']

    # The configured learning rate may arrive as a string; coerce it to float.
    learning_rate = train_cfg['learning_rate']
    if isinstance(learning_rate, str):
        learning_rate = float(learning_rate)

    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=train_cfg['weight_decay']
    )

    num_epochs = train_cfg['num_epochs']
    num_training_steps = num_epochs * len(train_loader)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * train_cfg['warmup_ratio']),
        num_training_steps=num_training_steps
    )

    # Clipping threshold is configurable; 1.0 keeps the previous behaviour.
    max_grad_norm = train_cfg.get('max_grad_norm', 1.0)
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)

    best_rouge_l = 0
    fold_history = {'train_loss': [], 'val_loss': [], 'rouge_l': []}

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0

        progress_bar = tqdm(train_loader, desc=f'Fold {fold_idx} Epoch {epoch+1}')
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            # Read the scalar once: every .item() call forces a device sync.
            loss_value = loss.item()
            total_loss += loss_value

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix({'loss': loss_value})

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_train_loss = total_loss / max(len(train_loader), 1)
        fold_history['train_loss'].append(avg_train_loss)

        # ---- validation pass ----
        model.eval()
        val_loss = 0
        predictions = []
        references = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validating'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                # Generate summaries for ROUGE scoring
                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=config['model']['max_target_length'],
                    num_beams=config['evaluation']['num_beams'],
                    early_stopping=True
                )

                # Labels may carry -100 (the loss ignore index), which is not
                # a valid token id and cannot be decoded. Swap it for the pad
                # id before decoding; this is a no-op when no -100 is present.
                # NOTE(review): whether the dataset masks padding with -100 is
                # not visible here - confirm against DialogueSummaryDataset.
                decodable_labels = labels
                if pad_token_id is not None:
                    decodable_labels = labels.masked_fill(labels == -100, pad_token_id)

                preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
                refs = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

                predictions.extend(preds)
                references.extend(refs)

        avg_val_loss = val_loss / max(len(val_loader), 1)
        fold_history['val_loss'].append(avg_val_loss)

        # ROUGE is scored on the first 100 validation examples only
        rouge_scores = compute_rouge_scores(predictions[:100], references[:100])
        fold_history['rouge_l'].append(rouge_scores['rouge-l'])

        logger.write(f"Fold {fold_idx} Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}, ROUGE-L={rouge_scores['rouge-l']:.4f}")

        # Track the best epoch by ROUGE-L and checkpoint it if requested
        if rouge_scores['rouge-l'] > best_rouge_l:
            best_rouge_l = rouge_scores['rouge-l']

            if config['kfold']['save_each_fold']:
                output_dir = get_path(config['paths']['output_dir'])
                output_dir.mkdir(parents=True, exist_ok=True)
                model_path = output_dir / f'fold_{fold_idx}_best_model.pt'
                torch.save(model.state_dict(), model_path)
                logger.write(f"  Saved best model for fold {fold_idx}")

    return best_rouge_l, fold_history

logger.write("Train fold function defined")

In [ ]:
# Start the WandB run unless tracking is disabled in the config
if config['wandb']['mode'] != 'disabled':
    wandb_cfg = config['wandb']
    wandb.init(
        project=wandb_cfg['project'],
        entity=wandb_cfg['entity'],
        name=wandb_cfg['name'],
        tags=wandb_cfg['tags'],
        config=config
    )
    logger.write("WandB initialized for K-Fold experiment")

# The tokenizer is loaded once and shared by every fold
logger.write(f"\nInitializing model: {config['model']['name']}")
tokenizer = AutoTokenizer.from_pretrained(config['model']['name'])
logger.write("Tokenizer loaded")

# Per-fold summaries and per-epoch histories collected by the loop below
all_fold_results = []
all_fold_histories = []

section_rule = "=" * 50
logger.write("\n" + section_rule)
logger.write("Starting K-Fold Cross Validation")
logger.write(section_rule)

for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(train_df), start=1):
    logger.write(f"\n{'='*30}")
    logger.write(f"FOLD {fold_idx}/{config['kfold']['n_splits']}")
    logger.write(f"{'='*30}")

    # Optionally release cached GPU memory before starting the fold
    if config['gpu']['empty_cache_between_folds'] and torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        logger.write("GPU cache cleared")

    # Slice this fold's train/validation rows out of the full training frame
    fold_train_df = train_df.iloc[train_idx].reset_index(drop=True)
    fold_val_df = train_df.iloc[val_idx].reset_index(drop=True)

    logger.write(f"Fold {fold_idx} data split: Train={len(fold_train_df)}, Val={len(fold_val_df)}")

    # Both datasets share the same length limits
    dataset_kwargs = dict(
        max_input_len=config['model']['max_input_length'],
        max_target_len=config['model']['max_target_length']
    )
    train_dataset = DialogueSummaryDataset(fold_train_df, tokenizer, **dataset_kwargs)
    val_dataset = DialogueSummaryDataset(fold_val_df, tokenizer, **dataset_kwargs)

    # Both loaders share everything except shuffling
    loader_kwargs = dict(
        batch_size=config['training']['batch_size'],
        num_workers=2,
        pin_memory=True
    )
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

    # A fresh model is loaded for every fold so folds do not share weights
    model = BartForConditionalGeneration.from_pretrained(config['model']['name'])
    logger.write(f"Model initialized for fold {fold_idx}")

    # Train and validate this fold
    best_rouge_l, fold_history = train_fold(
        model, train_loader, val_loader, tokenizer, fold_idx, config
    )

    # Record the fold summary and its per-epoch history
    fold_result = dict(
        fold=fold_idx,
        best_rouge_l=best_rouge_l,
        train_size=len(fold_train_df),
        val_size=len(fold_val_df)
    )
    all_fold_results.append(fold_result)
    all_fold_histories.append(fold_history)

    logger.write(f"\nFold {fold_idx} completed - Best ROUGE-L: {best_rouge_l:.4f}")

    # Report the best ROUGE-L and the last-epoch losses to WandB
    if config['wandb']['mode'] != 'disabled':
        fold_metrics = {
            f'fold_{fold_idx}_rouge_l': best_rouge_l,
            f'fold_{fold_idx}_train_loss': fold_history['train_loss'][-1],
            f'fold_{fold_idx}_val_loss': fold_history['val_loss'][-1]
        }
        wandb.log(fold_metrics)

    # Drop per-fold objects and free memory before the next fold
    del model, train_dataset, val_dataset, train_loader, val_loader
    torch.cuda.empty_cache()
    gc.collect()

logger.write("\n" + section_rule)
logger.write("K-Fold Cross Validation Completed")
logger.write(section_rule)

## K-Fold 결과 분석 및 앙상블

In [None]:
# K-Fold results analysis: tabulate the per-fold results and log summary
# statistics of the best ROUGE-L across folds.
logger.write("\n" + "="*50)
logger.write("K-FOLD RESULTS ANALYSIS")
logger.write("="*50)

# One row per fold
results_df = pd.DataFrame(all_fold_results)
logger.write("\nFold Results:")
logger.write(results_df.to_string())

# Summary statistics (pandas .std() is the sample standard deviation)
mean_rouge = results_df['best_rouge_l'].mean()
std_rouge = results_df['best_rouge_l'].std()
min_rouge = results_df['best_rouge_l'].min()
max_rouge = results_df['best_rouge_l'].max()

# A confidence interval for the MEAN uses the standard error (std / sqrt(n)),
# not the raw standard deviation; mean +/- 1.96*std describes the spread of
# individual folds instead. 1.96 is the normal-approximation critical value;
# with only a handful of folds a Student-t value would give a wider interval.
n_folds = len(results_df)
sem_rouge = std_rouge / (max(n_folds, 1) ** 0.5)
ci_half_width = 1.96 * sem_rouge

logger.write(f"\n{'='*30}")
logger.write("ROUGE-L Statistics:")
logger.write(f"  Mean: {mean_rouge:.4f}")
logger.write(f"  Std:  {std_rouge:.4f}")
logger.write(f"  Min:  {min_rouge:.4f}")
logger.write(f"  Max:  {max_rouge:.4f}")
logger.write(f"  95% CI: [{mean_rouge - ci_half_width:.4f}, {mean_rouge + ci_half_width:.4f}]")
logger.write(f"{'='*30}")

# Mirror the aggregate statistics to WandB unless tracking is disabled
if config['wandb']['mode'] != 'disabled':
    wandb.log({
        'kfold_mean_rouge_l': mean_rouge,
        'kfold_std_rouge_l': std_rouge,
        'kfold_min_rouge_l': min_rouge,
        'kfold_max_rouge_l': max_rouge
    })

In [None]:
# Visualization and persistence of the K-Fold results
import json

from src.utils.visualizations.training_viz import TrainingVisualizer

viz = TrainingVisualizer()

# Resolve the plot directory (defaults to 'visualizations') and create it
paths_cfg = config.get('paths', {})
viz_dir = get_path(paths_cfg.get('visualization_dir', 'visualizations'))
viz_dir.mkdir(parents=True, exist_ok=True)

# Plot the per-fold ROUGE-L scores when at least one fold finished
if all_fold_results:
    # The plotting helper is given one {'rouge-l': score} dict per fold
    fold_rouge_scores = [
        {'rouge-l': fold_entry['best_rouge_l']} for fold_entry in all_fold_results
    ]
    plot_path = viz_dir / f'kfold_{config["kfold"]["n_splits"]}fold_results.png'

    viz.plot_kfold_results(fold_rouge_scores, save_path=plot_path)

    logger.write(f"Visualization saved to {viz_dir}")

# Bundle the config, per-fold results and aggregate statistics into one JSON file
rouge_statistics = {
    'mean_rouge_l': float(mean_rouge),
    'std_rouge_l': float(std_rouge),
    'min_rouge_l': float(min_rouge),
    'max_rouge_l': float(max_rouge)
}
results_summary = {
    'config': config,
    'fold_results': all_fold_results,
    'statistics': rouge_statistics,
    'timestamp': timestamp
}

results_path = log_dir / f'kfold_results_{timestamp}.json'
with results_path.open('w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII config text readable in the file
    json.dump(results_summary, f, indent=4, ensure_ascii=False)

logger.write(f"\nResults saved to {results_path}")

# Generate ensemble predictions (only when enabled in the config).
# Each saved fold checkpoint produces one summary per test row; the final
# summary for a row is the string most folds agree on (the first fold's
# output wins ties).
if config['kfold'].get('generate_ensemble_predictions', False):
    from collections import Counter

    logger.write("\n" + "="*50)
    logger.write("GENERATING ENSEMBLE PREDICTIONS")
    logger.write("="*50)

    # Load the test data
    test_path = get_data_path(config['paths']['test_file'])
    test_df = pd.read_csv(test_path)
    logger.write(f"Test data loaded: {len(test_df)} samples")

    # Test dataset / loader (no shuffling so predictions stay row-aligned)
    test_dataset = DialogueSummaryDataset(
        test_df,
        tokenizer,
        max_input_len=config['model']['max_input_length'],
        max_target_len=config['model']['max_target_length'],
        is_test=True
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=config['inference']['batch_size'],
        shuffle=False,
        num_workers=2,
        pin_memory=True
    )

    all_fold_predictions = []

    output_dir = get_path(config['paths']['output_dir'])
    # Resolve the device once; it is also used as map_location below
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    inference_cfg = config['inference']

    for fold_idx in range(1, config['kfold']['n_splits'] + 1):
        model_path = output_dir / f'fold_{fold_idx}_best_model.pt'

        if not model_path.exists():
            # Make the skip visible instead of silently shrinking the ensemble
            logger.write(f"\nSkipping fold {fold_idx}: no checkpoint at {model_path}")
            continue

        logger.write(f"\nLoading fold {fold_idx} model...")

        # map_location lets a checkpoint saved on GPU load on a CPU-only host
        model = BartForConditionalGeneration.from_pretrained(config['model']['name'])
        model.load_state_dict(torch.load(model_path, map_location=device))
        model = model.to(device)
        model.eval()

        fold_predictions = []

        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f'Fold {fold_idx} predictions'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=inference_cfg['max_length'],
                    num_beams=inference_cfg['num_beams'],
                    early_stopping=inference_cfg['early_stopping'],
                    no_repeat_ngram_size=inference_cfg['no_repeat_ngram_size']
                )

                preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
                fold_predictions.extend(preds)

        all_fold_predictions.append(fold_predictions)
        logger.write(f"  Generated {len(fold_predictions)} predictions")

        # Free the fold model before loading the next one
        del model
        torch.cuda.empty_cache()
        gc.collect()

    # Ensemble by simple voting over the folds' output strings
    if all_fold_predictions:
        logger.write(f"\nEnsembling {len(all_fold_predictions)} fold predictions...")

        # zip(*...) walks the folds in lockstep: one tuple of candidates per row
        ensemble_predictions = [
            Counter(fold_preds).most_common(1)[0][0]
            for fold_preds in zip(*all_fold_predictions)
        ]

        # Build the submission file
        submission_df = pd.DataFrame({
            'fname': test_df['fname'],
            'summary': ensemble_predictions
        })

        submission_dir = get_path(config['paths']['submission_dir'])
        submission_dir.mkdir(parents=True, exist_ok=True)

        submission_path = submission_dir / f'kfold_ensemble_submission_{timestamp}.csv'
        # The row index is written on purpose (index=True)
        submission_df.to_csv(submission_path, index=True, encoding='utf-8')

        logger.write(f"\nEnsemble submission saved to {submission_path}")
        logger.write(f"Shape: {submission_df.shape}")
    else:
        logger.write("\nNo fold models found for ensemble prediction")

In [None]:
# Generate ensemble predictions (only when enabled in the config).
# NOTE(review): this cell repeats the ensemble step that already runs at the
# end of the preceding cell; running both repeats inference for every fold.
# Its submission schema is aligned with that cell ('fname' column, index
# written) - confirm against the competition's sample submission.
if config['kfold'].get('generate_ensemble_predictions', False):
    from collections import Counter

    logger.write("\n" + "="*50)
    logger.write("GENERATING ENSEMBLE PREDICTIONS")
    logger.write("="*50)

    # Load the test data
    test_path = get_data_path(config['paths']['test_file'])
    test_df = pd.read_csv(test_path)
    logger.write(f"Test data loaded: {len(test_df)} samples")

    # Test dataset / loader (no shuffling so predictions stay row-aligned)
    test_dataset = DialogueSummaryDataset(
        test_df,
        tokenizer,
        max_input_len=config['model']['max_input_length'],
        max_target_len=config['model']['max_target_length'],
        is_test=True
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=config['inference']['batch_size'],
        shuffle=False,
        num_workers=2,
        pin_memory=True
    )

    all_fold_predictions = []

    output_dir = get_path(config['paths']['output_dir'])
    # Resolve the device once; it is also used as map_location below
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    inference_cfg = config['inference']

    for fold_idx in range(1, config['kfold']['n_splits'] + 1):
        model_path = output_dir / f'fold_{fold_idx}_best_model.pt'

        if not model_path.exists():
            # Make the skip visible instead of silently shrinking the ensemble
            logger.write(f"\nSkipping fold {fold_idx}: no checkpoint at {model_path}")
            continue

        logger.write(f"\nLoading fold {fold_idx} model...")

        # map_location lets a checkpoint saved on GPU load on a CPU-only host
        model = BartForConditionalGeneration.from_pretrained(config['model']['name'])
        model.load_state_dict(torch.load(model_path, map_location=device))
        model = model.to(device)
        model.eval()

        fold_predictions = []

        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f'Fold {fold_idx} predictions'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=inference_cfg['max_length'],
                    num_beams=inference_cfg['num_beams'],
                    early_stopping=inference_cfg['early_stopping'],
                    no_repeat_ngram_size=inference_cfg['no_repeat_ngram_size']
                )

                preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
                fold_predictions.extend(preds)

        all_fold_predictions.append(fold_predictions)
        logger.write(f"  Generated {len(fold_predictions)} predictions")

        # Free the fold model before loading the next one
        del model
        torch.cuda.empty_cache()
        gc.collect()

    # Ensemble by simple voting over the folds' output strings
    # (the first fold's output wins ties)
    if all_fold_predictions:
        logger.write(f"\nEnsembling {len(all_fold_predictions)} fold predictions...")

        # zip(*...) walks the folds in lockstep: one tuple of candidates per row
        ensemble_predictions = [
            Counter(fold_preds).most_common(1)[0][0]
            for fold_preds in zip(*all_fold_predictions)
        ]

        # Build the submission file with the same schema as the preceding
        # ensemble cell: 'fname' identifier column plus the row index.
        submission_df = pd.DataFrame({
            'fname': test_df['fname'],
            'summary': ensemble_predictions
        })

        submission_dir = get_path(config['paths']['submission_dir'])
        submission_dir.mkdir(parents=True, exist_ok=True)

        submission_path = submission_dir / f'kfold_ensemble_submission_{timestamp}.csv'
        submission_df.to_csv(submission_path, index=True, encoding='utf-8')

        logger.write(f"\nEnsemble submission saved to {submission_path}")
        logger.write(f"Shape: {submission_df.shape}")
    else:
        logger.write("\nNo fold models found for ensemble prediction")

In [None]:
# Final summary: log the headline K-Fold numbers, point at the saved
# artifacts, and close the WandB run.
logger.write("\n" + "="*50)
logger.write("K-FOLD CROSS VALIDATION SUMMARY")
logger.write("="*50)
logger.write(f"Model: {config['model']['name']}")
logger.write(f"Folds: {config['kfold']['n_splits']}")
logger.write(f"Mean ROUGE-L: {mean_rouge:.4f} (±{std_rouge:.4f})")
logger.write(f"Best Fold: {results_df.loc[results_df['best_rouge_l'].idxmax(), 'fold']}")
logger.write(f"Best Score: {max_rouge:.4f}")
logger.write(f"Worst Fold: {results_df.loc[results_df['best_rouge_l'].idxmin(), 'fold']}")
logger.write(f"Worst Score: {min_rouge:.4f}")

if config['kfold']['save_each_fold']:
    # Resolve the directory from the config here. In the visible cells the
    # global `output_dir` is only assigned inside the optional ensemble
    # branch, so relying on it raises NameError when that branch is skipped.
    output_dir = get_path(config['paths']['output_dir'])
    logger.write(f"\nFold models saved to: {output_dir}")

logger.write(f"\nLog file: {log_file}")
logger.write("="*50)

# Record the final aggregate metrics and finish the WandB run
if config['wandb']['mode'] != 'disabled':
    wandb.summary['kfold_final_mean_rouge'] = mean_rouge
    wandb.summary['kfold_final_std_rouge'] = std_rouge
    wandb.finish()

logger.write("\n✅ K-Fold Cross Validation completed successfully!")