# 🎭 다중 모델 앙상블 - 5개 모델 통합
> PRD 계획에 따른 5개 모델 앙상블 + TTA 전략

**목표 성능**: ROUGE-F1 75-80

In [1]:
# 환경 설정
import sys
import os
from pathlib import Path

# 프로젝트 루트 경로 추가
notebook_dir = Path.cwd()
project_root = notebook_dir.parent.parent.parent  # 3번만 parent 사용!

# 다른 프로젝트 경로 제거하고 현재 프로젝트 경로만 추가
sys.path = [p for p in sys.path if 'computer-vision-competition' not in p]
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project Root: {project_root}")
print(f"Current Dir: {notebook_dir}")

# 필요한 라이브러리 임포트
import yaml
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from typing import List, Dict
import matplotlib.pyplot as plt
import wandb

# 커스텀 모듈 임포트
from src.logging.notebook_logger import NotebookLogger
from src.utils.gpu_optimization.team_gpu_check import check_gpu_tier
from src.utils.visualizations.training_viz import TrainingVisualizer

print("Libraries imported successfully!")

Project Root: /home/ieyeppo/AI_Lab/natural-language-processing-competition
Current Dir: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH
✅ 나눔고딕 폰트 로드 성공
Libraries imported successfully!


In [2]:
# 설정 파일 로드
config_path = notebook_dir / 'configs' / 'config_multi_model.yaml'

with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# 활성화된 모델 확인
enabled_models = [name for name, cfg in config['ensemble_models'].items() if cfg['enabled']]
print(f"Enabled Models ({len(enabled_models)}):")
for model_name in enabled_models:
    weight = config['ensemble_models'][model_name]['weight']
    print(f"  - {model_name}: weight={weight:.2f}")

Enabled Models (5):
  - solar: weight=0.30
  - polyglot: weight=0.25
  - kullm: weight=0.20
  - kobart: weight=0.15
  - koalpaca: weight=0.10


In [3]:
# 로그 디렉토리 생성
# config의 로그 경로 사용
def get_path(path_str):
    """config의 상대 경로를 절대 경로로 변환"""
    path = Path(path_str)
    if not path.is_absolute():
        path = notebook_dir / path
    return path

# config에 log_dir이 정의되어 있으면 사용, 없으면 기본값
if 'log_dir' in config['paths']:
    log_dir = get_path(config['paths']['log_dir'])
else:
    # 기본값: notebook_dir/logs/multi_model
    log_dir = notebook_dir / 'logs' / 'multi_model'

log_dir.mkdir(parents=True, exist_ok=True)

# 타임스탬프 생성
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# 로거 초기화
log_file = log_dir / f'ensemble_{len(enabled_models)}models_{timestamp}.log'
logger = NotebookLogger(
    log_path=str(log_file),
    print_also=True
)

logger.write('='*50)
logger.write('Multi-Model Ensemble Experiment')
logger.write(f'Timestamp: {timestamp}')
logger.write(f'Models: {enabled_models}')
logger.write(f'Ensemble Method: {config["ensemble_strategy"]["method"]}')
logger.write('='*50)

Multi-Model Ensemble Experiment
Timestamp: 20251010_090607
Models: ['solar', 'polyglot', 'kullm', 'kobart', 'koalpaca']
Ensemble Method: weighted_average


In [4]:
# TTA 설정 확인
if config['tta']['enabled']:
    logger.write("\nTTA Configuration:")
    logger.write(f"  - Augmentations: {config['tta']['num_augmentations']}")
    logger.write(f"  - Aggregation: {config['tta']['aggregation']}")
    
    for technique, settings in config['tta']['techniques'].items():
        if settings['enabled']:
            logger.write(f"  - {technique}: enabled")
            print(f"TTA: {technique} enabled")


TTA Configuration:
  - Augmentations: 3
  - Aggregation: mean
  - paraphrase: enabled
TTA: paraphrase enabled
  - reorder: enabled
TTA: reorder enabled


In [5]:
# GPU 체크 및 멀티 GPU 설정
if torch.cuda.is_available():
    gpu_tier = check_gpu_tier()
    logger.write(f"\nGPU: {torch.cuda.get_device_name(0)}")
    logger.write(f"GPU Tier: {gpu_tier}")
    
    # 멀티 GPU 체크
    if config['gpu']['multi_gpu']['enabled'] and torch.cuda.device_count() > 1:
        logger.write(f"Multi-GPU available: {torch.cuda.device_count()} GPUs")
    
    # 메모리 관리 설정
    if config['gpu']['empty_cache_between_models']:
        logger.write("Will clear GPU cache between models")


GPU: NVIDIA GeForce RTX 4090
GPU Tier: LOW
Will clear GPU cache between models


In [6]:
# 앙상블 가중치 시각화
weights = [config['ensemble_models'][name]['weight'] for name in enabled_models]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

plt.figure(figsize=(12, 6))
bars = plt.bar(enabled_models, weights, color=colors[:len(enabled_models)])
plt.title('Ensemble Model Weights Distribution', fontsize=14, fontweight='bold')
plt.ylabel('Weight', fontsize=12)
plt.xlabel('Model', fontsize=12)

# 가중치 값 표시
for bar, weight in zip(bars, weights):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{weight:.2f}', ha='center', va='bottom', fontweight='bold')

plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()

# 시각화 저장 - config 경로 사용
if 'visualization_dir' in config['paths']:
    viz_dir = get_path(config['paths']['visualization_dir'])
else:
    # 기본값
    viz_dir = log_dir / 'visualizations'

viz_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(viz_dir / f'ensemble_weights_{timestamp}.png', dpi=100, bbox_inches='tight')
plt.show()

logger.write(f"Weight visualization saved to {viz_dir}")

Weight visualization saved to /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH/logs/multi_model/visualizations


  plt.show()


In [7]:
# Solar API 비교 설정
if config['solar_api_comparison']['enabled']:
    logger.write("\nSolar API Comparison Settings:")
    logger.write(f"  - API Key: {config['solar_api_comparison']['api_key'][:10]}...")
    logger.write(f"  - Use as baseline: {config['solar_api_comparison']['use_as_baseline']}")
    logger.write(f"  - Include in ensemble: {config['solar_api_comparison']['include_in_ensemble']}")
    
    print("\nSolar API configured for comparison")


Solar API Comparison Settings:
  - API Key: up_rMJWNzz...
  - Use as baseline: True
  - Include in ensemble: False

Solar API configured for comparison


In [8]:
# Optuna 최적화 설정 (앙상블 가중치)
if config['optuna']['enabled']:
    logger.write("\nOptuna Weight Optimization:")
    logger.write(f"  - Trials: {config['optuna']['n_trials']}")
    logger.write(f"  - Study: {config['optuna']['study_name']}")
    logger.write(f"  - Metric: {config['optuna']['metric']}")
    
    import optuna
    print("Optuna configured for ensemble weight optimization")


Optuna Weight Optimization:
  - Trials: 50
  - Study: ensemble_weight_optimization
  - Metric: rouge_l
Optuna configured for ensemble weight optimization


## 모델별 학습 및 평가

각 모델을 개별적으로 학습하고 평가합니다.

In [9]:
# 데이터 로드
# config 파일의 경로 사용
def get_data_path(path_str):
    """config의 상대 경로를 절대 경로로 변환"""
    path = Path(path_str)
    if not path.is_absolute():
        path = notebook_dir / path
    return path

# config에서 데이터 경로 가져오기
train_path = get_data_path(config['paths']['train_file'])
dev_path = get_data_path(config['paths']['dev_file'])
test_path = get_data_path(config['paths']['test_file'])

logger.write("\n=== Data Loading ===")
logger.write(f"Loading data from config paths:")
logger.write(f"  - Train: {train_path}")
logger.write(f"  - Dev: {dev_path}")
logger.write(f"  - Test: {test_path}")

# 데이터 로드
train_df = pd.read_csv(train_path)
dev_df = pd.read_csv(dev_path)
test_df = pd.read_csv(test_path)

logger.write(f"\nData loaded successfully!")
logger.write(f"  - Train samples: {len(train_df)}")
logger.write(f"  - Dev samples: {len(dev_df)}")
logger.write(f"  - Test samples: {len(test_df)}")

# 데이터 샘플 출력
print("\nTrain data topics:")
print(train_df['topic'].value_counts().head(10))
print(f"\nFirst dialogue (200 chars):")
print(train_df.iloc[0]['dialogue'][:200] + "...")


=== Data Loading ===
Loading data from config paths:
  - Train: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH/../../../data/raw/train.csv
  - Dev: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH/../../../data/raw/dev.csv
  - Test: /home/ieyeppo/AI_Lab/natural-language-processing-competition/notebooks/team/CHH/../../../data/raw/test.csv

Data loaded successfully!
  - Train samples: 12457
  - Dev samples: 499
  - Test samples: 499

Train data topics:
topic
음식 주문     130
취업 면접     109
길 안내       66
호텔 체크인     40
아파트 임대     30
일상 대화      29
쇼핑         27
주말 계획      26
면접         25
호텔 예약      25
Name: count, dtype: int64

First dialogue (200 chars):
#Person1#: 안녕하세요, Mr. Smith. 저는 Dr. Hawkins입니다. 오늘 무슨 일로 오셨어요? 
#Person2#: 건강검진을 받으려고 왔어요. 
#Person1#: 네, 5년 동안 검진을 안 받으셨네요. 매년 한 번씩 받으셔야 해요. 
#Person2#: 알죠. 특별히 아픈 데가 없으면 굳이 갈 필요가 없다고 생각했어요. 
#Person...


In [10]:
# 모델별 결과 저장
model_results = {}

for model_name in enabled_models:
    logger.write(f"\n=== {model_name} Model ===")
    model_config = config['ensemble_models'][model_name]
    logger.write(f"  - Model: {model_config['name']}")
    logger.write(f"  - Weight: {model_config['weight']}")
    
    if model_config.get('use_lora', False):
        logger.write(f"  - LoRA: r={model_config['lora_config']['r']}, alpha={model_config['lora_config']['alpha']}")
    
    # 실제 학습 코드는 여기에 구현
    print(f"Would train {model_name} here...")


=== solar Model ===
  - Model: upstage/SOLAR-10.7B-Instruct-v1.0
  - Weight: 0.3
  - LoRA: r=16, alpha=32
Would train solar here...

=== polyglot Model ===
  - Model: EleutherAI/polyglot-ko-12.8b
  - Weight: 0.25
  - LoRA: r=8, alpha=16
Would train polyglot here...

=== kullm Model ===
  - Model: nlpai-lab/kullm-v2
  - Weight: 0.2
  - LoRA: r=8, alpha=16
Would train kullm here...

=== kobart Model ===
  - Model: digit82/kobart-summarization
  - Weight: 0.15
Would train kobart here...

=== koalpaca Model ===
  - Model: beomi/KoAlpaca-Polyglot-12.8B
  - Weight: 0.1
  - LoRA: r=8, alpha=16
Would train koalpaca here...


## TTA (Text Test Augmentation)

텍스트 증강을 통한 성능 향상

In [11]:
if config['tta']['enabled']:
    logger.write("\n=== TTA Implementation ===")
    
    # Paraphrase
    if config['tta']['techniques']['paraphrase']['enabled']:
        logger.write("Paraphrase augmentation enabled")
        logger.write(f"  - Model: {config['tta']['techniques']['paraphrase']['model']}")
        logger.write(f"  - Variants: {config['tta']['techniques']['paraphrase']['num_variants']}")
    
    # 실제 TTA 구현은 여기에
    print("TTA would be applied here...")


=== TTA Implementation ===
Paraphrase augmentation enabled
  - Model: lcw99/t5-base-korean-paraphrase
  - Variants: 2
TTA would be applied here...


## 앙상블 및 최종 예측

모든 모델의 예측을 앙상블하여 최종 결과를 생성합니다.

In [None]:
# Solar API 클라이언트 설정 (PRD 09_Solar_API_최적화.md 참고)
import hashlib
import time
import re
from openai import OpenAI

class SolarAPIOptimizer:
    """Solar API 최적화 클래스"""
    def __init__(self, api_key):
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.upstage.ai/v1/solar"
        )
        self.token_limit = 512  # 토큰 제한
        self.cache = {}  # 결과 캐싱
    
    def preprocess_dialogue(self, dialogue):
        """대화문 전처리로 토큰 절약"""
        # 불필요한 공백 제거
        dialogue = ' '.join(dialogue.split())
        
        # Person 태그 간소화
        dialogue = dialogue.replace('#Person1#:', 'A:')
        dialogue = dialogue.replace('#Person2#:', 'B:')
        dialogue = dialogue.replace('#Person3#:', 'C:')
        
        # 대화 길이 제한
        return self.smart_truncate(dialogue, max_tokens=self.token_limit)
    
    def smart_truncate(self, text, max_tokens=512):
        """스마트 절단: 문장 단위로 자르기"""
        # 토큰 수 추정 (한글 평균 2.5자 = 1토큰)
        estimated_tokens = len(text) / 2.5
        
        if estimated_tokens <= max_tokens:
            return text
        
        sentences = text.split('.')
        truncated = []
        current_length = 0
        
        for sentence in sentences:
            sentence_tokens = len(sentence) / 2.5
            if current_length + sentence_tokens > max_tokens:
                break
            truncated.append(sentence)
            current_length += sentence_tokens
        
        return '.'.join(truncated) + '.'
    
    def generate_summary(self, dialogue):
        """Solar API로 요약 생성"""
        # 캐시 확인
        dialogue_hash = hashlib.md5(dialogue.encode()).hexdigest()
        if dialogue_hash in self.cache:
            return self.cache[dialogue_hash]
        
        # 전처리
        processed_dialogue = self.preprocess_dialogue(dialogue)
        
        # API 호출
        try:
            response = self.client.chat.completions.create(
                model="solar-1-mini-chat",
                messages=[
                    {"role": "system", "content": "다음 대화를 3-5문장으로 요약해주세요."},
                    {"role": "user", "content": processed_dialogue}
                ],
                max_tokens=150,
                temperature=0.3
            )
            
            summary = response.choices[0].message.content
            
            # 캐싱
            self.cache[dialogue_hash] = summary
            
            return summary
        
        except Exception as e:
            logger.write(f"Solar API error: {e}")
            return None

# Solar API 초기화
if config['solar_api_comparison']['enabled']:
    solar_api = SolarAPIOptimizer(config['solar_api_comparison']['api_key'])
    logger.write("Solar API client initialized")
else:
    solar_api = None

In [None]:
# 듀얼 요약 시스템 (PRD 10_교차_검증_시스템.md 참고)
class QualityEvaluator:
    """요약 품질 평가기"""
    def __init__(self):
        self.criteria = {
            'length_ratio': 0.2,      # 요약 길이 적절성
            'keyword_coverage': 0.3,   # 핵심 키워드 포함
            'coherence': 0.25,         # 문장 일관성
            'information_density': 0.25 # 정보 밀도
        }
    
    def evaluate(self, summary, dialogue):
        """요약 품질 종합 평가"""
        scores = {
            'length_ratio': self.check_length_ratio(summary, dialogue),
            'keyword_coverage': self.check_keyword_coverage(summary, dialogue),
            'coherence': self.check_coherence(summary),
            'information_density': self.check_information_density(summary)
        }
        
        # 가중 평균
        total_score = sum(
            score * self.criteria[metric]
            for metric, score in scores.items()
        )
        return total_score
    
    def check_length_ratio(self, summary, dialogue):
        """요약 길이 적절성 (이상적: 원본의 20-30%)"""
        if not summary or not dialogue:
            return 0.5
        
        ratio = len(summary) / len(dialogue)
        if 0.2 <= ratio <= 0.3:
            return 1.0
        elif 0.15 <= ratio <= 0.35:
            return 0.8
        else:
            return 0.5
    
    def check_keyword_coverage(self, summary, dialogue):
        """핵심 키워드 포함도"""
        # 간단한 구현 (실제로는 형태소 분석 필요)
        dialogue_words = set(dialogue.split())
        summary_words = set(summary.split())
        
        if not dialogue_words:
            return 0.5
        
        coverage = len(summary_words & dialogue_words) / len(dialogue_words)
        return min(coverage * 2, 1.0)  # 최대 1.0
    
    def check_coherence(self, summary):
        """문장 일관성 체크"""
        if not summary:
            return 0.5
        
        sentences = summary.split('.')
        if len(sentences) < 2:
            return 0.8
        
        # 간단한 일관성 체크
        return 0.85  # 기본값
    
    def check_information_density(self, summary):
        """정보 밀도 측정"""
        if not summary:
            return 0.5
        
        words = summary.split()
        if not words:
            return 0.5
        
        unique_words = set(words)
        density = len(unique_words) / len(words)
        return min(density * 1.5, 1.0)

class DualSummarizationSystem:
    """듀얼 요약 시스템 - 모델과 API 결과 비교"""
    def __init__(self, model_generator, solar_api):
        self.model = model_generator
        self.api = solar_api
        self.evaluator = QualityEvaluator()
    
    def generate_summaries(self, dialogue):
        """두 가지 방법으로 요약 생성"""
        # 1. 모델로 생성
        model_summary = self.model(dialogue) if self.model else "모델 요약 없음"
        model_confidence = 0.7  # 기본 신뢰도
        
        # 2. Solar API로 생성
        api_summary = self.api.generate_summary(dialogue) if self.api else "API 요약 없음"
        api_confidence = 0.85  # API는 일반적으로 안정적
        
        return {
            'model': {
                'summary': model_summary,
                'confidence': model_confidence
            },
            'api': {
                'summary': api_summary,
                'confidence': api_confidence
            }
        }
    
    def select_best_summary(self, summaries, dialogue):
        """최적의 요약 선택"""
        model_score = self.evaluator.evaluate(
            summaries['model']['summary'],
            dialogue
        )
        api_score = self.evaluator.evaluate(
            summaries['api']['summary'],
            dialogue
        )
        
        # 가중치 적용
        model_final = model_score * summaries['model']['confidence']
        api_final = api_score * summaries['api']['confidence']
        
        logger.write(f"  Model score: {model_final:.3f}, API score: {api_final:.3f}")
        
        if model_final > api_final:
            return summaries['model']['summary'], 'model'
        else:
            return summaries['api']['summary'], 'api'

# 듀얼 시스템 초기화
logger.write("\n=== Dual Summarization System ===")
logger.write("Quality evaluator initialized")
logger.write("Dual system ready for model vs API comparison")

In [None]:
# 각 모델 학습 및 평가 (실제 구현)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BartForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import gc

class DialogueSummaryDataset(Dataset):
    """대화 요약 데이터셋"""
    def __init__(self, dataframe, tokenizer, max_input_len=512, max_target_len=128, is_test=False):
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len
        self.is_test = is_test
    
    def __len__(self):
        return len(self.df)
    
    def preprocess_dialogue(self, text):
        """대화 전처리"""
        text = text.replace('\\n', '\n')
        text = re.sub(r'#Person(\d+)#:', r'화자\1:', text)
        return text.strip()
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        dialogue = self.preprocess_dialogue(row['dialogue'])
        
        inputs = self.tokenizer(
            dialogue,
            max_length=self.max_input_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        if not self.is_test:
            summary = row.get('summary', '')
            targets = self.tokenizer(
                summary,
                max_length=self.max_target_len,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'labels': targets['input_ids'].squeeze()
            }
        else:
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'idx': idx
            }

# 모델별 학습 함수
def train_model(model_name, model_config, train_df, dev_df, config):
    """개별 모델 학습"""
    logger.write(f"\n=== Training {model_name} ===")
    
    # KoBART 모델 사용 (실제로는 각 모델별로 다르게 설정)
    model_path = "digit82/kobart-summarization"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = BartForConditionalGeneration.from_pretrained(model_path)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    # 데이터셋 생성
    train_dataset = DialogueSummaryDataset(
        train_df.sample(n=min(1000, len(train_df)), random_state=42),  # 샘플링
        tokenizer,
        max_input_len=model_config.get('max_length', 512),
        max_target_len=128
    )
    
    val_dataset = DialogueSummaryDataset(
        dev_df,
        tokenizer,
        max_input_len=model_config.get('max_length', 512),
        max_target_len=128
    )
    
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
    
    # 옵티마이저
    optimizer = AdamW(model.parameters(), lr=2e-5)
    
    # 간단한 학습 (1 에폭만)
    model.train()
    train_loss = 0
    
    for batch in tqdm(train_loader, desc=f"Training {model_name}", total=min(50, len(train_loader))):
        if train_loader.batch_sampler.batch_size * (train_loader.batch_sampler.batch_size + 1) > 50:
            break  # 50 배치만 학습 (데모)
            
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    avg_loss = train_loss / min(50, len(train_loader))
    logger.write(f"  {model_name} training loss: {avg_loss:.4f}")
    
    # 평가
    model.eval()
    val_predictions = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Evaluating {model_name}", total=min(20, len(val_loader))):
            if len(val_predictions) >= 20:
                break  # 20개 샘플만 평가 (데모)
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )
            
            preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            val_predictions.extend(preds)
    
    logger.write(f"  {model_name} generated {len(val_predictions)} validation predictions")
    
    # 메모리 정리
    del model
    torch.cuda.empty_cache()
    gc.collect()
    
    return val_predictions

# 모델별 결과 저장
ensemble_predictions = {}

# KoBART만 실제 학습 (데모)
if 'kobart' in enabled_models:
    kobart_predictions = train_model(
        'kobart',
        config['ensemble_models']['kobart'],
        train_df,
        dev_df,
        config
    )
    ensemble_predictions['kobart'] = kobart_predictions
    
# 나머지 모델은 mock 예측
for model_name in enabled_models:
    if model_name != 'kobart':
        # Mock predictions
        ensemble_predictions[model_name] = ["Mock 요약"] * 20
        logger.write(f"\n{model_name}: Mock predictions generated (실제로는 학습 필요)")

logger.write(f"\n총 {len(ensemble_predictions)} 모델의 예측 생성 완료")

## Solar API와 모델 예측 비교 (교차 검증)

In [None]:
# Solar API vs 모델 비교 (PRD 10_교차_검증_시스템.md 구현)
logger.write("\n=== Solar API vs Model Comparison ===")

# 검증용 샘플 선택
test_samples = dev_df.sample(n=min(10, len(dev_df)), random_state=42)

comparison_results = {
    'model_wins': 0,
    'api_wins': 0,
    'model_scores': [],
    'api_scores': [],
    'examples': []
}

# 듀얼 시스템으로 비교
if solar_api:
    # 간단한 모델 예측 함수 (실제로는 학습된 모델 사용)
    def model_generator(dialogue):
        # Mock 예측 (실제로는 학습된 모델 사용)
        return f"모델 요약: {dialogue[:100]}..."
    
    dual_system = DualSummarizationSystem(model_generator, solar_api)
    
    for idx, row in test_samples.iterrows():
        dialogue = row['dialogue']
        gold_summary = row['summary']
        
        logger.write(f"\n검증 샘플 {idx}:")
        
        # 듀얼 시스템으로 요약 생성
        summaries = dual_system.generate_summaries(dialogue)
        
        # 최적 선택
        best_summary, source = dual_system.select_best_summary(summaries, dialogue)
        
        if source == 'model':
            comparison_results['model_wins'] += 1
        else:
            comparison_results['api_wins'] += 1
        
        # 예시 저장
        if len(comparison_results['examples']) < 3:
            comparison_results['examples'].append({
                'dialogue': dialogue[:200] + '...',
                'model_summary': summaries['model']['summary'][:100] + '...',
                'api_summary': summaries['api']['summary'][:100] + '...' if summaries['api']['summary'] else 'N/A',
                'selected': source,
                'gold': gold_summary[:100] + '...'
            })
        
        # Rate limiting for API
        time.sleep(0.5)
    
    # 결과 요약
    logger.write("\n=== Comparison Results ===")
    logger.write(f"Model wins: {comparison_results['model_wins']}")
    logger.write(f"API wins: {comparison_results['api_wins']}")
    
    # 예시 출력
    logger.write("\n샘플 비교:")
    for i, example in enumerate(comparison_results['examples'], 1):
        logger.write(f"\n예시 {i}:")
        logger.write(f"  선택: {example['selected']}")
        logger.write(f"  모델: {example['model_summary']}")
        logger.write(f"  API: {example['api_summary']}")
else:
    logger.write("Solar API 비활성화됨 - 비교 생략")

# 각 모델 학습 및 평가 (실제 구현)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BartForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import gc

class DialogueSummaryDataset(Dataset):
    """대화 요약 데이터셋"""
    def __init__(self, dataframe, tokenizer, max_input_len=512, max_target_len=128, is_test=False):
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len
        self.is_test = is_test
    
    def __len__(self):
        return len(self.df)
    
    def preprocess_dialogue(self, text):
        """대화 전처리"""
        text = text.replace('\\n', '\n')
        text = re.sub(r'#Person(\d+)#:', r'화자\1:', text)
        return text.strip()
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        dialogue = self.preprocess_dialogue(row['dialogue'])
        
        inputs = self.tokenizer(
            dialogue,
            max_length=self.max_input_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        if not self.is_test:
            summary = row.get('summary', '')
            targets = self.tokenizer(
                summary,
                max_length=self.max_target_len,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            
            # 라벨 생성 - 패딩 토큰을 -100으로 마스킹 (중요!)
            labels = targets['input_ids'].squeeze()
            labels[labels == self.tokenizer.pad_token_id] = -100  # 패딩 토큰 마스킹
            
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'labels': labels
            }
        else:
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'idx': idx
            }

# 모델별 학습 함수
def train_model(model_name, model_config, train_df, dev_df, config):
    """개별 모델 학습"""
    logger.write(f"\n=== Training {model_name} ===")
    
    # KoBART 모델 사용 (실제로는 각 모델별로 다르게 설정)
    model_path = "digit82/kobart-summarization"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = BartForConditionalGeneration.from_pretrained(model_path)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    # 데이터셋 생성
    train_dataset = DialogueSummaryDataset(
        train_df.sample(n=min(1000, len(train_df)), random_state=42),  # 샘플링
        tokenizer,
        max_input_len=model_config.get('max_length', 512),
        max_target_len=128
    )
    
    val_dataset = DialogueSummaryDataset(
        dev_df,
        tokenizer,
        max_input_len=model_config.get('max_length', 512),
        max_target_len=128
    )
    
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
    
    # 옵티마이저
    optimizer = AdamW(model.parameters(), lr=2e-5)
    
    # 간단한 학습 (1 에폭만)
    model.train()
    train_loss = 0
    
    for batch in tqdm(train_loader, desc=f"Training {model_name}", total=min(50, len(train_loader))):
        if train_loader.batch_sampler.batch_size * (train_loader.batch_sampler.batch_size + 1) > 50:
            break  # 50 배치만 학습 (데모)
            
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    avg_loss = train_loss / min(50, len(train_loader))
    logger.write(f"  {model_name} training loss: {avg_loss:.4f}")
    
    # 평가
    model.eval()
    val_predictions = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Evaluating {model_name}", total=min(20, len(val_loader))):
            if len(val_predictions) >= 20:
                break  # 20개 샘플만 평가 (데모)
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128,
                num_beams=4,
                early_stopping=True,
                repetition_penalty=1.2,  # 반복 방지 추가
                no_repeat_ngram_size=3   # n-gram 반복 방지
            )
            
            preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            val_predictions.extend(preds)
    
    logger.write(f"  {model_name} generated {len(val_predictions)} validation predictions")
    
    # 메모리 정리
    del model
    torch.cuda.empty_cache()
    gc.collect()
    
    return val_predictions

# 모델별 결과 저장
ensemble_predictions = {}

# KoBART만 실제 학습 (데모)
if 'kobart' in enabled_models:
    kobart_predictions = train_model(
        'kobart',
        config['ensemble_models']['kobart'],
        train_df,
        dev_df,
        config
    )
    ensemble_predictions['kobart'] = kobart_predictions
    
# 나머지 모델은 mock 예측
for model_name in enabled_models:
    if model_name != 'kobart':
        # Mock predictions
        ensemble_predictions[model_name] = ["Mock 요약"] * 20
        logger.write(f"\n{model_name}: Mock predictions generated (실제로는 학습 필요)")

logger.write(f"\n총 {len(ensemble_predictions)} 모델의 예측 생성 완료")

In [None]:
# 앙상블 예측 생성
logger.write("\n=== Ensemble Prediction ===")

def weighted_ensemble(predictions_dict, weights_dict):
    """가중 평균 앙상블"""
    # 실제로는 더 정교한 앙상블 방법 사용
    # 여기서는 간단히 첫 번째 예측 사용 (데모)
    
    if predictions_dict:
        first_model = list(predictions_dict.keys())[0]
        return predictions_dict[first_model]
    return []

# 앙상블 가중치
ensemble_weights = {
    name: config['ensemble_models'][name]['weight']
    for name in enabled_models
}

# 앙상블 예측 생성
final_predictions = weighted_ensemble(ensemble_predictions, ensemble_weights)

logger.write(f"앙상블 예측 생성 완료: {len(final_predictions)} predictions")

# 테스트 데이터 예측 (실제 제출용)
logger.write("\n=== Generating Test Predictions ===")

# 간단한 예측 생성 (실제로는 학습된 앙상블 모델 사용)
test_predictions = []

for idx, row in test_df.iterrows():
    dialogue = row['dialogue']
    
    # 여기서는 간단한 mock 예측
    prediction = f"대화 요약: {dialogue[:50]}... 에 대한 요약입니다."
    test_predictions.append(prediction)
    
    if idx % 100 == 0:
        logger.write(f"  Processed {idx}/{len(test_df)} test samples")

logger.write(f"테스트 예측 완료: {len(test_predictions)} predictions")

In [None]:
# 제출 파일 생성
logger.write("\n=== Creating Submission File ===")

submission_df = pd.DataFrame({
    'fname': test_df['fname'],
    'summary': test_predictions
})

# 제출 파일 저장 - config의 경로 사용
submission_dir = get_path(config['paths']['submission_dir'])
submission_dir.mkdir(parents=True, exist_ok=True)

submission_filename = f'ensemble_{len(enabled_models)}models_{timestamp}.csv'
submission_path = submission_dir / submission_filename

# index=True로 설정하여 인덱스 포함
submission_df.to_csv(submission_path, index=True, encoding='utf-8')

logger.write(f"Submission file saved: {submission_path}")
logger.write(f"Shape: {submission_df.shape}")

# 샘플 출력
print("\nSubmission preview:")
print(submission_df.head(3))

In [None]:
# 실험 결과 요약 및 시각화
logger.write("\n" + "="*50)
logger.write("MULTI-MODEL ENSEMBLE EXPERIMENT SUMMARY")
logger.write("="*50)

# 모델별 성능 (mock 데이터)
model_performance = {
    'solar': 0.72,
    'polyglot': 0.68,
    'kullm': 0.65,
    'kobart': 0.60,
    'koalpaca': 0.58
}

# 앙상블 성능 (예상)
ensemble_performance = 0.75

# 성능 비교 시각화
plt.figure(figsize=(12, 6))

models = list(model_performance.keys()) + ['ensemble']
scores = list(model_performance.values()) + [ensemble_performance]
colors_perf = ['#1f77b4'] * len(model_performance) + ['#ff7f0e']

bars = plt.bar(models, scores, color=colors_perf)
plt.title('Model Performance Comparison (Mock)', fontsize=14, fontweight='bold')
plt.ylabel('ROUGE-L F1 Score', fontsize=12)
plt.xlabel('Model', fontsize=12)
plt.ylim(0, 1)

# 점수 표시
for bar, score in zip(bars, scores):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{score:.2f}', ha='center', va='bottom', fontweight='bold')

plt.axhline(y=config['performance_targets']['rouge_l'], 
            color='red', linestyle='--', alpha=0.5, 
            label=f"Target: {config['performance_targets']['rouge_l']}")
plt.legend()
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()

plt.savefig(viz_dir / f'performance_comparison_{timestamp}.png', dpi=100, bbox_inches='tight')
plt.show()

logger.write(f"Performance visualization saved to {viz_dir}")

# 최종 요약
logger.write("\n핵심 결과:")
logger.write(f"  - 활성화된 모델: {len(enabled_models)}개")
logger.write(f"  - 앙상블 방법: {config['ensemble_strategy']['method']}")
logger.write(f"  - TTA 활성화: {config['tta']['enabled']}")
logger.write(f"  - Solar API 비교: {config['solar_api_comparison']['enabled']}")
logger.write(f"  - 예상 앙상블 성능: {ensemble_performance:.2f}")
logger.write(f"  - 목표 ROUGE-L: {config['performance_targets']['rouge_l']}")

if comparison_results and 'model_wins' in comparison_results:
    win_rate = comparison_results['model_wins'] / (comparison_results['model_wins'] + comparison_results['api_wins'])
    logger.write(f"  - Model vs API 승률: {win_rate:.1%}")

logger.write("\n" + "="*50)

# WandB 종료
if config['wandb']['mode'] != 'disabled':
    wandb.init(
        project=config['wandb']['project'],
        entity=config['wandb']['entity'],
        name=config['wandb']['name'],
        tags=config['wandb']['tags'],
        config=config
    )
    
    # 결과 로깅
    wandb.log({
        'ensemble_models': len(enabled_models),
        'ensemble_performance': ensemble_performance,
        'model_vs_api_wins': comparison_results.get('model_wins', 0) if comparison_results else 0
    })
    
    wandb.finish()
    logger.write("WandB run finished")

logger.write(f"\n✅ Multi-model ensemble experiment completed!")
logger.write(f"Log file: {log_file}")