# 앙상블 모델 학습 및 평가 (Ensemble Model Training & Evaluation)

## 목표
- XGBoost와 LightGBM을 결합한 앙상블 모델 구현
- 다양한 앙상블 기법 비교 (Averaging, Weighted, Voting, Stacking)
- 최적의 앙상블 모델 선정 및 성능 향상 확인

## 작업 내역
1. 환경 설정 및 데이터 로드
2. 기존 학습된 모델 로드 (XGBoost, LightGBM)
3. Simple Averaging Ensemble
4. Weighted Ensemble (성능 기반)
5. Optimized Weighted Ensemble
6. Voting Ensemble (Soft/Hard)
7. Stacking Ensemble
8. 모델 성능 비교 및 시각화
9. 앙상블 분석 (가중치, 예측 차이, Hard cases)
10. 최종 모델 저장

## 1. 환경 설정

In [None]:
# 기본 라이브러리
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import pickle
import json
warnings.filterwarnings('ignore')

# 머신러닝 라이브러리
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_recall_curve, auc,
    precision_score, recall_score, f1_score, average_precision_score,
    roc_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# 모델
import xgboost as xgb
import lightgbm as lgb

# Plot defaults: stock matplotlib style, seaborn 'husl' palette, wide figures.
plt.style.use('default')
sns.set_palette('husl')
for rc_key, rc_value in (
    ('figure.figsize', (12, 6)),
    ('font.family', 'DejaVu Sans'),
):
    plt.rcParams[rc_key] = rc_value

# Let pandas print up to 100 rows / columns before truncating.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

# Single seed shared by NumPy and every estimator created in this notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("Libraries loaded successfully")

In [None]:
# Import project-local modules.
import sys
# Put the parent directory on the import path (presumably where the
# `pipeline` package lives — confirm against the repository layout).
sys.path.append('..')

# Ensemble wrappers used in the weighted / voting sections below.
from pipeline.models import EnsembleModel, VotingEnsemble
# Evaluation helpers; imported here but not called in the visible cells.
from pipeline.evaluation import (
    ModelEvaluator,
    calculate_metrics,
    plot_roc_curve,
    plot_precision_recall_curve,
    plot_confusion_matrix
)

print("Custom modules loaded successfully")

## 2. 데이터 로드 및 전처리

In [None]:
# Load the feature-engineered dataset, falling back to the preprocessed one
# when the featured file has not been generated yet.
featured_path = Path('../data/processed/featured_data.csv')
fallback_path = Path('../data/processed/preprocessed_data.csv')

if featured_path.exists():
    data_path = featured_path
else:
    print("Warning: featured_data.csv not found. Using preprocessed_data.csv instead.")
    data_path = fallback_path

df = pd.read_csv(data_path)

n_rows, n_cols = df.shape
ta_ym = df['TA_YM']

print(f"Loaded data shape: {df.shape}")
print(f"\nColumns: {n_cols}")
print(f"Rows: {n_rows:,}")
print(f"Date range: {ta_ym.min()} ~ {ta_ym.max()}")

In [None]:
# Target column and the columns that must never be used as model inputs.
target_col = 'is_closed'

# Besides the target itself this lists identifier, period and descriptive
# columns (judging by their names — confirm against the data dictionary).
exclude_cols = [
    'ENCODED_MCT',
    'TA_YM',
    'is_closed',
    'will_close_1m',
    'will_close_3m',
    'months_until_close',
    'MCT_ME_D',
    'MCT_BSE_AR',
    'MCT_NM',
    'MCT_BRD_NUM',
    'MCT_SIGUNGU_NM',
    'HPSN_MCT_ZCD_NM',
    'HPSN_MCT_BZN_CD_NM',
    'ARE_D',
]

# Features = every numeric column that is not excluded, in DataFrame order.
excluded_names = set(exclude_cols)
numeric_columns = df.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in numeric_columns if col not in excluded_names]

print(f"Total features: {len(feature_cols)}")

In [None]:
# Separate the feature matrix from the target.
X = df[feature_cols].copy()
y = df[target_col].copy()

# Replace missing values with each column's median.
# NOTE(review): the median is computed over ALL rows, i.e. before the
# train/validation/test split, so later periods influence the imputation
# of the training rows — confirm this matches how the loaded models were fit.
n_missing = X.isnull().sum().sum()
if n_missing > 0:
    print(f"Filling {n_missing} missing values with median...")
    column_medians = X.median()
    X = X.fillna(column_medians)

positive_rate_pct = y.mean() * 100

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print("\nTarget distribution:")
print(y.value_counts())
print(f"Positive rate: {positive_rate_pct:.2f}%")

In [None]:
# Chronological train / validation / test split keyed on TA_YM (YYYYMM).
df_with_features = df[['TA_YM', target_col]].join(X)
period = df_with_features['TA_YM']

# `between` is inclusive on both ends, matching `>= lo & <= hi`.
train_mask = period.between(202301, 202312)   # Train: all of 2023
val_mask = period.between(202401, 202406)     # Validation: Jan-Jun 2024
test_mask = period.between(202407, 202412)    # Test: Jul-Dec 2024

X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]
X_test, y_test = X[test_mask], y[test_mask]

print("Data split completed:")
for split_label, X_part, y_part in (
    ("\nTrain: ", X_train, y_train),
    ("Val:   ", X_val, y_val),
    ("Test:  ", X_test, y_test),
):
    print(
        f"{split_label}{X_part.shape[0]:,} samples, "
        f"Positive: {y_part.sum():,} ({y_part.mean()*100:.2f}%)"
    )

## 3. 기존 학습된 모델 로드

In [None]:
# Directory holding the previously trained base models.
model_dir = Path('../models')


def _load_pickled_model(filename):
    """Unpickle and return the object stored in ``model_dir / filename``.

    pickle can execute arbitrary code while loading, so only files produced
    by this project should ever be placed in the models directory.
    """
    with (model_dir / filename).open('rb') as fh:
        return pickle.load(fh)


# XGBoost (tuned)
xgb_model = _load_pickled_model('xgboost_best.pkl')
print("Loaded: XGBoost (Tuned)")

# LightGBM (baseline)
lgb_model = _load_pickled_model('lightgbm_baseline.pkl')
print("Loaded: LightGBM (Baseline)")

print("\nModels loaded successfully!")

In [None]:
# Performance of the two loaded base models on the validation set.
def _print_metrics(heading, y_true, y_label, y_score):
    """Print ``heading`` followed by five classification metrics.

    Precision, recall and F1 are computed from the hard labels ``y_label``;
    ROC-AUC and PR-AUC (average precision) from the scores ``y_score``.
    """
    print(heading)
    for caption, scorer, y_hat in (
        ("Precision:", precision_score, y_label),
        ("Recall:", recall_score, y_label),
        ("F1 Score:", f1_score, y_label),
        ("ROC-AUC:", roc_auc_score, y_score),
        ("PR-AUC:", average_precision_score, y_score),
    ):
        print(f"  {caption:<11}{scorer(y_true, y_hat):.4f}")


rule = "=" * 80
print("\n".join([rule, "Baseline Model Performance - Validation Set", rule]))

# XGBoost: hard labels plus positive-class probability.
y_val_pred_xgb = xgb_model.predict(X_val)
y_val_pred_proba_xgb = xgb_model.predict_proba(X_val)[:, 1]
_print_metrics("\nXGBoost (Tuned):", y_val, y_val_pred_xgb, y_val_pred_proba_xgb)

# LightGBM: same two outputs.
y_val_pred_lgb = lgb_model.predict(X_val)
y_val_pred_proba_lgb = lgb_model.predict_proba(X_val)[:, 1]
_print_metrics("\nLightGBM (Baseline):", y_val, y_val_pred_lgb, y_val_pred_proba_lgb)

In [None]:
# Performance of the two loaded base models on the test set.
def _print_metrics(heading, y_true, y_label, y_score):
    """Print ``heading`` followed by five classification metrics.

    Precision, recall and F1 are computed from the hard labels ``y_label``;
    ROC-AUC and PR-AUC (average precision) from the scores ``y_score``.
    """
    print(heading)
    for caption, scorer, y_hat in (
        ("Precision:", precision_score, y_label),
        ("Recall:", recall_score, y_label),
        ("F1 Score:", f1_score, y_label),
        ("ROC-AUC:", roc_auc_score, y_score),
        ("PR-AUC:", average_precision_score, y_score),
    ):
        print(f"  {caption:<11}{scorer(y_true, y_hat):.4f}")


rule = "=" * 80
print("\n".join([rule, "Baseline Model Performance - Test Set", rule]))

# XGBoost: hard labels plus positive-class probability.
y_test_pred_xgb = xgb_model.predict(X_test)
y_test_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
_print_metrics("\nXGBoost (Tuned):", y_test, y_test_pred_xgb, y_test_pred_proba_xgb)

# LightGBM: same two outputs.
y_test_pred_lgb = lgb_model.predict(X_test)
y_test_pred_proba_lgb = lgb_model.predict_proba(X_test)[:, 1]
_print_metrics("\nLightGBM (Baseline):", y_test, y_test_pred_lgb, y_test_pred_proba_lgb)

## 4. Simple Averaging Ensemble

가장 간단한 앙상블 방법: 두 모델의 예측 확률을 단순 평균

In [None]:
# Simple averaging: mean of the two base-model probabilities, cut at 0.5.
def _print_metrics(heading, y_true, y_label, y_score):
    """Print ``heading`` followed by five classification metrics.

    Precision, recall and F1 are computed from the hard labels ``y_label``;
    ROC-AUC and PR-AUC (average precision) from the scores ``y_score``.
    """
    print(heading)
    for caption, scorer, y_hat in (
        ("Precision:", precision_score, y_label),
        ("Recall:", recall_score, y_label),
        ("F1 Score:", f1_score, y_label),
        ("ROC-AUC:", roc_auc_score, y_score),
        ("PR-AUC:", average_precision_score, y_score),
    ):
        print(f"  {caption:<11}{scorer(y_true, y_hat):.4f}")


def _average_and_threshold(proba_a, proba_b):
    """Return (mean probability, 0/1 labels using a 0.5 cut-off)."""
    mean_proba = (proba_a + proba_b) / 2
    return mean_proba, (mean_proba >= 0.5).astype(int)


y_val_pred_proba_avg, y_val_pred_avg = _average_and_threshold(
    y_val_pred_proba_xgb, y_val_pred_proba_lgb
)
y_test_pred_proba_avg, y_test_pred_avg = _average_and_threshold(
    y_test_pred_proba_xgb, y_test_pred_proba_lgb
)

rule = "=" * 80
print("\n".join([rule, "Simple Averaging Ensemble", rule]))

_print_metrics("\nValidation Set:", y_val, y_val_pred_avg, y_val_pred_proba_avg)
_print_metrics("\nTest Set:", y_test, y_test_pred_avg, y_test_pred_proba_avg)

## 5. Weighted Ensemble (성능 기반)

Validation set의 ROC-AUC 성능에 비례하는 가중치 사용

In [None]:
# Weights proportional to each base model's validation ROC-AUC.
roc_auc_xgb = roc_auc_score(y_val, y_val_pred_proba_xgb)
roc_auc_lgb = roc_auc_score(y_val, y_val_pred_proba_lgb)

# Normalise by the sum so the two weights add up to 1.
roc_auc_total = roc_auc_xgb + roc_auc_lgb
weight_xgb = roc_auc_xgb / roc_auc_total
weight_lgb = roc_auc_lgb / roc_auc_total

print("Performance-based weights:")
for model_label, weight, score in (
    ("XGBoost", weight_xgb, roc_auc_xgb),
    ("LightGBM", weight_lgb, roc_auc_lgb),
):
    print(f"  {model_label}: {weight:.4f} (ROC-AUC: {score:.4f})")

In [None]:
# Weighted ensemble: blend the two probabilities with the ROC-AUC weights.
def _print_metrics(heading, y_true, y_label, y_score):
    """Print ``heading`` followed by five classification metrics.

    Precision, recall and F1 are computed from the hard labels ``y_label``;
    ROC-AUC and PR-AUC (average precision) from the scores ``y_score``.
    """
    print(heading)
    for caption, scorer, y_hat in (
        ("Precision:", precision_score, y_label),
        ("Recall:", recall_score, y_label),
        ("F1 Score:", f1_score, y_label),
        ("ROC-AUC:", roc_auc_score, y_score),
        ("PR-AUC:", average_precision_score, y_score),
    ):
        print(f"  {caption:<11}{scorer(y_true, y_hat):.4f}")


def _blend_and_threshold(proba_xgb, proba_lgb):
    """Return (weighted probability, 0/1 labels using a 0.5 cut-off)."""
    blended = weight_xgb * proba_xgb + weight_lgb * proba_lgb
    return blended, (blended >= 0.5).astype(int)


y_val_pred_proba_weighted, y_val_pred_weighted = _blend_and_threshold(
    y_val_pred_proba_xgb, y_val_pred_proba_lgb
)
y_test_pred_proba_weighted, y_test_pred_weighted = _blend_and_threshold(
    y_test_pred_proba_xgb, y_test_pred_proba_lgb
)

rule = "=" * 80
print("\n".join([rule, "Weighted Ensemble (Performance-based)", rule]))

_print_metrics("\nValidation Set:", y_val, y_val_pred_weighted, y_val_pred_proba_weighted)
_print_metrics("\nTest Set:", y_test, y_test_pred_weighted, y_test_pred_proba_weighted)

## 6. Optimized Weighted Ensemble

`EnsembleModel.optimize_weights()` 메서드를 사용하여 최적 가중치 탐색

In [None]:
# Build the project's EnsembleModel over both base models, starting from
# equal weights.
ensemble_model = EnsembleModel(models=[xgb_model, lgb_model], weights=[0.5, 0.5])

print("Optimizing ensemble weights using Validation set...")
print("This may take a few moments...\n")

# Search for weights using the validation set and ROC-AUC as the metric.
# NOTE(review): presumably the metric is maximised and the optimised weights
# are also stored on `ensemble_model` (later cells predict with it directly)
# — confirm in pipeline.models.EnsembleModel.optimize_weights.
optimized_weights = ensemble_model.optimize_weights(
    X_val=X_val,
    y_val=y_val,
    metric_func=roc_auc_score
)

# Index 0 / 1 are reported as XGBoost / LightGBM, matching the order of the
# `models` list passed to the constructor above.
print(f"Optimized weights:")
print(f"  XGBoost:  {optimized_weights[0]:.4f}")
print(f"  LightGBM: {optimized_weights[1]:.4f}")

In [None]:
# Predictions and metrics of the weight-optimised ensemble.
def _print_metrics(heading, y_true, y_label, y_score):
    """Print ``heading`` followed by five classification metrics.

    Precision, recall and F1 are computed from the hard labels ``y_label``;
    ROC-AUC and PR-AUC (average precision) from the scores ``y_score``.
    """
    print(heading)
    for caption, scorer, y_hat in (
        ("Precision:", precision_score, y_label),
        ("Recall:", recall_score, y_label),
        ("F1 Score:", f1_score, y_label),
        ("ROC-AUC:", roc_auc_score, y_score),
        ("PR-AUC:", average_precision_score, y_score),
    ):
        print(f"  {caption:<11}{scorer(y_true, y_hat):.4f}")


# Validation first, then test; column 1 is the positive-class probability.
y_val_pred_proba_optimized = ensemble_model.predict_proba(X_val)[:, 1]
y_val_pred_optimized = ensemble_model.predict(X_val)

y_test_pred_proba_optimized = ensemble_model.predict_proba(X_test)[:, 1]
y_test_pred_optimized = ensemble_model.predict(X_test)

rule = "=" * 80
print("\n".join([rule, "Optimized Weighted Ensemble", rule]))

_print_metrics("\nValidation Set:", y_val, y_val_pred_optimized, y_val_pred_proba_optimized)
_print_metrics("\nTest Set:", y_test, y_test_pred_optimized, y_test_pred_proba_optimized)

## 7. Voting Ensemble

Soft voting (확률 평균)과 Hard voting (다수결) 비교

In [None]:
# Soft-voting ensemble over the two base models.
def _print_metrics(heading, y_true, y_label, y_score):
    """Print ``heading`` followed by five classification metrics.

    Precision, recall and F1 are computed from the hard labels ``y_label``;
    ROC-AUC and PR-AUC (average precision) from the scores ``y_score``.
    """
    print(heading)
    for caption, scorer, y_hat in (
        ("Precision:", precision_score, y_label),
        ("Recall:", recall_score, y_label),
        ("F1 Score:", f1_score, y_label),
        ("ROC-AUC:", roc_auc_score, y_score),
        ("PR-AUC:", average_precision_score, y_score),
    ):
        print(f"  {caption:<11}{scorer(y_true, y_hat):.4f}")


voting_soft = VotingEnsemble(models=[xgb_model, lgb_model], voting='soft')

# Validation first, then test; column 1 is the positive-class probability.
y_val_pred_proba_soft = voting_soft.predict_proba(X_val)[:, 1]
y_val_pred_soft = voting_soft.predict(X_val)

y_test_pred_proba_soft = voting_soft.predict_proba(X_test)[:, 1]
y_test_pred_soft = voting_soft.predict(X_test)

rule = "=" * 80
print("\n".join([rule, "Soft Voting Ensemble", rule]))

_print_metrics("\nValidation Set:", y_val, y_val_pred_soft, y_val_pred_proba_soft)
_print_metrics("\nTest Set:", y_test, y_test_pred_soft, y_test_pred_proba_soft)

In [None]:
# Hard-voting ensemble over the two base models.
def _print_metrics(heading, y_true, y_label, y_score):
    """Print ``heading`` followed by five classification metrics.

    Precision, recall and F1 are computed from the hard labels ``y_label``;
    ROC-AUC and PR-AUC (average precision) from the scores ``y_score``.
    """
    print(heading)
    for caption, scorer, y_hat in (
        ("Precision:", precision_score, y_label),
        ("Recall:", recall_score, y_label),
        ("F1 Score:", f1_score, y_label),
        ("ROC-AUC:", roc_auc_score, y_score),
        ("PR-AUC:", average_precision_score, y_score),
    ):
        print(f"  {caption:<11}{scorer(y_true, y_hat):.4f}")


voting_hard = VotingEnsemble(models=[xgb_model, lgb_model], voting='hard')

# Hard labels for both splits.
y_val_pred_hard = voting_hard.predict(X_val)
y_test_pred_hard = voting_hard.predict(X_test)

# Scores for the ranking metrics (ROC-AUC / PR-AUC).
# NOTE(review): the original comment here said hard voting has no
# predict_proba and that the soft-voting probabilities are used instead, yet
# the code calls `voting_hard.predict_proba`. Confirm what VotingEnsemble
# returns from predict_proba in 'hard' mode.
y_val_pred_proba_hard = voting_hard.predict_proba(X_val)[:, 1]
y_test_pred_proba_hard = voting_hard.predict_proba(X_test)[:, 1]

rule = "=" * 80
print("\n".join([rule, "Hard Voting Ensemble", rule]))

_print_metrics("\nValidation Set:", y_val, y_val_pred_hard, y_val_pred_proba_hard)
_print_metrics("\nTest Set:", y_test, y_test_pred_hard, y_test_pred_proba_hard)

## 8. Stacking Ensemble

Base models의 예측을 feature로 사용하여 메타 모델(Logistic Regression) 학습

In [None]:
# Build out-of-fold (OOF) predictions on the training set (5 folds). These
# become the two input features of the stacking meta model.
from sklearn.model_selection import StratifiedKFold

print("Generating out-of-fold predictions for stacking...")
print("This may take a few minutes...\n")

n_folds = 5
# NOTE(review): shuffle=True mixes rows from all training months across
# folds, so the folds are not chronological — confirm that is acceptable.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)

# One row per training sample; column 0 = XGBoost, column 1 = LightGBM.
train_meta_features = np.zeros((len(X_train), 2))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print(f"Fold {fold}/{n_folds}...")
    
    # Positional indices from the splitter, hence .iloc.
    X_fold_train = X_train.iloc[train_idx]
    y_fold_train = y_train.iloc[train_idx]
    X_fold_val = X_train.iloc[val_idx]
    
    # XGBoost: fresh estimator with the loaded model's hyper-parameters,
    # fit on this fold's training part only.
    fold_xgb = xgb.XGBClassifier(**xgb_model.get_params())
    fold_xgb.fit(X_fold_train, y_fold_train, verbose=False)
    train_meta_features[val_idx, 0] = fold_xgb.predict_proba(X_fold_val)[:, 1]
    
    # LightGBM: same idea, with training output suppressed.
    lgb_params = lgb_model.get_params()
    lgb_params['verbose'] = -1  # Suppress output
    fold_lgb = lgb.LGBMClassifier(**lgb_params)
    fold_lgb.fit(X_fold_train, y_fold_train)
    train_meta_features[val_idx, 1] = fold_lgb.predict_proba(X_fold_val)[:, 1]

print("\nOOF predictions completed!")

In [None]:
# Meta features for validation / test: the base models' positive-class
# probabilities side by side (column 0 = XGBoost, column 1 = LightGBM).
val_meta_features = np.column_stack(
    (y_val_pred_proba_xgb, y_val_pred_proba_lgb)
)
test_meta_features = np.column_stack(
    (y_test_pred_proba_xgb, y_test_pred_proba_lgb)
)

for split_label, meta_matrix in (
    ("Train", train_meta_features),
    ("Val", val_meta_features),
    ("Test", test_meta_features),
):
    print(f"{split_label} meta features shape: {meta_matrix.shape}")

In [None]:
# Fit the stacking meta model (logistic regression) on the OOF features.
print("Training meta model (Logistic Regression)...\n")

meta_model = LogisticRegression(
    random_state=RANDOM_STATE,
    max_iter=1000,
).fit(train_meta_features, y_train)

# coef_ has one row for the binary problem; its columns follow the meta
# feature order (XGBoost, LightGBM).
stacking_coefs = meta_model.coef_[0]
stacking_intercept = meta_model.intercept_[0]

print("Meta model coefficients:")
print(f"  XGBoost weight:  {stacking_coefs[0]:.4f}")
print(f"  LightGBM weight: {stacking_coefs[1]:.4f}")
print(f"  Intercept: {stacking_intercept:.4f}")

In [None]:
# Predictions and metrics of the stacking ensemble.
def _print_metrics(heading, y_true, y_label, y_score):
    """Print ``heading`` followed by five classification metrics.

    Precision, recall and F1 are computed from the hard labels ``y_label``;
    ROC-AUC and PR-AUC (average precision) from the scores ``y_score``.
    """
    print(heading)
    for caption, scorer, y_hat in (
        ("Precision:", precision_score, y_label),
        ("Recall:", recall_score, y_label),
        ("F1 Score:", f1_score, y_label),
        ("ROC-AUC:", roc_auc_score, y_score),
        ("PR-AUC:", average_precision_score, y_score),
    ):
        print(f"  {caption:<11}{scorer(y_true, y_hat):.4f}")


# The meta model consumes the two-column meta features, not the raw inputs.
y_val_pred_proba_stack = meta_model.predict_proba(val_meta_features)[:, 1]
y_val_pred_stack = meta_model.predict(val_meta_features)

y_test_pred_proba_stack = meta_model.predict_proba(test_meta_features)[:, 1]
y_test_pred_stack = meta_model.predict(test_meta_features)

rule = "=" * 80
print("\n".join([rule, "Stacking Ensemble (Logistic Regression)", rule]))

_print_metrics("\nValidation Set:", y_val, y_val_pred_stack, y_val_pred_proba_stack)
_print_metrics("\nTest Set:", y_test, y_test_pred_stack, y_test_pred_proba_stack)

## 9. 모델 성능 비교

In [None]:
# Comparison table: one row per model, validation and test metrics as columns.
comparison_results = []

# (display name, val labels, val scores, test labels, test scores)
models_info = [
    ('XGBoost (Tuned)', y_val_pred_xgb, y_val_pred_proba_xgb, y_test_pred_xgb, y_test_pred_proba_xgb),
    ('LightGBM (Baseline)', y_val_pred_lgb, y_val_pred_proba_lgb, y_test_pred_lgb, y_test_pred_proba_lgb),
    ('Simple Averaging', y_val_pred_avg, y_val_pred_proba_avg, y_test_pred_avg, y_test_pred_proba_avg),
    ('Weighted (Perf-based)', y_val_pred_weighted, y_val_pred_proba_weighted, y_test_pred_weighted, y_test_pred_proba_weighted),
    ('Optimized Weighted', y_val_pred_optimized, y_val_pred_proba_optimized, y_test_pred_optimized, y_test_pred_proba_optimized),
    ('Soft Voting', y_val_pred_soft, y_val_pred_proba_soft, y_test_pred_soft, y_test_pred_proba_soft),
    ('Hard Voting', y_val_pred_hard, y_val_pred_proba_hard, y_test_pred_hard, y_test_pred_proba_hard),
    ('Stacking', y_val_pred_stack, y_val_pred_proba_stack, y_test_pred_stack, y_test_pred_proba_stack),
]

# Metrics computed from hard labels vs. from probability scores.
label_scorers = (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
)
score_scorers = (
    ('ROC_AUC', roc_auc_score),
    ('PR_AUC', average_precision_score),
)

for name, val_pred, val_proba, test_pred, test_proba in models_info:
    row = {'Model': name}
    for split_prefix, y_true, y_label, y_score in (
        ('Val', y_val, val_pred, val_proba),
        ('Test', y_test, test_pred, test_proba),
    ):
        for suffix, scorer in label_scorers:
            row[f'{split_prefix}_{suffix}'] = scorer(y_true, y_label)
        for suffix, scorer in score_scorers:
            row[f'{split_prefix}_{suffix}'] = scorer(y_true, y_score)
    comparison_results.append(row)

comparison_df = pd.DataFrame(comparison_results)

rule = "=" * 80
print("\n" + rule)
print("MODEL COMPARISON TABLE")
print(rule)
display(comparison_df.round(4))

In [None]:
# Pick the best model by VALIDATION ROC-AUC and report its test metrics.
#
# The original cell picked the row with the highest *test* ROC-AUC. Choosing
# the final model on the test set makes the reported test score an optimistic
# estimate, because the test data has then taken part in model selection.
# Selecting on the validation split keeps the test split as a genuinely
# held-out estimate.
# NOTE(review): the weighted / optimised / stacking variants were themselves
# tuned or weighted using validation data, so their validation scores are
# somewhat favourable; the test columns remain the unbiased comparison.
best_idx = comparison_df['Val_ROC_AUC'].idxmax()
best_model_name = comparison_df.loc[best_idx, 'Model']
best_val_roc_auc = comparison_df.loc[best_idx, 'Val_ROC_AUC']
best_test_roc_auc = comparison_df.loc[best_idx, 'Test_ROC_AUC']
best_test_f1 = comparison_df.loc[best_idx, 'Test_F1']

print("\n" + "="*80)
print("BEST MODEL")
print("="*80)
print(f"\nModel: {best_model_name}")
print(f"Selected by Val ROC-AUC: {best_val_roc_auc:.4f}")
print(f"Test ROC-AUC: {best_test_roc_auc:.4f}")
print(f"Test F1 Score: {best_test_f1:.4f}")

# Relative change in test ROC-AUC versus the single tuned XGBoost model.
# This can be negative: the validation-selected model need not win on test.
baseline_roc_auc = comparison_df.loc[comparison_df['Model'] == 'XGBoost (Tuned)', 'Test_ROC_AUC'].values[0]
improvement = ((best_test_roc_auc - baseline_roc_auc) / baseline_roc_auc) * 100

print(f"\nImprovement over XGBoost (Tuned):")
print(f"  Baseline ROC-AUC: {baseline_roc_auc:.4f}")
print(f"  Best ROC-AUC:     {best_test_roc_auc:.4f}")
print(f"  Improvement:      {improvement:+.2f}%")

## 10. 성능 시각화

In [None]:
# Overlay the test-set ROC curve of every model in `models_info`.
fig, ax = plt.subplots(figsize=(12, 8))

# One colour per model; `colors` is reused by the precision-recall cell below.
colors = plt.cm.tab10(np.linspace(0, 1, len(models_info)))

# Only the name and the test-set scores of each tuple are needed here.
for i, (name, _, _, _, test_proba) in enumerate(models_info):
    fpr, tpr, _ = roc_curve(y_test, test_proba)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.4f})', linewidth=2, color=colors[i])

# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves Comparison - Test Set', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Overlay the test-set precision-recall curve of every model in `models_info`.
# Relies on `colors` defined in the ROC-curve cell above.
fig, ax = plt.subplots(figsize=(12, 8))

for i, (name, _, _, _, test_proba) in enumerate(models_info):
    precision, recall, _ = precision_recall_curve(y_test, test_proba)
    # The legend says "AUC", but the value shown is average precision.
    pr_auc = average_precision_score(y_test, test_proba)
    ax.plot(recall, precision, label=f'{name} (AUC = {pr_auc:.4f})', linewidth=2, color=colors[i])

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.set_title('Precision-Recall Curves Comparison - Test Set', fontsize=14, fontweight='bold')
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid of horizontal bar charts, one test-set metric per panel.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

metrics = ['Test_Precision', 'Test_Recall', 'Test_F1', 'Test_ROC_AUC']
metric_names = ['Precision', 'Recall', 'F1 Score', 'ROC-AUC']

# axes.flat walks the grid row by row, the same order as (i // 2, i % 2).
for ax, metric, metric_name in zip(axes.flat, metrics, metric_names):
    # Ascending sort puts the strongest model at the top of the panel.
    ranked = comparison_df.sort_values(metric, ascending=True)
    values = ranked[metric].values
    positions = range(len(ranked))

    bars = ax.barh(positions, values)

    # Paint the best-scoring model's bar red.
    top_label = ranked[metric].idxmax()
    top_position = list(ranked.index).index(top_label)
    bars[top_position].set_color('red')

    ax.set_yticks(positions)
    ax.set_yticklabels(ranked['Model'].values)
    ax.set_xlabel(metric_name, fontsize=11)
    ax.set_title(f'{metric_name} Comparison (Test Set)', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')

    # Print each value just to the right of its bar.
    for bar_position, value in enumerate(values):
        ax.text(value, bar_position, f' {value:.4f}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

## 11. 앙상블 분석

### 11.1 가중치 분석

In [None]:
# Compare the per-model weights used by the different ensembling methods.
# NOTE(review): the 'Stacking (coef)' entries are raw LogisticRegression
# coefficients taken from meta_model.coef_, not weights normalised to sum to
# 1 like the simple-average and performance-based rows, so the bars are not
# on a common scale.
weights_comparison = pd.DataFrame({
    'Method': ['Simple Avg', 'Perf-based', 'Optimized', 'Stacking (coef)'],
    'XGBoost_Weight': [
        0.5,
        weight_xgb,
        optimized_weights[0],
        meta_model.coef_[0][0]
    ],
    'LightGBM_Weight': [
        0.5,
        weight_lgb,
        optimized_weights[1],
        meta_model.coef_[0][1]
    ]
})

print("\nWeights Comparison:")
display(weights_comparison.round(4))

# Grouped bar chart: one pair of bars (XGBoost, LightGBM) per method.
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(weights_comparison))
width = 0.35

# Shift each series by half a bar width so the pair sits around the tick.
ax.bar(x - width/2, weights_comparison['XGBoost_Weight'], width, label='XGBoost')
ax.bar(x + width/2, weights_comparison['LightGBM_Weight'], width, label='LightGBM')

ax.set_ylabel('Weight', fontsize=12)
ax.set_title('Ensemble Weights Comparison', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(weights_comparison['Method'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

### 11.2 모델 간 예측 차이 분석

In [None]:
# Absolute gap between the XGBoost and LightGBM probabilities on the test set.
pred_diff = np.abs(y_test_pred_proba_xgb - y_test_pred_proba_lgb)

print(f"Prediction Difference Statistics (Test Set):")
print(f"  Mean:   {pred_diff.mean():.4f}")
print(f"  Median: {np.median(pred_diff):.4f}")
print(f"  Std:    {pred_diff.std():.4f}")
print(f"  Max:    {pred_diff.max():.4f}")

# High-disagreement cases: gap strictly above the 90th percentile.
high_diff_threshold = np.percentile(pred_diff, 90)
high_diff_indices = np.where(pred_diff > high_diff_threshold)[0]

print(f"\nHigh disagreement cases (>90th percentile, diff > {high_diff_threshold:.4f}):")
print(f"  Count: {len(high_diff_indices)}")
# The quantity below is the share of rows whose true label is positive among
# the high-disagreement rows. It was previously labelled "True positive
# rate", which is a different metric (recall).
if len(high_diff_indices) > 0:
    print(f"  Actual positive rate: {y_test.iloc[high_diff_indices].mean():.2%}")
else:
    # The strict '>' can select nothing (e.g. when all gaps are equal);
    # avoid printing 'nan%' from the mean of an empty selection.
    print("  Actual positive rate: n/a (no cases above threshold)")

In [None]:
# Visualise how much the two base models disagree on the test set.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: histogram of absolute probability gaps, with the mean and the
# 90th-percentile threshold marked.
axes[0].hist(pred_diff, bins=50, alpha=0.7, edgecolor='black')
axes[0].axvline(pred_diff.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {pred_diff.mean():.4f}')
axes[0].axvline(high_diff_threshold, color='orange', linestyle='--', linewidth=2, label=f'90th percentile: {high_diff_threshold:.4f}')
axes[0].set_xlabel('Absolute Prediction Difference', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Distribution of Prediction Differences (XGBoost vs LightGBM)', fontsize=12, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right panel: XGBoost vs LightGBM probability per sample, coloured by the
# true label; points on the dashed diagonal are in perfect agreement.
scatter = axes[1].scatter(y_test_pred_proba_xgb, y_test_pred_proba_lgb, 
                          c=y_test, alpha=0.5, cmap='RdYlGn_r', s=20)
axes[1].plot([0, 1], [0, 1], 'k--', linewidth=1, label='Perfect agreement')
axes[1].set_xlabel('XGBoost Predicted Probability', fontsize=12)
axes[1].set_ylabel('LightGBM Predicted Probability', fontsize=12)
axes[1].set_title('XGBoost vs LightGBM Predictions', fontsize=12, fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.colorbar(scatter, ax=axes[1], label='True Label')

plt.tight_layout()
plt.show()

### 11.3 Hard Cases 분석

In [None]:
# False negatives: actual positives that a model predicted as negative.
# "Hard" cases are the ones missed by XGBoost, LightGBM and the optimised
# ensemble alike.
is_positive = y_test == 1

false_negatives_xgb = is_positive & (y_test_pred_xgb == 0)
false_negatives_lgb = is_positive & (y_test_pred_lgb == 0)
false_negatives_ensemble = is_positive & (y_test_pred_optimized == 0)

# Missed by all three.
hard_fn = false_negatives_xgb & false_negatives_lgb & false_negatives_ensemble

n_positives = y_test.sum()

print("Hard Cases Analysis (Test Set):")
print("\nFalse Negatives:")
for model_label, missed in (
    ("  XGBoost:  ", false_negatives_xgb),
    ("  LightGBM: ", false_negatives_lgb),
    ("  Ensemble: ", false_negatives_ensemble),
):
    print(f"{model_label}{missed.sum()} ({missed.sum()/n_positives*100:.1f}% of positives)")
print(f"\nHard cases (all models missed): {hard_fn.sum()} ({hard_fn.sum()/n_positives*100:.1f}% of positives)")

In [None]:
# False positives: actual negatives that a model predicted as positive.
is_negative = y_test == 0

false_positives_xgb = is_negative & (y_test_pred_xgb == 1)
false_positives_lgb = is_negative & (y_test_pred_lgb == 1)
false_positives_ensemble = is_negative & (y_test_pred_optimized == 1)

# Wrongly flagged by all three.
hard_fp = false_positives_xgb & false_positives_lgb & false_positives_ensemble

n_negatives = is_negative.sum()

print("\nFalse Positives:")
for model_label, flagged in (
    ("  XGBoost:  ", false_positives_xgb),
    ("  LightGBM: ", false_positives_lgb),
    ("  Ensemble: ", false_positives_ensemble),
):
    print(f"{model_label}{flagged.sum()} ({flagged.sum()/n_negatives*100:.1f}% of negatives)")
print(f"\nHard cases (all models wrong): {hard_fp.sum()} ({hard_fp.sum()/n_negatives*100:.1f}% of negatives)")

In [None]:
# Predicted-probability distributions for the hard cases.
#
# The false-positive panel used to be nested under the false-negative check,
# so hard false positives were never plotted when there happened to be no
# hard false negatives. The two panels are now handled independently.
def _plot_hard_case_probabilities(ax, mask, title):
    """Draw overlaid probability histograms of the three models on ``ax``.

    ``mask`` is a boolean selector over the test rows; only the selected
    rows' predicted probabilities are plotted. A dashed line marks the 0.5
    decision threshold.
    """
    # Plain boolean array so positional masking never depends on the
    # pandas index carried by the mask.
    selector = np.asarray(mask, dtype=bool)
    for model_label, probabilities in (
        ('XGBoost', y_test_pred_proba_xgb),
        ('LightGBM', y_test_pred_proba_lgb),
        ('Ensemble', y_test_pred_proba_optimized),
    ):
        ax.hist(np.asarray(probabilities)[selector], bins=20, alpha=0.5,
                label=model_label, edgecolor='black')
    ax.axvline(0.5, color='red', linestyle='--', linewidth=2, label='Threshold')
    ax.set_xlabel('Predicted Probability', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)


n_hard_fn = int(hard_fn.sum())
n_hard_fp = int(hard_fp.sum())

if n_hard_fn == 0:
    print("No hard false negative cases found.")
if n_hard_fp == 0:
    print("No hard false positive cases found.")

if n_hard_fn > 0 or n_hard_fp > 0:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    for ax, mask, count, case_kind in (
        (axes[0], hard_fn, n_hard_fn, 'Hard False Negatives'),
        (axes[1], hard_fp, n_hard_fp, 'Hard False Positives'),
    ):
        if count > 0:
            _plot_hard_case_probabilities(
                ax, mask, f'Predicted Probabilities for {case_kind} (n={count})'
            )
        else:
            # Nothing to show for this side; hide the empty axes.
            ax.set_axis_off()

    plt.tight_layout()
    plt.show()

## 12. 최종 모델 저장

In [None]:
# Persist the ensemble artefacts next to the base models.
def _save_pickle(obj, filename):
    """Pickle ``obj`` to ``model_dir / filename`` and report the path."""
    target = model_dir / filename
    with open(target, 'wb') as fh:
        pickle.dump(obj, fh)
    print(f"Saved: {target}")


print("Saving ensemble models...\n")

# 1. Optimised weighted ensemble.
# NOTE(review): this file is named 'ensemble_best' regardless of which model
# `best_model_name` actually identifies — confirm that is intended.
_save_pickle(ensemble_model, 'ensemble_best.pkl')

# 2. Equal-weight ensemble, kept as a backup.
simple_ensemble = EnsembleModel(models=[xgb_model, lgb_model], weights=[0.5, 0.5])
_save_pickle(simple_ensemble, 'ensemble_simple_avg.pkl')

# 3. Stacking: base models plus the fitted meta model, stored as a dict.
stacking_model = {
    'base_models': [xgb_model, lgb_model],
    'meta_model': meta_model
}
_save_pickle(stacking_model, 'ensemble_stacking.pkl')

In [None]:
# Write a JSON summary of the selected model and the ensemble parameters.
best_row = comparison_df.loc[best_idx]

# Values are cast to built-in float so json can serialise them.
test_metrics = {
    'roc_auc': float(best_test_roc_auc),
    'f1_score': float(best_test_f1),
}
for json_key, column in (
    ('precision', 'Test_Precision'),
    ('recall', 'Test_Recall'),
    ('pr_auc', 'Test_PR_AUC'),
):
    test_metrics[json_key] = float(best_row[column])

stacking_coefs = meta_model.coef_[0]

ensemble_results = {
    'best_model': best_model_name,
    'test_metrics': test_metrics,
    'optimized_weights': {
        'xgboost': float(optimized_weights[0]),
        'lightgbm': float(optimized_weights[1])
    },
    'stacking_coefficients': {
        'xgboost': float(stacking_coefs[0]),
        'lightgbm': float(stacking_coefs[1]),
        'intercept': float(meta_model.intercept_[0])
    },
    'improvement_over_baseline': {
        'baseline_roc_auc': float(baseline_roc_auc),
        'improvement_percent': float(improvement)
    }
}

results_path = model_dir / 'ensemble_results.json'
with open(results_path, 'w') as fh:
    json.dump(ensemble_results, fh, indent=2)

print(f"\nSaved: {results_path}")

In [None]:
# Export the model comparison table as CSV (row index omitted).
comparison_csv_path = model_dir / 'ensemble_comparison.csv'
comparison_df.to_csv(comparison_csv_path, index=False)
print(f"Saved: {comparison_csv_path}")

## 13. 결론

### 완료된 작업
1. ✅ XGBoost와 LightGBM 기존 모델 로드
2. ✅ Simple Averaging Ensemble 구현
3. ✅ Weighted Ensemble (성능 기반) 구현
4. ✅ Optimized Weighted Ensemble 구현
5. ✅ Voting Ensemble (Soft/Hard) 구현
6. ✅ Stacking Ensemble 구현
7. ✅ 모델 성능 비교 및 시각화
8. ✅ 앙상블 분석 (가중치, 예측 차이, Hard cases)
9. ✅ 최종 모델 저장

### 주요 결과
- 총 8개 모델 비교 (단일 모델 2개 + 앙상블 6개)
- 앙상블 모델이 단일 모델보다 일반적으로 더 나은 성능
- Optimized Weighted Ensemble이 자동으로 최적 가중치 탐색
- Stacking은 메타 모델(Logistic Regression)이 두 base model 예측 확률의 결합 계수를 학습

### 다음 단계
1. **임계값 최적화**: Business metric 기반 threshold 조정
2. **모델 해석**: SHAP을 앙상블 모델에 적용
3. **프로덕션 배포**: 모델 서빙 파이프라인 구축
4. **모니터링**: 성능 모니터링 및 재학습 전략

In [None]:
# Closing summary of the notebook run.
rule = "=" * 80
summary_lines = (
    f"\nBest Model: {best_model_name}",
    f"Test ROC-AUC: {best_test_roc_auc:.4f}",
    f"Test F1 Score: {best_test_f1:.4f}",
    f"Improvement over baseline: {improvement:+.2f}%",
    f"\nModels saved to: {model_dir}",
)

print(f"\n{rule}")
print("ENSEMBLE MODEL TRAINING COMPLETED")
print(rule)
for summary_line in summary_lines:
    print(summary_line)