In [None]:
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def objective(trial, X, y, n_splits=5):
    """Optuna objective: mean cross-validated ROC AUC of an XGBoost model.

    Hyperparameters are sampled from ``trial``, a booster is trained on each
    stratified fold with early stopping, and the validation AUC of the best
    iteration is averaged over the folds.

    Args:
        trial: Optuna trial used to sample hyperparameters, report running
            scores and store the selected number of boosting rounds.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Binary target aligned with ``X`` (its sum is used as the positive
            count); rows are selected positionally with ``.iloc``.
        n_splits: Number of stratified cross-validation folds.

    Returns:
        Mean ROC AUC over the validation folds.

    Raises:
        ValueError: If ``y`` does not contain both classes.
        optuna.TrialPruned: If the study's pruner stops the trial early.
    """
    # Class counts are computed once, vectorised, instead of calling the
    # built-in sum() twice over the whole target on every trial.
    labels = np.asarray(y)
    n_pos = float(labels.sum())
    n_neg = float(labels.size) - n_pos
    if n_pos <= 0 or n_neg <= 0:
        raise ValueError("y must contain both positive and negative samples")
    imbalance = n_neg / n_pos

    # Fixed parameters.
    params = {
        "random_state": 42,
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        # XGBoost >= 2.0 selects the GPU through `device`; the old
        # `gpu_hist` tree method is deprecated there, so plain `hist` is used.
        "tree_method": "hist",
        "device": "cuda:0",
    }

    # Hyperparameters to optimise.
    params.update({
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        # The range always spans 1 and the neg/pos ratio in the right order,
        # so it stays valid when positives are the majority (ratio < 1).
        'scale_pos_weight': trial.suggest_float(
            'scale_pos_weight', min(1.0, imbalance), max(1.0, imbalance)
        ),
    })

    # Cross-validation setup.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []
    best_rounds = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        y_val = y.iloc[val_idx]
        dtrain = xgb.DMatrix(X.iloc[train_idx], label=y.iloc[train_idx])
        dval = xgb.DMatrix(X.iloc[val_idx], label=y_val)

        # NOTE(review): the same fold drives early stopping and scoring, so
        # the reported AUC is slightly optimistic.
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=10000,
            evals=[(dval, 'validation')],
            early_stopping_rounds=50,
            verbose_eval=False,
        )

        # Score the best iteration explicitly rather than relying on the
        # default, which may include the rounds trained after the optimum.
        n_rounds = model.best_iteration + 1
        y_pred = model.predict(dval, iteration_range=(0, n_rounds))
        fold_scores.append(roc_auc_score(y_val, y_pred))
        best_rounds.append(n_rounds)

        # Expose the running mean so a pruner configured on the study can
        # stop unpromising trials before all folds are trained.
        trial.report(float(np.mean(fold_scores)), step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # The number of boosting rounds is not a sampled parameter; keep it on
    # the trial so the final model can be refit with a matching size.
    trial.set_user_attr("best_num_boost_round", int(np.mean(best_rounds)))

    return float(np.mean(fold_scores))

def tune_xgboost(X, y, n_trials=100, n_splits=5, seed=42):
    """Run an Optuna study that maximises the cross-validated AUC.

    Args:
        X: Feature table forwarded to ``objective``.
        y: Binary target forwarded to ``objective``.
        n_trials: Number of Optuna trials to run.
        n_splits: Number of cross-validation folds used by ``objective``.
        seed: Seed for the hyperparameter sampler, so repeated runs propose
            the same sequence of candidates.

    Returns:
        Tuple ``(best_params, best_value)`` of the best trial.
    """
    study = optuna.create_study(
        direction="maximize",
        study_name="xgboost_optimization",
        # TPE is Optuna's default sampler; it is created explicitly only to
        # seed it, matching the fixed seeds used for the model and CV split.
        sampler=optuna.samplers.TPESampler(seed=seed),
        # NOTE(review): MedianPruner only acts on intermediate values that
        # the objective reports via trial.report(), and never on steps below
        # n_warmup_steps - confirm these settings match the reported steps.
        pruner=optuna.pruners.MedianPruner(
            n_startup_trials=5,
            n_warmup_steps=5,
            interval_steps=3,
        ),
    )

    # Run the optimisation.
    study.optimize(
        lambda trial: objective(trial, X, y, n_splits),
        n_trials=n_trials,
        timeout=None,
        show_progress_bar=True,
    )

    # Print the best hyperparameters.
    print("\n=== Best Trial ===")
    print(f"Value (AUC): {study.best_value:.4f}")
    print("Best parameters:")
    for key, value in study.best_params.items():
        print(f"    {key}: {value}")

    return study.best_params, study.best_value

if __name__ == "__main__":
    import json

    # `train_processed` is not defined in this section; it is expected to
    # already exist in the session. The named column is split off as the
    # target and the remaining columns are used as features.
    X = train_processed.drop(columns=["임신 성공 여부"])
    y = train_processed["임신 성공 여부"]

    # Memory optimisation (optional, for large datasets).
    def optimize_dtypes(df):
        """Downcast 64-bit numeric columns of ``df`` in place to save memory.

        ``float64`` columns become ``float32``. ``int64`` columns become
        ``int32`` only when every value fits, because an unchecked cast
        would silently wrap out-of-range values.

        Args:
            df: DataFrame to modify in place.

        Returns:
            The same DataFrame object, for convenience.
        """
        int32_range = np.iinfo(np.int32)
        for col in df.columns:
            if df[col].dtype == 'float64':
                df[col] = df[col].astype('float32')
            elif df[col].dtype == 'int64':
                lowest, highest = df[col].min(), df[col].max()
                # Comparisons are False for an empty column (NaN min/max),
                # which simply leaves that column untouched.
                if int32_range.min <= lowest and highest <= int32_range.max:
                    df[col] = df[col].astype('int32')
        return df

    X = optimize_dtypes(X)

    # Report the dataset size.
    print(f"데이터 크기: {X.shape[0]} 행, {X.shape[1]} 열")

    # Run the hyperparameter search.
    best_params, best_score = tune_xgboost(X, y, n_trials=100)

    # Persist the best parameters to a JSON file.
    with open('best_xgboost_params.json', 'w', encoding='utf-8') as f:
        json.dump(best_params, f, indent=4)

    print("최적 파라미터가 'best_xgboost_params.json'에 저장되었습니다.")