## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

Import

In [1]:
import pandas as pd
import optuna
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import lightgbm as lgb

### Data Load

In [2]:
# 데이터 로드
Total_train = pd.read_csv('../data/Total_train_dataset_43.csv')
Total_test = pd.read_csv('../data/Total_test_dataset_43.csv')

In [3]:
# ID 열을 제외한 특성과 타겟 변수 분리
Total_X = Total_train.drop(['임신_성공_여부', 'ID'], axis=1)
Total_y = Total_train['임신_성공_여부']

### 인코딩 

In [4]:
Total_categorical_columns = [
    "시술_당시_나이",
    "난자_기증자_나이",
    "정자_기증자_나이"
]

In [5]:
# 모든 범주형 변수를 문자열로 변환
Total_X[Total_categorical_columns] = Total_X[Total_categorical_columns].astype(str)
Total_test[Total_categorical_columns] = Total_test[Total_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
Total_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

Total_X[Total_categorical_columns] = Total_encoder.fit_transform(Total_X[Total_categorical_columns])
Total_test[Total_categorical_columns] = Total_encoder.transform(Total_test[Total_categorical_columns])

## Modeling

In [6]:
# 데이터 분할
Total_X_train, Total_X_test, Total_y_train, Total_y_test = train_test_split(Total_X, 
                                                                            Total_y, 
                                                                            test_size=0.2, 
                                                                            random_state=42,
                                                                            stratify=Total_y)

### Total 데이터

optuna

In [7]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 목적 함수 정의
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 8000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0005, 0.5),
        'max_depth': trial.suggest_int('max_depth', 5, 500),

        'alpha': trial.suggest_float('alpha', 0.01, 100, log=True),
        'gamma': trial.suggest_float('gamma', 0.01, 100, log=True),

        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 100),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 100),

        'subsample': trial.suggest_float('subsample', 0.1, 1),

        'objective': 'binary:logistic',   # 이진 분류
        'tree_method': 'hist',            # 트리 메소드
        'random_state': 42,
        'eval_metric': 'auc',             # 평가 지표
        'n_jobs': -1,
    }

    model = XGBClassifier(**param)
    model.fit(Total_X_train, Total_y_train)

    y_pred_proba = model.predict_proba(Total_X_test)[:, 1]

    auc = roc_auc_score(Total_y_test, y_pred_proba)
    return auc

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=800)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


[I 2025-02-19 02:21:54,220] A new study created in memory with name: no-name-55229aa4-8ba5-4dc6-a0b6-7f426c1319d9
[I 2025-02-19 02:22:55,826] Trial 0 finished with value: 0.7355554750066655 and parameters: {'n_estimators': 3707, 'learning_rate': 0.1542625717641242, 'max_depth': 111, 'alpha': 0.015060222578592812, 'gamma': 0.18911226801222358, 'reg_alpha': 78.53091570222456, 'reg_lambda': 28.611547404842813, 'subsample': 0.7129540836932237}. Best is trial 0 with value: 0.7355554750066655.
[I 2025-02-19 02:25:08,664] Trial 1 finished with value: 0.7337150745109663 and parameters: {'n_estimators': 7798, 'learning_rate': 0.09633352672766317, 'max_depth': 211, 'alpha': 0.33580244303400064, 'gamma': 0.13504217563815496, 'reg_alpha': 64.00027683942399, 'reg_lambda': 60.86368769061172, 'subsample': 0.7903815749953501}. Best is trial 0 with value: 0.7355554750066655.
[I 2025-02-19 02:25:49,170] Trial 2 finished with value: 0.703199151722895 and parameters: {'n_estimators': 1543, 'learning_rate'

KeyboardInterrupt: 

.