## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

Import

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

### Data Load

In [2]:
# 데이터 로드
IVF_train = pd.read_csv('../data/IVF_train_dataset_22.csv')
IVF_test = pd.read_csv('../data/IVF_test_dataset_22.csv')

DI_train = pd.read_csv('../data/DI_train_dataset_22.csv')
DI_test = pd.read_csv('../data/DI_test_dataset_22.csv')

In [3]:
# ID 열을 제외한 특성과 타겟 변수 분리
IVF_X = IVF_train.drop(['임신_성공_여부', 'ID'], axis=1)
IVF_y = IVF_train['임신_성공_여부']

DI_X = DI_train.drop(['임신_성공_여부', 'ID'], axis=1)
DI_y = DI_train['임신_성공_여부']

### 인코딩 

In [4]:
IVF_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "임신_시도_또는_마지막_임신_경과_연수",
    "배란_유도_유형",
    "배아_생성_주요_이유",
    "난자_출처",
    "정자_출처",
    "난자_기증자_나이",
    "정자_기증자_나이",
    "변환된_특정_시술_유형",
    "채취_해동_차이",
    "해동_혼합_차이",
    "혼합_이식_차이",
    "이식_해동_차이"
]

In [5]:
DI_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "임신_시도_또는_마지막_임신_경과_연수",
    "정자_기증자_나이",
    "변환된_특정_시술_유형"
]

In [6]:
# 모든 범주형 변수를 문자열로 변환
IVF_X[IVF_categorical_columns] = IVF_X[IVF_categorical_columns].astype(str)
DI_X[DI_categorical_columns] = DI_X[DI_categorical_columns].astype(str)
IVF_test[IVF_categorical_columns] = IVF_test[IVF_categorical_columns].astype(str)
DI_test[DI_categorical_columns] = DI_test[DI_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
IVF_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
DI_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

IVF_X[IVF_categorical_columns] = IVF_encoder.fit_transform(IVF_X[IVF_categorical_columns])
DI_X[DI_categorical_columns] = DI_encoder.fit_transform(DI_X[DI_categorical_columns])
IVF_test[IVF_categorical_columns] = IVF_encoder.transform(IVF_test[IVF_categorical_columns])
DI_test[DI_categorical_columns] = DI_encoder.transform(DI_test[DI_categorical_columns])

## Modeling

In [7]:
# 데이터 분할
IVF_X_train, IVF_X_test, IVF_y_train, IVF_y_test = train_test_split(IVF_X, IVF_y, test_size=0.2, random_state=42)
DI_X_train, DI_X_test, DI_y_train, DI_y_test = train_test_split(DI_X, DI_y, test_size=0.2, random_state=42)

### DI 데이터

In [8]:
from imblearn.over_sampling import BorderlineSMOTE

# Borderline-SMOTE를 사용하여 소수 클래스 오버샘플링
borderline_smote = BorderlineSMOTE(random_state=42)
IVF_X_train_resampled, IVF_y_train_resampled = borderline_smote.fit_resample(IVF_X_train, IVF_y_train)
DI_X_train_resampled, DI_y_train_resampled = borderline_smote.fit_resample(DI_X_train, DI_y_train)

In [9]:
import optuna
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 목적 함수 정의
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'max_depth': trial.suggest_int('max_depth', -1, 512),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 1.0, log=True),
        'random_state': 42,
        'boosting_type': 'gbdt',
        'verbose': -1
    }

    model = lgb.LGBMClassifier(**param)
    model.fit(DI_X_train, DI_y_train)
    
    y_pred = model.predict(DI_X_test)
    y_pred_proba = model.predict_proba(DI_X_test)[:, 1]
    
    auc = roc_auc_score(DI_y_test, y_pred_proba)
    return auc

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1500)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-02-07 02:03:23,977] A new study created in memory with name: no-name-a97defcc-b869-43c8-83ff-60a801ab25c7
[I 2025-02-07 02:03:25,925] Trial 0 finished with value: 0.6887068588903451 and parameters: {'n_estimators': 1833, 'num_leaves': 379, 'max_depth': 246, 'learning_rate': 0.0009122166420055654, 'min_child_samples': 81, 'subsample': 0.24000490585093345, 'colsample_bytree': 0.8913957898498783, 'reg_alpha': 0.002872468828326346, 'reg_lambda': 0.00022224268722833272, 'min_split_gain': 6.5594569028917895e-06}. Best is trial 0 with value: 0.6887068588903451.
[I 2025-02-07 02:03:28,962] Trial 1 finished with value: 0.6427260812581913 and parameters: {'n_estimators': 2879, 'num_leaves': 405, 'max_depth': 427, 'learning_rate': 0.029053949111820736, 'min_child_samples': 57, 'subsample': 0.7485419524815546, 'colsample_bytree': 0.4766694037506676, 'reg_alpha': 0.08973357834091314, 'reg_lambda': 9.707423164393733e-08, 'min_split_gain': 3.3700159172190753e-06}. Best is trial 0 with value: 

Best trial:
  Value: 0.7150311271297509
  Params: 
    n_estimators: 3398
    num_leaves: 2
    max_depth: 314
    learning_rate: 0.03099949800355808
    min_child_samples: 86
    subsample: 0.5080031586338798
    colsample_bytree: 0.20066288141726088
    reg_alpha: 0.002546182516957053
    reg_lambda: 1.241057302157224e-06
    min_split_gain: 0.00010748443078497889


.