## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

Import

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

### Data Load

In [2]:
# 데이터 로드
IVF_train = pd.read_csv('../data/IVF_train_dataset_28.csv')
IVF_test = pd.read_csv('../data/IVF_test_dataset_28.csv')

DI_train = pd.read_csv('../data/DI_train_dataset_28.csv')
DI_test = pd.read_csv('../data/DI_test_dataset_28.csv')

In [3]:
# ID 열을 제외한 특성과 타겟 변수 분리
IVF_X = IVF_train.drop(['임신_성공_여부', 'ID'], axis=1)
IVF_y = IVF_train['임신_성공_여부']

DI_X = DI_train.drop(['임신_성공_여부', 'ID'], axis=1)
DI_y = DI_train['임신_성공_여부']

In [4]:
print(f"IVF_X shape: {IVF_X.shape}")
print(f"IVF_test shape: {IVF_test.drop('ID', axis=1).shape}")
print(f"DI_X shape: {DI_X.shape}")
print(f"DI_test shape: {DI_test.drop('ID', axis=1).shape}")

IVF_X shape: (250052, 60)
IVF_test shape: (87891, 60)
DI_X shape: (6289, 31)
DI_test shape: (2176, 31)


### 인코딩 

In [5]:
IVF_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "특정_시술_유형",
    "배란_유도_유형",
    "난자_출처",
    "정자_출처",
    "난자_기증자_나이",
    "정자_기증자_나이"
]

In [6]:
DI_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "특정_시술_유형",
    "정자_기증자_나이"
]

In [7]:
# 모든 범주형 변수를 문자열로 변환
IVF_X[IVF_categorical_columns] = IVF_X[IVF_categorical_columns].astype(str)
DI_X[DI_categorical_columns] = DI_X[DI_categorical_columns].astype(str)
IVF_test[IVF_categorical_columns] = IVF_test[IVF_categorical_columns].astype(str)
DI_test[DI_categorical_columns] = DI_test[DI_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
IVF_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
DI_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

IVF_X[IVF_categorical_columns] = IVF_encoder.fit_transform(IVF_X[IVF_categorical_columns])
DI_X[DI_categorical_columns] = DI_encoder.fit_transform(DI_X[DI_categorical_columns])
IVF_test[IVF_categorical_columns] = IVF_encoder.transform(IVF_test[IVF_categorical_columns])
DI_test[DI_categorical_columns] = DI_encoder.transform(DI_test[DI_categorical_columns])

## Modeling

In [8]:
# 데이터 분할
IVF_X_train, IVF_X_test, IVF_y_train, IVF_y_test = train_test_split(IVF_X, IVF_y, test_size=0.2, random_state=42)
DI_X_train, DI_X_test, DI_y_train, DI_y_test = train_test_split(DI_X, DI_y, test_size=0.2, random_state=42)

### IVF 데이터

In [9]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# 목적 함수 정의
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000),
        'max_depth': trial.suggest_int('max_depth', 3, 300),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 2048),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'random_state': 42,
        'objective': 'binary',
        'metric': 'auc',
        'verbose': -1   
    }

    model = lgb.LGBMClassifier(**param)
    model.fit(IVF_X_train, IVF_y_train)
    
    y_pred_proba = model.predict_proba(IVF_X_test)[:, 1]
    
    auc = roc_auc_score(IVF_y_test, y_pred_proba)
    return auc

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-02-13 14:13:59,260] A new study created in memory with name: no-name-ef7ea65a-b4ba-4560-b13e-95e0f45fc446
[I 2025-02-13 14:16:31,551] Trial 0 finished with value: 0.7199471634341209 and parameters: {'n_estimators': 4426, 'max_depth': 118, 'learning_rate': 0.004227099568169079, 'num_leaves': 799, 'min_child_samples': 73, 'subsample': 0.8689426151433917, 'reg_alpha': 1.0331244795727095, 'reg_lambda': 6.97792563936559}. Best is trial 0 with value: 0.7199471634341209.
[I 2025-02-13 14:17:38,804] Trial 1 finished with value: 0.7355066974881046 and parameters: {'n_estimators': 3645, 'max_depth': 124, 'learning_rate': 0.00145293663996384, 'num_leaves': 364, 'min_child_samples': 48, 'subsample': 0.778242285655243, 'reg_alpha': 0.26763406691926805, 'reg_lambda': 8.370253959803302}. Best is trial 1 with value: 0.7355066974881046.
[I 2025-02-13 14:19:44,788] Trial 2 finished with value: 0.7059568274335698 and parameters: {'n_estimators': 3970, 'max_depth': 17, 'learning_rate': 0.010628852

Best trial:
  Value: 0.7398223789384005
  Params: 
    n_estimators: 2814
    max_depth: 244
    learning_rate: 0.006850295058863781
    num_leaves: 24
    min_child_samples: 2
    subsample: 0.9851938647381172
    reg_alpha: 0.0012856900858348812
    reg_lambda: 0.003702943734266382


In [10]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# 목적 함수 정의
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 2000, 8000),
        'max_depth': trial.suggest_int('max_depth', 3, 300),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 50),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 5.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 5.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
        'random_state': 42,
        'objective': 'binary',
        'metric': 'auc',
        'verbose': -1,
        'n_jobs': -1  
    }

    model = lgb.LGBMClassifier(**param)
    model.fit(IVF_X_train, IVF_y_train)
    
    y_pred_proba = model.predict_proba(IVF_X_test)[:, 1]
    
    auc = roc_auc_score(IVF_y_test, y_pred_proba)
    return auc

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=800)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-02-13 16:12:29,710] A new study created in memory with name: no-name-298339ae-8423-484e-b4c4-82a3b9c26dcc
[I 2025-02-13 16:13:47,286] Trial 0 finished with value: 0.7353407103021721 and parameters: {'n_estimators': 4259, 'max_depth': 186, 'learning_rate': 0.0001738264941632431, 'num_leaves': 295, 'min_child_samples': 50, 'subsample': 0.9662882120798342, 'reg_alpha': 0.010712570525885774, 'reg_lambda': 0.13521441739866488, 'scale_pos_weight': 3.277021261611888}. Best is trial 0 with value: 0.7353407103021721.
[I 2025-02-13 16:18:39,170] Trial 1 finished with value: 0.732802388707101 and parameters: {'n_estimators': 6885, 'max_depth': 33, 'learning_rate': 0.0003260476588540067, 'num_leaves': 879, 'min_child_samples': 48, 'subsample': 0.8749991426716331, 'reg_alpha': 0.006644447165563893, 'reg_lambda': 0.013551902810444891, 'scale_pos_weight': 9.404820743734728}. Best is trial 0 with value: 0.7353407103021721.
[I 2025-02-13 16:19:04,025] Trial 2 finished with value: 0.731280158290

Best trial:
  Value: 0.7397174743186166
  Params: 
    n_estimators: 2656
    max_depth: 106
    learning_rate: 0.006251889835434849
    num_leaves: 28
    min_child_samples: 3
    subsample: 0.9762937945738778
    reg_alpha: 0.01264405404255148
    reg_lambda: 5.722792999151612e-05
    scale_pos_weight: 1.1413422550626438


.