## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

Import

In [9]:
import pandas as pd
import optuna
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import lightgbm as lgb

### Data Load

In [10]:
# 데이터 로드
Total_train = pd.read_csv('../data/Total_train_dataset_42.csv')
Total_test = pd.read_csv('../data/Total_test_dataset_42.csv')

In [11]:
# ID 열을 제외한 특성과 타겟 변수 분리
Total_X = Total_train.drop(['임신_성공_여부', 'ID'], axis=1)
Total_y = Total_train['임신_성공_여부']

### 인코딩 

In [12]:
Total_categorical_columns = [
    "시술_당시_나이",
    "난자_출처",
    "정자_출처",
    "난자_기증자_나이",
    "정자_기증자_나이"
]

In [13]:
# 모든 범주형 변수를 문자열로 변환
Total_X[Total_categorical_columns] = Total_X[Total_categorical_columns].astype(str)
Total_test[Total_categorical_columns] = Total_test[Total_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
Total_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

Total_X[Total_categorical_columns] = Total_encoder.fit_transform(Total_X[Total_categorical_columns])
Total_test[Total_categorical_columns] = Total_encoder.transform(Total_test[Total_categorical_columns])

## Modeling

In [14]:
# from sklearn.utils import resample
# from sklearn.model_selection import train_test_split
# import numpy as np

# # 다운샘플링을 위한 데이터 준비
# # 예시로 클래스 0과 클래스 1이 있다고 가정
# class_0 = Total_X[Total_y == 0]
# class_1 = Total_X[Total_y == 1]

# # 클래스 0을 클래스 1의 수에 맞춰 다운샘플링
# class_0_downsampled = resample(class_0, 
#                                replace=False,    # 복원 추출하지 않음
#                                n_samples=len(class_1),  # 클래스 1의 수에 맞춤
#                                random_state=42)  # 재현성을 위한 랜덤 시드

# # 다운샘플링된 데이터와 클래스 1 데이터를 합침
# X_downsampled = np.vstack((class_0_downsampled, class_1))
# y_downsampled = np.hstack((np.zeros(len(class_0_downsampled)), np.ones(len(class_1))))

In [15]:
# 데이터 분할
Total_X_train, Total_X_test, Total_y_train, Total_y_test = train_test_split(Total_X, 
                                                                            Total_y, 
                                                                            test_size=0.2, 
                                                                            random_state=42,
                                                                            stratify=Total_y)

### Total 데이터

optuna

In [16]:
# 목적 함수 정의
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 3500),
        'num_leaves': trial.suggest_int('num_leaves', 500, 3500),
        'max_depth': trial.suggest_int('max_depth', 5, 350),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
        'min_child_samples': trial.suggest_int('min_child_samples', 3, 350),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),

        'random_state': 42,
        'n_jobs': -1,
        'metric': 'auc',
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': -1
    }

    model = lgb.LGBMClassifier(**param)
    model.fit(Total_X_train, Total_y_train)
    
    y_pred_proba = model.predict_proba(Total_X_test)[:, 1]
    
    auc = roc_auc_score(Total_y_test, y_pred_proba)
    return auc

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-02-19 00:14:18,718] A new study created in memory with name: no-name-22d322e5-48eb-4824-94fb-22b056ed75c9
[I 2025-02-19 00:14:35,234] Trial 0 finished with value: 0.7155434077636846 and parameters: {'n_estimators': 2166, 'num_leaves': 919, 'max_depth': 276, 'learning_rate': 0.12298338520138044, 'min_child_samples': 154, 'subsample': 0.8908234068708585, 'colsample_bytree': 0.7846042603074108, 'reg_alpha': 5.644640257732113, 'reg_lambda': 9.536086549031863}. Best is trial 0 with value: 0.7155434077636846.
[I 2025-02-19 00:14:52,489] Trial 1 finished with value: 0.7129168236277944 and parameters: {'n_estimators': 2960, 'num_leaves': 1728, 'max_depth': 145, 'learning_rate': 0.13810937855612546, 'min_child_samples': 114, 'subsample': 0.6297244552990051, 'colsample_bytree': 0.9893658713112174, 'reg_alpha': 5.58664065046797, 'reg_lambda': 6.931972234523675}. Best is trial 0 with value: 0.7155434077636846.
[I 2025-02-19 00:15:06,961] Trial 2 finished with value: 0.7272115455562151 and 

KeyboardInterrupt: 

[I 2025-02-16 14:01:21,962] Trial 25 finished with value: 0.7450160562701532 and parameters: {'n_estimators': 2379, 'num_leaves': 1214, 'max_depth': 320, 'learning_rate': 0.0016910462182733038, 'min_child_samples': 321, 'subsample': 0.9245577554285245, 'colsample_bytree': 0.8027573271184746, 'reg_alpha': 8.590190533433615, 'reg_lambda': 6.615930936160601}. Best is trial 25 with value: 0.7450160562701532

.