## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

### Import

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

### Data Load

In [2]:
# Load the IVF training and test datasets from their CSV files.
train_csv_path = '../data/IVF_train_dataset_53.csv'
test_csv_path = '../data/IVF_test_dataset_53.csv'
IVF_train = pd.read_csv(train_csv_path)
IVF_test = pd.read_csv(test_csv_path)

In [3]:
# Build the feature frame by dropping the target and ID columns, and keep
# the target column separately as the labels.
target_column = '임신_성공_여부'
IVF_X = IVF_train.drop(columns=[target_column, 'ID'])
IVF_y = IVF_train[target_column]

In [4]:
# Report the feature-matrix shapes; the test frame still carries its ID
# column, so it is dropped before measuring.
print(f"IVF_X shape: {IVF_X.shape}")
test_feature_shape = IVF_test.drop(columns='ID').shape
print(f"IVF_test shape: {test_feature_shape}")

IVF_X shape: (250052, 94)
IVF_test shape: (87891, 94)


### 인코딩 

In [5]:
# Names of the columns to treat as categorical; currently none.
IVF_categorical_columns = []

In [6]:
# Cast every listed categorical column to string in both frames before
# encoding.
for frame in (IVF_X, IVF_test):
    frame[IVF_categorical_columns] = frame[IVF_categorical_columns].astype(str)

# Ordinal-encode the categorical columns. The encoder is fitted on the
# training features only; categories it did not see during fitting are
# encoded as -1.
IVF_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

train_encoded = IVF_encoder.fit_transform(IVF_X[IVF_categorical_columns])
IVF_X[IVF_categorical_columns] = train_encoded
test_encoded = IVF_encoder.transform(IVF_test[IVF_categorical_columns])
IVF_test[IVF_categorical_columns] = test_encoded

## Modeling

In [7]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is the same on every run.
split_parts = train_test_split(IVF_X, IVF_y, test_size=0.2, random_state=42)
IVF_X_train, IVF_X_test, IVF_y_train, IVF_y_test = split_parts

### IVF 데이터

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Objective function for the Optuna search.
def objective(trial):
    """Score one Optuna trial of a random forest by hold-out ROC AUC.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC computed from the second-column class probabilities
        predicted for ``IVF_X_test``, measured against ``IVF_y_test``.
    """
    # Sample the search space (same names and bounds for every trial).
    n_estimators = trial.suggest_int('n_estimators', 100, 5000)
    max_depth = trial.suggest_int('max_depth', 1, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    # n_jobs is not passed, so the estimator's default applies
    # (a `'n_jobs': -1` entry was left disabled here).
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
    )
    forest.fit(IVF_X_train, IVF_y_train)

    # Column 1 holds the probability of the second class label
    # (presumably the positive "success" class — confirm the label order).
    # NOTE(review): the score comes from the same hold-out split on every
    # trial, so the best value is selected on that split.
    second_class_proba = forest.predict_proba(IVF_X_test)[:, 1]
    return roc_auc_score(IVF_y_test, second_class_proba)

# Create an Optuna study that maximizes the objective, then run the search.
n_search_trials = 800
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_search_trials)

# Print the best trial's score followed by its hyperparameters.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
best_params = trial.params
for key, value in best_params.items():
    print(f"    {key}: {value}")

[I 2025-02-25 01:18:41,938] A new study created in memory with name: no-name-710ce70c-84d8-4078-b78c-bff31404d9dc
[I 2025-02-25 01:26:08,913] Trial 0 finished with value: 0.7288625318704042 and parameters: {'n_estimators': 3350, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 16}. Best is trial 0 with value: 0.7288625318704042.
[I 2025-02-25 01:29:50,482] Trial 1 finished with value: 0.7154013668730672 and parameters: {'n_estimators': 3305, 'max_depth': 4, 'min_samples_split': 17, 'min_samples_leaf': 11}. Best is trial 0 with value: 0.7288625318704042.
[I 2025-02-25 01:35:11,107] Trial 2 finished with value: 0.7351249815618514 and parameters: {'n_estimators': 1367, 'max_depth': 33, 'min_samples_split': 13, 'min_samples_leaf': 15}. Best is trial 2 with value: 0.7351249815618514.
[I 2025-02-25 01:42:10,544] Trial 3 finished with value: 0.7341649178839755 and parameters: {'n_estimators': 1694, 'max_depth': 27, 'min_samples_split': 19, 'min_samples_leaf': 7}. Best is trial 2 wi

.