## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

### Import

In [7]:
import pandas as pd
import optuna
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import lightgbm as lgb

### Data Load

In [8]:
# Load the training and test datasets from CSV.
train_csv_path = '../data/Total_train_dataset_30.csv'
test_csv_path = '../data/Total_test_dataset_30.csv'

Total_train = pd.read_csv(train_csv_path)
Total_test = pd.read_csv(test_csv_path)

In [9]:
# Separate the target from the features. The ID column is dropped together
# with the target so that it does not end up in the feature matrix.
Total_y = Total_train['임신_성공_여부']
Total_X = Total_train.drop(columns=['임신_성공_여부', 'ID'])

### 인코딩 

In [10]:
# Columns that are cast to str and ordinal-encoded before modeling.
# The group headings are only a rough English gloss of the Korean column
# names; the order of the entries is unchanged.
Total_categorical_columns = [
    # Procedure / patient descriptors
    "시술_시기_코드",
    "시술_당시_나이",
    "특정_시술_유형",
    "배란_유도_유형",
    "단일_배아_이식_여부",
    "착상_전_유전_진단_사용_여부",
    "배아_생성_주요_이유",

    # Embryo and oocyte counts
    "총_생성_배아_수",
    "미세주입된_난자_수",
    "미세주입에서_생성된_배아_수",
    "이식된_배아_수",
    "미세주입_배아_이식_수",
    "저장된_배아_수",
    "미세주입_후_저장된_배아_수",
    "해동된_배아_수",
    "해동_난자_수",
    "수집된_신선_난자_수",
    "저장된_신선_난자_수",
    "혼합된_난자_수",
    "파트너_정자와_혼합된_난자_수",
    "기증자_정자와_혼합된_난자_수",

    # Gamete source and donor age
    "난자_출처",
    "정자_출처",
    "난자_기증자_나이",
    "정자_기증자_나이",

    # Usage flags
    "동결_배아_사용_여부",
    "신선_배아_사용_여부",
    "기증_배아_사용_여부",
    "대리모_여부",

    # Ratio and aggregate columns
    "ICSI_배아_이식_비율",
    "ICSI_배아_생성_비율",
    "ICSI_성공률",
    "해동_배아_비율",
    "총_배아_수",
    "이식된_배아_수_비율",
    "저장된_배아_비율",
    "정자와_혼합된_난자_비율",
    "사용된_신선_난자_수",
    "사용된_신선_난자_수_비율",
    "사용된_난자_수",
    "혼합된_난자_수_비율",
]

In [11]:
# Cast every column in Total_categorical_columns to str, in both the training
# features and the test set, so the encoder sees a single dtype per column.
# NOTE(review): if any of these columns hold numbers, the encoder's categories
# are sorted as strings (e.g. "10" before "2"), so the encoded integers would
# not follow numeric order — confirm this is intended.
for frame in (Total_X, Total_test):
    frame[Total_categorical_columns] = frame[Total_categorical_columns].astype(str)

# Learn the category -> integer mapping from the training features only;
# categories that were not seen during fit are encoded as -1.
Total_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
Total_encoder.fit(Total_X[Total_categorical_columns])

# Apply the fitted mapping to the training features, then to the test set.
for frame in (Total_X, Total_test):
    frame[Total_categorical_columns] = Total_encoder.transform(frame[Total_categorical_columns])

## Modeling

In [12]:
# Split the data: 20% of the rows are held out, and the fixed seed keeps the
# split repeatable between runs.
(
    Total_X_train,
    Total_X_test,
    Total_y_train,
    Total_y_test,
) = train_test_split(
    Total_X,
    Total_y,
    test_size=0.2,
    random_state=42,
)

### Total 데이터

optuna

[I 2025-02-14 15:54:28,379] Trial 199 finished with value: 0.7376697843111978 and parameters: {'n_estimators': 6571, 'num_leaves': 43, 'learning_rate': 0.0012149625067437468, 'reg_alpha': 4.999132138818733, 'reg_lambda': 0.03825845725250004}. Best is trial 199 with value: 0.7376697843111978.

In [13]:
# Objective function for the Optuna search.
def objective(trial):
    """Fit a LightGBM classifier with sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC computed on ``Total_X_test`` / ``Total_y_test`` from the
        second column of ``predict_proba``.
    """
    # Hyperparameters sampled by the trial. The suggest calls run in the
    # order written here.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 2000, 10000),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 10.0, log=True),
    }

    # Settings that are the same for every trial.
    fixed = {
        'random_state': 42,
        'n_jobs': -1,
        'metric': 'auc',
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': -1,
    }

    classifier = lgb.LGBMClassifier(**sampled, **fixed)
    classifier.fit(Total_X_train, Total_y_train)

    # Column 1 of predict_proba is the score passed to the AUC computation.
    held_out_scores = classifier.predict_proba(Total_X_test)[:, 1]
    return roc_auc_score(Total_y_test, held_out_scores)

# Create the Optuna study and run the optimization.
# The sampler is seeded explicitly so the sequence of suggested
# hyperparameters can be repeated across runs, in line with the fixed
# random_state=42 already used for the data split and the model. TPESampler
# is the sampler Optuna uses by default for a single-objective study, so only
# the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=1200)

# Print the best trial's objective value and its hyperparameters.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-02-14 11:22:10,673] A new study created in memory with name: no-name-d9794f84-9c04-40df-9afd-ea20c8cc6f6a
[I 2025-02-14 11:24:42,651] Trial 0 finished with value: 0.6951098609511569 and parameters: {'n_estimators': 8652, 'num_leaves': 418, 'learning_rate': 0.026122046917325448, 'reg_alpha': 0.02447576576829815, 'reg_lambda': 0.13878380706152424}. Best is trial 0 with value: 0.6951098609511569.
[I 2025-02-14 11:25:33,648] Trial 1 finished with value: 0.6932801363413399 and parameters: {'n_estimators': 2491, 'num_leaves': 494, 'learning_rate': 0.08765061757358393, 'reg_alpha': 0.0004385723016211602, 'reg_lambda': 0.004206849001430877}. Best is trial 0 with value: 0.6951098609511569.
[I 2025-02-14 11:26:30,075] Trial 2 finished with value: 0.7341151844016731 and parameters: {'n_estimators': 4384, 'num_leaves': 119, 'learning_rate': 0.00014349599535543064, 'reg_alpha': 0.0004157198479262423, 'reg_lambda': 0.026565984817790485}. Best is trial 2 with value: 0.7341151844016731.
[I 202

KeyboardInterrupt: 

.