## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

Import

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

### Data Load

In [2]:
# 데이터 로드
DI_train = pd.read_csv('../data/DI_train_dataset_50.csv')
DI_test = pd.read_csv('../data/DI_test_dataset_50.csv')

In [3]:
# ID 열을 제외한 특성과 타겟 변수 분리
DI_X = DI_train.drop(['임신_성공_여부', 'ID'], axis=1)
DI_y = DI_train['임신_성공_여부']

In [4]:
print(f"DI_X shape: {DI_X.shape}")
print(f"DI_test shape: {DI_test.drop('ID', axis=1).shape}")

DI_X shape: (6283, 22)
DI_test shape: (2176, 22)


### 인코딩 

In [5]:
DI_categorical_columns = [
    "특정_시술_유형",
    "정자_기증자_나이"
]

In [6]:
# 모든 범주형 변수를 문자열로 변환
DI_X[DI_categorical_columns] = DI_X[DI_categorical_columns].astype(str)
DI_test[DI_categorical_columns] = DI_test[DI_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
DI_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

DI_X[DI_categorical_columns] = DI_encoder.fit_transform(DI_X[DI_categorical_columns])
DI_test[DI_categorical_columns] = DI_encoder.transform(DI_test[DI_categorical_columns])

## Modeling

In [7]:
# 데이터 분할
DI_X_train, DI_X_test, DI_y_train, DI_y_test = train_test_split(DI_X, DI_y, test_size=0.2, random_state=42)

### DI 데이터

In [8]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# 목적 함수 정의
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000),
        'num_leaves': trial.suggest_int('num_leaves', 1000, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.00005, 0.5),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 500),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 200.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 200.0),

        'random_state': 42,
        'n_jobs': -1,
        'metric': 'auc',
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': -1   
    }

    model = lgb.LGBMClassifier(**param)
    model.fit(DI_X_train, DI_y_train)
    
    y_pred_proba = model.predict_proba(DI_X_test)[:, 1]
    
    auc = roc_auc_score(DI_y_test, y_pred_proba)
    return auc

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2000)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-02-22 22:45:12,488] A new study created in memory with name: no-name-796d8115-64e7-44c1-a359-0466c92e31c8
[I 2025-02-22 22:45:13,069] Trial 0 finished with value: 0.6661432689180247 and parameters: {'n_estimators': 4390, 'num_leaves': 2025, 'learning_rate': 0.05688533420753473, 'min_child_samples': 105, 'reg_alpha': 34.59759388296175, 'reg_lambda': 88.77351038758006}. Best is trial 0 with value: 0.6661432689180247.
[I 2025-02-22 22:45:13,478] Trial 1 finished with value: 0.6098013705543904 and parameters: {'n_estimators': 4116, 'num_leaves': 2218, 'learning_rate': 0.08806525950435057, 'min_child_samples': 99, 'reg_alpha': 115.11658316740045, 'reg_lambda': 76.21657855255357}. Best is trial 0 with value: 0.6661432689180247.
[I 2025-02-22 22:45:13,593] Trial 2 finished with value: 0.6683050885476833 and parameters: {'n_estimators': 1416, 'num_leaves': 1023, 'learning_rate': 0.22991545191026336, 'min_child_samples': 311, 'reg_alpha': 55.984409055576734, 'reg_lambda': 32.76888210791

Best trial:
  Value: 0.6825181413398236
  Params: 
    n_estimators: 3199
    num_leaves: 1693
    learning_rate: 0.3779934369562745
    min_child_samples: 246
    reg_alpha: 5.632304254879699
    reg_lambda: 8.873297193631082


In [9]:
# import optuna
# import lightgbm as lgb
# from sklearn.metrics import roc_auc_score

# # 목적 함수 정의
# def objective(trial):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 500, 9000),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 2048),
#         'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
#         'subsample': trial.suggest_float('subsample', 0.2, 1.0),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),

#         'random_state': 42,
#         'scale_pos_weight': 6.757,
#         'objective': 'binary',
#         'metric': 'auc',
#         'verbose': -1     
#     }

#     model = lgb.LGBMClassifier(**param)
#     model.fit(DI_X_train, DI_y_train)
    
#     y_pred_proba = model.predict_proba(DI_X_test)[:, 1]
    
#     auc = roc_auc_score(DI_y_test, y_pred_proba)
#     return auc

# # Optuna 스터디 생성 및 최적화 실행
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=2000)

# # 최적의 하이퍼파라미터 출력
# print("Best trial:")
# trial = study.best_trial
# print(f"  Value: {trial.value}")
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")

.