## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

Import

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [2]:
!pip install xgboost



In [3]:
from xgboost import XGBClassifier

In [4]:
import pandas as pd
import optuna
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import lightgbm as lgb

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Data Load

In [6]:
# 데이터 로드
Total_train = pd.read_csv('/content/drive/MyDrive/LG_Aimers_6th/Total_train_dataset_32.csv')
Total_test = pd.read_csv('/content/drive/MyDrive/LG_Aimers_6th/Total_test_dataset_32.csv')

In [7]:
# ID 열을 제외한 특성과 타겟 변수 분리
Total_X = Total_train.drop(['임신_성공_여부', 'ID'], axis=1)
Total_y = Total_train['임신_성공_여부']

### 인코딩

In [8]:
Total_categorical_columns = [
    "시술_당시_나이",
    "배란_유도_유형",
    "난자_출처",
    "정자_출처",
    "난자_기증자_나이",
    "정자_기증자_나이"
]

In [9]:
# 모든 범주형 변수를 문자열로 변환
Total_X[Total_categorical_columns] = Total_X[Total_categorical_columns].astype(str)
Total_test[Total_categorical_columns] = Total_test[Total_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
Total_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

Total_X[Total_categorical_columns] = Total_encoder.fit_transform(Total_X[Total_categorical_columns])
Total_test[Total_categorical_columns] = Total_encoder.transform(Total_test[Total_categorical_columns])

## Modeling

In [10]:
# 데이터 분할
Total_X_train, Total_X_test, Total_y_train, Total_y_test = train_test_split(Total_X, Total_y, test_size=0.2, random_state=42)

### Total 데이터

optuna

In [13]:
# 목적 함수 정의
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 4000),
        'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 15),

        'alpha': trial.suggest_float('alpha', 0.00001, 0.01, log=True),
        'gamma': trial.suggest_float('gamma', 0.00001, 0.01, log=True),

        'reg_alpha': trial.suggest_float('reg_alpha', 0.3, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 1),

        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'objective': 'binary:logistic',  # 이진 분류
        'tree_method': 'hist',            # 트리 메소드
        'device': 'cuda',                 # GPU 사용 설정

        'random_state': 42,
        'eval_metric': 'auc',             # 평가 지표
    }

    model = XGBClassifier(**param)
    model.fit(Total_X_train, Total_y_train)

    y_pred_proba = model.predict_proba(Total_X_test)[:, 1]

    auc = roc_auc_score(Total_y_test, y_pred_proba)
    return auc

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


[I 2025-02-16 06:11:44,262] A new study created in memory with name: no-name-679f06f9-0022-4d1a-98a9-a9d9b2094e4c
[I 2025-02-16 06:12:12,784] Trial 0 finished with value: 0.7184404527524599 and parameters: {'n_estimators': 2702, 'learning_rate': 0.030063459602288976, 'max_depth': 8, 'alpha': 0.004416113975428578, 'gamma': 0.00032161335787041196, 'reg_alpha': 0.8916248668129136, 'reg_lambda': 0.6363838018094243, 'colsample_bytree': 0.9942028707250581, 'subsample': 0.39525736349704066}. Best is trial 0 with value: 0.7184404527524599.
[I 2025-02-16 06:14:19,105] Trial 1 finished with value: 0.6791416729055249 and parameters: {'n_estimators': 3553, 'learning_rate': 0.08486943320232009, 'max_depth': 12, 'alpha': 2.215450173540796e-05, 'gamma': 0.006255511221221006, 'reg_alpha': 0.39037183761315947, 'reg_lambda': 0.7432740154295269, 'colsample_bytree': 0.9983126697782241, 'subsample': 0.5737765030643056}. Best is trial 0 with value: 0.7184404527524599.
[I 2025-02-16 06:14:26,205] Trial 2 fin

Best trial:
  Value: 0.739541101909633
  Params: 
    n_estimators: 2779
    learning_rate: 0.008864633411108202
    max_depth: 4
    alpha: 0.006494427230531438
    gamma: 1.6682946942701627e-05
    reg_alpha: 0.91386214962831
    reg_lambda: 0.6093389417370965
    colsample_bytree: 0.8753058213498945
    subsample: 0.7097325041124595


.