## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

Import

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

### Data Load

In [2]:
# 데이터 로드
IVF_train = pd.read_csv('../data/IVF_train_dataset_26.csv')
IVF_test = pd.read_csv('../data/IVF_test_dataset_26.csv')

DI_train = pd.read_csv('../data/DI_train_dataset_26_xx.csv')
DI_test = pd.read_csv('../data/DI_test_dataset_26_xx.csv')

In [3]:
# ID 열을 제외한 특성과 타겟 변수 분리
IVF_X = IVF_train.drop(['임신_성공_여부', 'ID'], axis=1)
IVF_y = IVF_train['임신_성공_여부']

DI_X = DI_train.drop(['임신_성공_여부', 'ID'], axis=1)
DI_y = DI_train['임신_성공_여부']

In [4]:
print(f"IVF_X shape: {IVF_X.shape}")
print(f"IVF_test shape: {IVF_test.drop('ID', axis=1).shape}")
print(f"DI_X shape: {DI_X.shape}")
print(f"DI_test shape: {DI_test.drop('ID', axis=1).shape}")

IVF_X shape: (250052, 77)
IVF_test shape: (87891, 77)
DI_X shape: (6290, 26)
DI_test shape: (2176, 26)


### 인코딩 

In [5]:
IVF_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "특정_시술_유형",
    "배란_유도_유형",
    "난자_출처",
    "정자_출처",
    "난자_기증자_나이",
    "정자_기증자_나이",
    "채취_해동_차이",
    "해동_혼합_차이",
    "혼합_이식_차이",
    "이식_해동_차이"
]

In [6]:
DI_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "특정_시술_유형",
    "정자_기증자_나이"
]

In [7]:
# 모든 범주형 변수를 문자열로 변환
IVF_X[IVF_categorical_columns] = IVF_X[IVF_categorical_columns].astype(str)
DI_X[DI_categorical_columns] = DI_X[DI_categorical_columns].astype(str)
IVF_test[IVF_categorical_columns] = IVF_test[IVF_categorical_columns].astype(str)
DI_test[DI_categorical_columns] = DI_test[DI_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
IVF_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
DI_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

IVF_X[IVF_categorical_columns] = IVF_encoder.fit_transform(IVF_X[IVF_categorical_columns])
DI_X[DI_categorical_columns] = DI_encoder.fit_transform(DI_X[DI_categorical_columns])
IVF_test[IVF_categorical_columns] = IVF_encoder.transform(IVF_test[IVF_categorical_columns])
DI_test[DI_categorical_columns] = DI_encoder.transform(DI_test[DI_categorical_columns])

## Modeling

In [8]:
# 데이터 분할
IVF_X_train, IVF_X_test, IVF_y_train, IVF_y_test = train_test_split(IVF_X, IVF_y, test_size=0.2, random_state=42)
DI_X_train, DI_X_test, DI_y_train, DI_y_test = train_test_split(DI_X, DI_y, test_size=0.2, random_state=42)

### DI 데이터

In [10]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd

# 목적 함수 정의
def objective(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 500, 5000),
        'depth': trial.suggest_int('depth', 3, 16),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_seed': 42,
        'eval_metric': 'AUC',
        'logging_level': 'Silent'
    }

    model = CatBoostClassifier(**param)
    model.fit(DI_X_train, DI_y_train, eval_set=(DI_X_test, DI_y_test), early_stopping_rounds=100, verbose=False)
    
    y_pred_proba = model.predict_proba(DI_X_test)[:, 1]
    
    auc = roc_auc_score(DI_y_test, y_pred_proba)
    return auc

# Optuna 스터디 생성 및 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=4000)

# 모든 행과 열이 잘리지 않도록 출력 설정
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# 최적의 하이퍼파라미터 출력
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-02-11 22:14:03,519] A new study created in memory with name: no-name-0952a6df-9f8e-4646-8d40-4f851c60e701
[I 2025-02-11 22:14:04,507] Trial 0 finished with value: 0.707159239842726 and parameters: {'iterations': 1818, 'depth': 4, 'learning_rate': 0.04799965080494408, 'l2_leaf_reg': 0.9966020937245892, 'border_count': 181, 'random_strength': 0.10181049760020378, 'bagging_temperature': 0.5228048926179374}. Best is trial 0 with value: 0.707159239842726.
[I 2025-02-11 22:16:11,868] Trial 1 finished with value: 0.7019604630843163 and parameters: {'iterations': 3487, 'depth': 14, 'learning_rate': 0.013429403743572162, 'l2_leaf_reg': 0.010606332612964477, 'border_count': 244, 'random_strength': 0.6802322942372138, 'bagging_temperature': 0.3231929900356806}. Best is trial 0 with value: 0.707159239842726.
[I 2025-02-11 22:16:15,060] Trial 2 finished with value: 0.7080329838357362 and parameters: {'iterations': 1109, 'depth': 4, 'learning_rate': 0.009426545369399094, 'l2_leaf_reg': 0.809

Best trial:
  Value: 0.7265508955875928
  Params: 
    iterations: 2450
    depth: 6
    learning_rate: 0.05561117585406376
    l2_leaf_reg: 0.01615957660842957
    border_count: 67
    random_strength: 1.2393361428020702
    bagging_temperature: 0.7898024543180192


.