## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

### Import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

In [2]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    # 모델 학습
    model.fit(X_train, y_train)
    
    # 예측
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # 평가
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    cm = confusion_matrix(y_test, y_pred)
    
    # 결과 출력
    print()
    print("--- Model Performance ---")
    print(f"Model Accuracy: {accuracy}")
    print(f"Model F1 Score: {f1}")
    print(f"Model AUC: {auc}")
    
    # 혼동 행렬 출력
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot(cmap=plt.cm.Blues)
    plt.show()
    
    return y_pred, y_pred_proba

### Data Load

In [3]:
# 데이터 로드
IVF_train = pd.read_csv('../data/IVF_train_dataset_54.csv')
IVF_test = pd.read_csv('../data/IVF_test_dataset_54.csv')

DI_train = pd.read_csv('../data/DI_train_dataset_54.csv')
DI_test = pd.read_csv('../data/DI_test_dataset_54.csv')

In [4]:
# ID 열을 제외한 특성과 타겟 변수 분리
IVF_X = IVF_train.drop(['임신_성공_여부', 'ID'], axis=1)
IVF_y = IVF_train['임신_성공_여부']

DI_X = DI_train.drop(['임신_성공_여부', 'ID'], axis=1)
DI_y = DI_train['임신_성공_여부']

In [5]:
print(f"IVF_X shape: {IVF_X.shape}")
print(f"IVF_test shape: {IVF_test.drop('ID', axis=1).shape}")

print(f"DI_X shape: {DI_X.shape}")
print(f"DI_test shape: {DI_test.drop('ID', axis=1).shape}")

IVF_X shape: (250052, 77)
IVF_test shape: (87891, 77)
DI_X shape: (6289, 31)
DI_test shape: (2176, 31)


### 인코딩 

In [6]:
IVF_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "특정_시술_유형",
    "배란_유도_유형",
    "난자_출처",
    "정자_출처",
    "난자_기증자_나이",
    "정자_기증자_나이",
    "채취_해동_차이",
    "해동_혼합_차이",
    "혼합_이식_차이",
    "이식_해동_차이"
]

In [7]:
DI_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "특정_시술_유형",
    "정자_기증자_나이"
]

In [8]:
# 모든 범주형 변수를 문자열로 변환
IVF_X[IVF_categorical_columns] = IVF_X[IVF_categorical_columns].astype(str)
DI_X[DI_categorical_columns] = DI_X[DI_categorical_columns].astype(str)
IVF_test[IVF_categorical_columns] = IVF_test[IVF_categorical_columns].astype(str)
DI_test[DI_categorical_columns] = DI_test[DI_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
IVF_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
DI_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

IVF_X[IVF_categorical_columns] = IVF_encoder.fit_transform(IVF_X[IVF_categorical_columns])
DI_X[DI_categorical_columns] = DI_encoder.fit_transform(DI_X[DI_categorical_columns])
IVF_test[IVF_categorical_columns] = IVF_encoder.transform(IVF_test[IVF_categorical_columns])
DI_test[DI_categorical_columns] = DI_encoder.transform(DI_test[DI_categorical_columns])

## Modeling

In [9]:
# 데이터 분할
IVF_X_train, IVF_X_test, IVF_y_train, IVF_y_test = train_test_split(IVF_X, IVF_y, test_size=0.2, random_state=42)
DI_X_train, DI_X_test, DI_y_train, DI_y_test = train_test_split(DI_X, DI_y, test_size=0.2, random_state=42)

In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb
import optuna

# 데이터 준비 함수
def prepare_data(X, y):
    return X, y

# 모델 학습 및 예측 함수
def train_and_predict(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)
    return y_pred, y_pred_proba

# Optuna를 사용한 하이퍼파라미터 최적화 함수
def optimize_model(model_class, X_train, y_train, n_trials=1):
    def objective(trial):
        if model_class == lgb.LGBMClassifier:
            params = {
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
                'num_leaves': trial.suggest_int('num_leaves', 20, 150),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'verbose': -1
            }
        elif model_class == CatBoostClassifier:
            params = {
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
                'depth': trial.suggest_int('depth', 4, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 1e-1, log=True),
                'border_count': trial.suggest_int('border_count', 32, 255),
                'verbose': 0
            }
        elif model_class == xgb.XGBClassifier:
            params = {
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
            }
        model = model_class(**params)
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_train)[:, 1]
        auc = roc_auc_score(y_train, y_pred_proba)
        return auc

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return model_class(**study.best_params)

# IVF 데이터 준비
X_train, y_train = prepare_data(IVF_X_train, IVF_y_train)

# 모델 최적화 및 학습
lgb_model = optimize_model(lgb.LGBMClassifier, X_train, y_train)
cat_model = optimize_model(CatBoostClassifier, X_train, y_train)
xgb_model = optimize_model(xgb.XGBClassifier, X_train, y_train)

# DI 데이터 준비
X_train, y_train = prepare_data(DI_X_train, DI_y_train)

# 모델 최적화 및 학습
lgb_model = optimize_model(lgb.LGBMClassifier, X_train, y_train)
cat_model = optimize_model(CatBoostClassifier, X_train, y_train)
xgb_model = optimize_model(xgb.XGBClassifier, X_train, y_train)

# 전체 학습 데이터를 사용하여 모델 학습 및 예측
IVF_X, IVF_y = prepare_data(IVF_X_train, IVF_y_train)
DI_X, DI_y = prepare_data(DI_X_train, DI_y_train)

# IVF 데이터 학습 및 예측
IVF_lgb_pred, IVF_lgb_pred_proba = train_and_predict(lgb_model, IVF_X, IVF_y, IVF_X_test)
IVF_cat_pred, IVF_cat_pred_proba = train_and_predict(cat_model, IVF_X, IVF_y, IVF_X_test)
IVF_xgb_pred, IVF_xgb_pred_proba = train_and_predict(xgb_model, IVF_X, IVF_y, IVF_X_test)

# DI 데이터 학습 및 예측
DI_lgb_pred, DI_lgb_pred_proba = train_and_predict(lgb_model, DI_X, DI_y, DI_X_test)
DI_cat_pred, DI_cat_pred_proba = train_and_predict(cat_model, DI_X, DI_y, DI_X_test)
DI_xgb_pred, DI_xgb_pred_proba = train_and_predict(xgb_model, DI_X, DI_y, DI_X_test)

# 스태킹을 위한 데이터 준비
IVF_stack_train = np.vstack((IVF_lgb_pred_proba, IVF_cat_pred_proba, IVF_xgb_pred_proba)).T
DI_stack_train = np.vstack((DI_lgb_pred_proba, DI_cat_pred_proba, DI_xgb_pred_proba)).T

# 메타 모델 학습 및 예측
meta_model = LogisticRegression()
meta_model.fit(np.vstack((IVF_stack_train, DI_stack_train)), np.hstack((IVF_y, DI_y)))

# IVF 데이터에 대한 메타 모델 예측
IVF_stack_test = np.vstack((IVF_lgb_pred_proba, IVF_cat_pred_proba, IVF_xgb_pred_proba)).T
IVF_meta_pred_test_proba = meta_model.predict_proba(IVF_stack_test)[:, 1]

# DI 데이터에 대한 메타 모델 예측
DI_stack_test = np.vstack((DI_lgb_pred_proba, DI_cat_pred_proba, DI_xgb_pred_proba)).T
DI_meta_pred_test_proba = meta_model.predict_proba(DI_stack_test)[:, 1]

# 예측 결과 병합
IVF_X_test['probability'] = IVF_meta_pred_test_proba
DI_X_test['probability'] = DI_meta_pred_test_proba

# 최종 제출 파일 생성
submission = pd.concat([IVF_X_test[['ID', 'probability']], DI_X_test[['ID', 'probability']]], axis=0)
submission = submission.sort_values(by='ID')

# 제출 파일 저장
submission.to_csv('../submission/code54_stacking.csv', index=False, encoding='utf-8')

[I 2025-02-26 23:33:47,899] A new study created in memory with name: no-name-bdc79d1b-61d7-46ea-84c6-a388bfca4e80
[I 2025-02-26 23:33:49,224] Trial 0 finished with value: 0.7500638653809963 and parameters: {'learning_rate': 0.06929149475402178, 'num_leaves': 41, 'min_child_samples': 8, 'subsample': 0.9247459756023721, 'colsample_bytree': 0.6710284392415388}. Best is trial 0 with value: 0.7500638653809963.
[I 2025-02-26 23:33:49,225] A new study created in memory with name: no-name-17603d6d-9b44-4a94-b324-7fd609b63a0f
[I 2025-02-26 23:34:12,238] Trial 0 finished with value: 0.7841891477564719 and parameters: {'learning_rate': 0.09648395324636447, 'depth': 6, 'l2_leaf_reg': 0.0017136802737390804, 'border_count': 91}. Best is trial 0 with value: 0.7841891477564719.
[I 2025-02-26 23:34:12,238] A new study created in memory with name: no-name-06abcf13-7389-4fed-98d5-8c1627384bf5
[I 2025-02-26 23:34:13,125] Trial 0 finished with value: 0.733822123456658 and parameters: {'learning_rate': 0.06

0:	learn: 0.6712654	total: 24.4ms	remaining: 24.4s
1:	learn: 0.6520321	total: 48.9ms	remaining: 24.4s
2:	learn: 0.6384739	total: 73.5ms	remaining: 24.4s
3:	learn: 0.6227285	total: 96.7ms	remaining: 24.1s
4:	learn: 0.6095764	total: 122ms	remaining: 24.3s
5:	learn: 0.6022344	total: 143ms	remaining: 23.7s
6:	learn: 0.5947143	total: 165ms	remaining: 23.4s
7:	learn: 0.5848844	total: 190ms	remaining: 23.5s
8:	learn: 0.5792618	total: 211ms	remaining: 23.2s
9:	learn: 0.5741795	total: 233ms	remaining: 23.1s
10:	learn: 0.5696187	total: 256ms	remaining: 23s
11:	learn: 0.5649718	total: 280ms	remaining: 23.1s
12:	learn: 0.5578269	total: 309ms	remaining: 23.5s
13:	learn: 0.5519203	total: 338ms	remaining: 23.8s
14:	learn: 0.5487429	total: 364ms	remaining: 23.9s
15:	learn: 0.5459510	total: 388ms	remaining: 23.9s
16:	learn: 0.5425609	total: 415ms	remaining: 24s
17:	learn: 0.5397502	total: 443ms	remaining: 24.2s
18:	learn: 0.5376692	total: 469ms	remaining: 24.2s
19:	learn: 0.5354447	total: 494ms	remaini

ValueError: Found input variables with inconsistent numbers of samples: [51269, 205072]

.