## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

Import

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

### Data Load

In [2]:
# 데이터 로드
Total_train = pd.read_csv('../data/Total_train_dataset_38.csv')
Total_test = pd.read_csv('../data/Total_test_dataset_38.csv')

In [3]:
# ID 열을 제외한 특성과 타겟 변수 분리
Total_X = Total_train.drop(['임신_성공_여부', 'ID'], axis=1)
Total_y = Total_train['임신_성공_여부']

### 인코딩 

In [4]:
Total_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256344 entries, 0 to 256343
Data columns (total 100 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   시술_당시_나이               256344 non-null  object 
 1   임신_시도_또는_마지막_임신_경과_연수  256344 non-null  float64
 2   배란_자극_여부               256344 non-null  int64  
 3   배란_유도_유형               256344 non-null  object 
 4   단일_배아_이식_여부            256344 non-null  float64
 5   착상_전_유전_검사_사용_여부       256344 non-null  float64
 6   착상_전_유전_진단_사용_여부       256344 non-null  float64
 7   남성_주_불임_원인             256344 non-null  int64  
 8   남성_부_불임_원인             256344 non-null  int64  
 9   여성_주_불임_원인             256344 non-null  int64  
 10  여성_부_불임_원인             256344 non-null  int64  
 11  부부_주_불임_원인             256344 non-null  int64  
 12  부부_부_불임_원인             256344 non-null  int64  
 13  불임원인여부_불명확             256344 non-null  int64  
 14  불임_원인_-_난관_질환          256344 non-n

In [5]:
Total_categorical_columns = [
    "시술_당시_나이",
    "배란_유도_유형",
    "난자_출처",
    "정자_출처",
    "난자_기증자_나이",
    "정자_기증자_나이"
]

In [6]:
# 모든 범주형 변수를 문자열로 변환
Total_X[Total_categorical_columns] = Total_X[Total_categorical_columns].astype(str)
Total_test[Total_categorical_columns] = Total_test[Total_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
Total_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

Total_X[Total_categorical_columns] = Total_encoder.fit_transform(Total_X[Total_categorical_columns])
Total_test[Total_categorical_columns] = Total_encoder.transform(Total_test[Total_categorical_columns])

## Modeling

In [7]:
# 데이터 분할
Total_X_train, Total_X_test, Total_y_train, Total_y_test = train_test_split(Total_X, Total_y, test_size=0.2, random_state=42)

### Total 데이터

In [8]:
# %pip install flaml

In [None]:
import pandas as pd
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 데이터 분할
Total_X_train, Total_X_test, Total_y_train, Total_y_test = train_test_split(Total_X, Total_y, test_size=0.2, random_state=42)

# AutoML 초기화 (로그 출력을 줄이기 위해 log_type 설정)
automl = AutoML(
    log_type="silent",
    metric='roc_auc',  # 최적화할 평가 지표
    time_budget=60*30,  # 학습에 사용할 최대 시간 (초)
    task="classification",  # 분류
    )

# 모델 학습
automl.fit(X_train=Total_X_train, y_train=Total_y_train)

# 최적의 모델 출력
print(automl.model.estimator)

# 예측
y_pred = automl.predict(Total_X_test)
y_pred_proba = automl.predict_proba(Total_X_test)[:, 1]

# 평가
accuracy = accuracy_score(Total_y_test, y_pred)
f1 = f1_score(Total_y_test, y_pred)
auc = roc_auc_score(Total_y_test, y_pred_proba)
cm = confusion_matrix(Total_y_test, y_pred)

# 결과 출력
print()
print("--- Model Performance ---")
print(f"Model Accuracy: {accuracy}")
print(f"Model F1 Score: {f1}")
print(f"Model AUC: {auc}")

# 혼동 행렬 출력
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=automl.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()

[flaml.automl.logger: 02-18 00:30:49] {1680} INFO - task = classification
[flaml.automl.logger: 02-18 00:30:49] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 02-18 00:30:49] {1789} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 02-18 00:30:49] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 02-18 00:30:49] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 02-18 00:30:49] {2346} INFO - Estimated sufficient time budget=14029s. Estimated necessary time budget=344s.
[flaml.automl.logger: 02-18 00:30:49] {2398} INFO -  at 5.4s,	estimator lgbm's best error=0.2924,	best estimator lgbm's best error=0.2924
[flaml.automl.logger: 02-18 00:30:49] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 02-18 00:30:49] {2398} INFO -  at 5.5s,	estimator lgbm's best error=0.2924,	best estimator lgbm's best error=0.2924
[flaml.automl.lo



[flaml.automl.logger: 02-18 00:32:48] {2398} INFO -  at 123.7s,	estimator lrl1's best error=0.2864,	best estimator xgb_limitdepth's best error=0.2679
[flaml.automl.logger: 02-18 00:32:48] {2219} INFO - iteration 75, current learner xgb_limitdepth




[flaml.automl.logger: 02-18 00:32:51] {2398} INFO -  at 126.7s,	estimator xgb_limitdepth's best error=0.2679,	best estimator xgb_limitdepth's best error=0.2679
[flaml.automl.logger: 02-18 00:32:51] {2219} INFO - iteration 76, current learner xgb_limitdepth
[flaml.automl.logger: 02-18 00:32:55] {2398} INFO -  at 131.3s,	estimator xgb_limitdepth's best error=0.2679,	best estimator xgb_limitdepth's best error=0.2679
[flaml.automl.logger: 02-18 00:32:55] {2219} INFO - iteration 77, current learner xgb_limitdepth
[flaml.automl.logger: 02-18 00:32:57] {2398} INFO -  at 133.5s,	estimator xgb_limitdepth's best error=0.2679,	best estimator xgb_limitdepth's best error=0.2679
[flaml.automl.logger: 02-18 00:32:57] {2219} INFO - iteration 78, current learner rf
[flaml.automl.logger: 02-18 00:32:58] {2398} INFO -  at 134.0s,	estimator rf's best error=0.2811,	best estimator xgb_limitdepth's best error=0.2679
[flaml.automl.logger: 02-18 00:32:58] {2219} INFO - iteration 79, current learner rf
[flaml.a



[flaml.automl.logger: 02-18 00:34:22] {2398} INFO -  at 217.7s,	estimator extra_tree's best error=0.2878,	best estimator xgb_limitdepth's best error=0.2679
[flaml.automl.logger: 02-18 00:34:22] {2219} INFO - iteration 96, current learner extra_tree
[flaml.automl.logger: 02-18 00:34:22] {2398} INFO -  at 218.2s,	estimator extra_tree's best error=0.2878,	best estimator xgb_limitdepth's best error=0.2679
[flaml.automl.logger: 02-18 00:34:22] {2219} INFO - iteration 97, current learner rf
[flaml.automl.logger: 02-18 00:34:23] {2398} INFO -  at 218.9s,	estimator rf's best error=0.2790,	best estimator xgb_limitdepth's best error=0.2679
[flaml.automl.logger: 02-18 00:34:23] {2219} INFO - iteration 98, current learner xgb_limitdepth
[flaml.automl.logger: 02-18 00:34:38] {2398} INFO -  at 233.9s,	estimator xgb_limitdepth's best error=0.2679,	best estimator xgb_limitdepth's best error=0.2679
[flaml.automl.logger: 02-18 00:34:38] {2219} INFO - iteration 99, current learner extra_tree
[flaml.autom



[flaml.automl.logger: 02-18 00:35:23] {2398} INFO -  at 279.5s,	estimator lgbm's best error=0.2707,	best estimator xgb_limitdepth's best error=0.2667
[flaml.automl.logger: 02-18 00:35:23] {2219} INFO - iteration 118, current learner lgbm
[flaml.automl.logger: 02-18 00:35:24] {2398} INFO -  at 280.1s,	estimator lgbm's best error=0.2707,	best estimator xgb_limitdepth's best error=0.2667
[flaml.automl.logger: 02-18 00:35:24] {2219} INFO - iteration 119, current learner xgb_limitdepth
[flaml.automl.logger: 02-18 00:35:39] {2398} INFO -  at 294.6s,	estimator xgb_limitdepth's best error=0.2666,	best estimator xgb_limitdepth's best error=0.2666
[flaml.automl.logger: 02-18 00:35:39] {2219} INFO - iteration 120, current learner xgb_limitdepth
[flaml.automl.logger: 02-18 00:35:55] {2398} INFO -  at 311.4s,	estimator xgb_limitdepth's best error=0.2666,	best estimator xgb_limitdepth's best error=0.2666
[flaml.automl.logger: 02-18 00:35:55] {2219} INFO - iteration 121, current learner lgbm
[flaml.a



[flaml.automl.logger: 02-18 00:39:18] {2398} INFO -  at 514.1s,	estimator extra_tree's best error=0.2780,	best estimator xgb_limitdepth's best error=0.2666
[flaml.automl.logger: 02-18 00:39:18] {2219} INFO - iteration 160, current learner catboost
[flaml.automl.logger: 02-18 00:39:19] {2398} INFO -  at 514.8s,	estimator catboost's best error=0.2734,	best estimator xgb_limitdepth's best error=0.2666
[flaml.automl.logger: 02-18 00:39:19] {2219} INFO - iteration 161, current learner xgboost
[flaml.automl.logger: 02-18 00:39:19] {2398} INFO -  at 515.3s,	estimator xgboost's best error=0.2723,	best estimator xgb_limitdepth's best error=0.2666
[flaml.automl.logger: 02-18 00:39:19] {2219} INFO - iteration 162, current learner extra_tree
[flaml.automl.logger: 02-18 00:39:21] {2398} INFO -  at 517.0s,	estimator extra_tree's best error=0.2780,	best estimator xgb_limitdepth's best error=0.2666
[flaml.automl.logger: 02-18 00:39:21] {2219} INFO - iteration 163, current learner catboost
[flaml.autom



[flaml.automl.logger: 02-18 00:39:54] {2398} INFO -  at 549.8s,	estimator xgb_limitdepth's best error=0.2665,	best estimator xgb_limitdepth's best error=0.2665
[flaml.automl.logger: 02-18 00:39:54] {2219} INFO - iteration 167, current learner catboost
[flaml.automl.logger: 02-18 00:39:56] {2398} INFO -  at 551.7s,	estimator catboost's best error=0.2734,	best estimator xgb_limitdepth's best error=0.2665
[flaml.automl.logger: 02-18 00:39:56] {2219} INFO - iteration 168, current learner xgboost
[flaml.automl.logger: 02-18 00:39:57] {2398} INFO -  at 553.3s,	estimator xgboost's best error=0.2715,	best estimator xgb_limitdepth's best error=0.2665
[flaml.automl.logger: 02-18 00:39:57] {2219} INFO - iteration 169, current learner xgboost
[flaml.automl.logger: 02-18 00:39:58] {2398} INFO -  at 554.2s,	estimator xgboost's best error=0.2715,	best estimator xgb_limitdepth's best error=0.2665
[flaml.automl.logger: 02-18 00:39:58] {2219} INFO - iteration 170, current learner extra_tree
[flaml.autom



[flaml.automl.logger: 02-18 02:19:33] {2398} INFO -  at 6528.9s,	estimator rf's best error=0.2687,	best estimator xgb_limitdepth's best error=0.2657
[flaml.automl.logger: 02-18 02:19:33] {2219} INFO - iteration 368, current learner lgbm
[flaml.automl.logger: 02-18 02:19:45] {2398} INFO -  at 6540.9s,	estimator lgbm's best error=0.2662,	best estimator xgb_limitdepth's best error=0.2657
[flaml.automl.logger: 02-18 02:19:45] {2219} INFO - iteration 369, current learner lgbm
[flaml.automl.logger: 02-18 02:19:50] {2398} INFO -  at 6545.7s,	estimator lgbm's best error=0.2662,	best estimator xgb_limitdepth's best error=0.2657
[flaml.automl.logger: 02-18 02:19:50] {2219} INFO - iteration 370, current learner lgbm
[flaml.automl.logger: 02-18 02:19:59] {2398} INFO -  at 6554.9s,	estimator lgbm's best error=0.2662,	best estimator xgb_limitdepth's best error=0.2657
[flaml.automl.logger: 02-18 02:19:59] {2219} INFO - iteration 371, current learner xgb_limitdepth
[flaml.automl.logger: 02-18 02:20:19

----

.