## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

### Import

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

### Data Load

In [2]:
# Load the DI train and test CSV files into DataFrames.
DI_train, DI_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/DI_train_dataset_53.csv',
        '../data/DI_test_dataset_53.csv',
    )
)

### 인코딩 

In [3]:
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag

# Target column and the metric AutoGluon optimises / reports.
label, eval_metric = '임신_성공_여부', 'roc_auc'

# Wrap the raw DataFrames as AutoGluon tabular datasets.
test_data = TabularDataset(DI_test)
train_data = TabularDataset(DI_train)

In [None]:
from autogluon.tabular import TabularPredictor

# Training time budget in seconds (2 hours).
time_limit = 2 * 60 * 60

# Optional: model families to exclude (those that cannot use a GPU).
# exclude_model_types = [
#     'KNN',  # K-Nearest Neighbors
#     'RF',   # Random Forest
#     'XT',   # Extra Trees
#     'LR',   # Linear Regression
#     'NN'    # Tabular Neural Network
# ]

# Step 1: create the predictor; models are saved under `path`.
predictor = TabularPredictor(
    label=label,
    eval_metric=eval_metric,
    path='AutogluonModels/ag-20250224_code53_DI',
)

# Step 2: train. `predictor` is rebound to the value returned by fit(),
# exactly as in a chained `TabularPredictor(...).fit(...)` expression.
predictor = predictor.fit(
    train_data,
    # Preset name, e.g. 'best_quality', 'medium_quality', 'good_quality'.
    presets='best_quality',
    # num_stack_levels=0,  # stack levels; per the original note, ignored when dynamic_stacking=True (default)
    # Number of bagging folds.
    num_bag_folds=5,
    # Overall training time limit.
    time_limit=time_limit,
    # num_gpus=1,  # enable GPU usage
    # excluded_model_types=exclude_model_types  # model families to skip
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.8
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          16
Memory Avail:       10.92 GB / 15.86 GB (68.9%)
Disk Space Avail:   180.66 GB / 476.30 GB (37.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 150s of the 600s of remaining time (25%).
	Running

In [None]:
# Show the per-model validation leaderboard (silent=True, so only this print emits it).
print(
    predictor.leaderboard(silent=True)
)

                          model  score_val eval_metric  pred_time_val  \
0           WeightedEnsemble_L3   0.725367     roc_auc       3.127192   
1         ExtraTreesEntr_BAG_L2   0.718222     roc_auc       2.656744   
2         ExtraTreesGini_BAG_L2   0.708253     roc_auc       2.667659   
3            CatBoost_r9_BAG_L2   0.700075     roc_auc       2.476366   
4             LightGBMXT_BAG_L2   0.699705     roc_auc       2.474246   
5               CatBoost_BAG_L2   0.698803     roc_auc       2.455585   
6           WeightedEnsemble_L2   0.690819     roc_auc       0.462723   
7          CatBoost_r177_BAG_L2   0.689251     roc_auc       2.470236   
8          CatBoost_r137_BAG_L1   0.687894     roc_auc       0.017632   
9           CatBoost_r50_BAG_L1   0.686979     roc_auc       0.036760   
10          LightGBM_r96_BAG_L2   0.686639     roc_auc       2.502582   
11      RandomForestEntr_BAG_L2   0.686125     roc_auc       2.676092   
12              CatBoost_BAG_L1   0.685897     roc_

In [None]:
# predictor.feature_importance(train_data)

In [None]:
# Pick the model the predictor reports as its best one.
model_to_use = predictor.model_best

# Predict class probabilities for the test set with that model.
prob_predictions = predictor.predict_proba(
    test_data,
    model=model_to_use,
)

In [None]:
# Attach the positive-class probability to the test rows.
# The column is selected by label (predictor.positive_class) instead of by
# position, so the result does not depend on the column order of the
# probability table returned by predict_proba.
test_data['probability'] = prob_predictions[predictor.positive_class]

# Build the submission table: the 'ID' and 'probability' columns, ordered by ID.
submission = test_data[['ID', 'probability']].sort_values(by='ID')

# Write the submission file (no index column, UTF-8).
submission.to_csv('../submission/code53_DI_lgbm2.csv', index=False, encoding='utf-8')

# Preview the first rows of the submission.
print(submission.head())

           ID  probability
0  TEST_00026     0.083821
1  TEST_00051     0.204894
2  TEST_00076     0.011036
3  TEST_00088     0.153002
4  TEST_00100     0.100460


In [None]:
import pandas as pd

# Read the two submission files (IVF and DI).
submission_ivf = pd.read_csv('../submission/code53_IVF_lgbm2.csv')
submission_di = pd.read_csv('../submission/code53_DI_lgbm2.csv')

# Stack the two frames row-wise (a concatenation, not a key-based join),
# then order the combined rows by ID.
merged_submission = pd.concat([submission_ivf, submission_di])
merged_submission = merged_submission.sort_values(by='ID')

# Write the combined submission file (no index column, UTF-8).
merged_submission.to_csv('../submission/code53_merged_lgbm2.csv', index=False, encoding='utf-8')

# Preview the first rows of the combined submission.
print(merged_submission.head())

           ID  probability
0  TEST_00000     0.001556
1  TEST_00001     0.002249
2  TEST_00002     0.151962
3  TEST_00003     0.103942
4  TEST_00004     0.512530


2025-02-24 21:47:51,578	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2025-02-24 21:47:51,581	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2025-02-24 21:49:03,187	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2025-02-24 21:49:12,469	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2025-02-24 21:49:12,469	ERROR worker.py:422 -- Unhandled error (suppress with 'RAY_IGNORE_UN

데이콘 PUBLIC xx

----

.