## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

### Import

In [2]:
# %pip install pandas numpy scikit-learn matplotlib pycaret lightgbm xgboost catboost

In [3]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

### Data Load

In [4]:
# Load the training and test tables (train first, then test) from the
# relative data directory.
Total_train, Total_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Total_train_dataset_47.csv',
        '../data/Total_test_dataset_47.csv',
    )
)

In [5]:
# Separate the label vector from the feature matrix: the target column is kept
# as Total_y, and the features are every column except the target and 'ID'.
Total_y = Total_train['임신_성공_여부']
Total_X = Total_train.drop(columns=['임신_성공_여부', 'ID'])

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Re-read the training table so this cell can be run on its own, then rebuild
# the feature matrix and the label vector from it.
Total_train = pd.read_csv('../data/Total_train_dataset_47.csv')

# Columns that must not be used as features: the target and the row identifier.
_non_feature_columns = ['임신_성공_여부', 'ID']
Total_X = Total_train.drop(columns=_non_feature_columns)
Total_y = Total_train['임신_성공_여부']


### 인코딩 

In [7]:
# Columns that are cast to strings and ordinal-encoded before modeling.
Total_categorical_columns = list((
    "시술_당시_나이",
    "난자_기증자_나이",
    "정자_기증자_나이",
))

In [8]:
# Cast every categorical column to str in both frames so the encoder sees a
# single, uniform dtype.
for _frame in (Total_X, Total_test):
    _frame[Total_categorical_columns] = _frame[Total_categorical_columns].astype(str)

# Ordinal-encode the categorical columns. The encoder is fitted on the training
# features only; categories unseen at fit time are mapped to -1.
Total_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
Total_encoder.fit(Total_X[Total_categorical_columns])

for _frame in (Total_X, Total_test):
    _frame[Total_categorical_columns] = Total_encoder.transform(_frame[Total_categorical_columns])

## Modeling

In [9]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from pycaret.classification import *

In [10]:
# 필요한 라이브러리 임포트
from pycaret.classification import *
import pandas as pd

# Replace spaces in column names with underscores, for the training features
# and the test table alike.
for _frame in (Total_X, Total_test):
    _frame.columns = _frame.columns.str.replace(' ', '_')

# Stratified 80/20 hold-out split with a fixed seed.
_split_parts = train_test_split(
    Total_X,
    Total_y,
    test_size=0.2,
    random_state=42,
    stratify=Total_y,
)
Total_X_train, Total_X_test, Total_y_train, Total_y_test = _split_parts

# PyCaret expects features and target in one frame, so the training split is
# re-joined column-wise before being handed to setup().
_pycaret_train_frame = pd.concat([Total_X_train, Total_y_train], axis=1)
_setup_options = dict(
    target='임신_성공_여부',
    session_id=42,
    fix_imbalance=True,
    normalize=True,
    feature_selection=True,
)
clf = setup(data=_pycaret_train_frame, **_setup_options)

# Compare the candidate models and keep the five best, ranked by AUC.
best_model = compare_models(n_select=5, sort='AUC')

Unnamed: 0,Description,Value
0,Session id,42
1,Target,임신_성공_여부
2,Target type,Binary
3,Original data shape,"(205075, 95)"
4,Transformed data shape,"(274453, 19)"
5,Transformed train set shape,"(212930, 19)"
6,Transformed test set shape,"(61523, 19)"
7,Numeric features,94
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7403,0.7332,0.2284,0.4945,0.3124,0.1782,0.1985,4.607
gbc,Gradient Boosting Classifier,0.7228,0.7303,0.3856,0.4567,0.4182,0.2379,0.2394,10.978
catboost,CatBoost Classifier,0.7434,0.7298,0.1827,0.5094,0.2689,0.1534,0.1832,22.516
xgboost,Extreme Gradient Boosting,0.7407,0.7289,0.2055,0.496,0.2906,0.164,0.1879,4.501
ada,Ada Boost Classifier,0.6788,0.7193,0.5694,0.4121,0.478,0.2546,0.2617,5.885
ridge,Ridge Classifier,0.6117,0.717,0.7521,0.3747,0.5002,0.237,0.2758,3.605
lda,Linear Discriminant Analysis,0.6117,0.717,0.7522,0.3747,0.5002,0.2371,0.2759,3.637
lr,Logistic Regression,0.6167,0.7145,0.7361,0.3764,0.4981,0.2373,0.2725,4.191
svm,SVM - Linear Kernel,0.5984,0.7141,0.7752,0.3684,0.4993,0.2294,0.2744,3.858
qda,Quadratic Discriminant Analysis,0.5681,0.694,0.7928,0.3512,0.4868,0.2005,0.2509,3.74


In [None]:
# Tune the hyperparameters of each top model from compare_models, optimising AUC.
tuned_models = [tune_model(model, optimize='AUC') for model in best_model]

# Candidate meta-learners for the stacking ensemble.
# LGBMClassifier is reached through the `lgb` alias: the notebook imports
# lightgbm only as `import lightgbm as lgb`, so the bare name is undefined.
meta_models = [
    LogisticRegression(),
    RandomForestClassifier(random_state=42),
    lgb.LGBMClassifier(random_state=42)
]

# Collected as (meta_model, accuracy, f1, auc) tuples, one per meta-learner.
meta_model_performance = []

# Build a stacking ensemble per meta-learner and score it on the hold-out split.
for meta_model in meta_models:
    stacked_model = stack_models(estimator_list=tuned_models, meta_model=meta_model, fold=5)
    final_model = finalize_model(stacked_model)

    # raw_score=True requests one score column per class. Without it the single
    # score column holds the probability of the *predicted* class, which is not
    # a valid ranking score for ROC AUC.
    y_pred = predict_model(final_model, data=Total_X_test, raw_score=True)

    # PyCaret 3.x names the output columns prediction_label / prediction_score_<class>;
    # PyCaret 2.x used Label / Score_<class>. Support both.
    # Assumes the positive class is labelled 1 — TODO confirm against the target column.
    label_column = 'prediction_label' if 'prediction_label' in y_pred.columns else 'Label'
    score_column = 'prediction_score_1' if 'prediction_score_1' in y_pred.columns else 'Score_1'
    predicted_labels = y_pred[label_column]
    positive_scores = y_pred[score_column]

    accuracy = accuracy_score(Total_y_test, predicted_labels)
    f1 = f1_score(Total_y_test, predicted_labels)
    auc = roc_auc_score(Total_y_test, positive_scores)

    meta_model_performance.append((meta_model, accuracy, f1, auc))

    print(f"Meta Model: {meta_model.__class__.__name__}")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"AUC: {auc}")
    print("---")

# Pick the meta-learner with the highest hold-out AUC (tuple index 3).
best_meta_model = max(meta_model_performance, key=lambda x: x[3])[0]

print(f"Best Meta Model: {best_meta_model.__class__.__name__}")

# Rebuild and finalize the stacking ensemble with the winning meta-learner.
final_stacked_model = stack_models(estimator_list=tuned_models, meta_model=best_meta_model, fold=5)
final_model = finalize_model(final_stacked_model)

In [None]:
# Refit the selected model on all labelled data (train + hold-out splits).
final_model.fit(Total_X, Total_y)

# Feature frame for inference. 'ID' is an identifier, not a feature; the
# 'probability' column only exists if this cell has already been run once, and
# must not be fed back to the model as an input (errors='ignore' covers the
# first run, when it is absent).
Total_test_features = Total_test.drop(columns=['ID', 'probability'], errors='ignore')

# Use predict_proba rather than decision_function: the output is stored in a
# column named 'probability', decision_function returns unbounded margins, and
# it is not available at all when the stacking meta-learner is a tree model.
# Column 1 is taken as the positive class — assumes labels are 0/1, TODO confirm.
Total_pred_scores = final_model.predict_proba(Total_test_features)[:, 1]

# Attach the predicted scores to the test table.
Total_test['probability'] = Total_pred_scores

# Build the submission table, ordered by ID.
submission = Total_test[['ID', 'probability']]
submission = submission.sort_values(by='ID')

# Write the submission file.
submission.to_csv('../submission/code47_all_final_model.csv', index=False, encoding='utf-8')

In [None]:
# %pip install autogluon

In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor

ModuleNotFoundError: No module named 'autogluon'

In [None]:
# Normalise column names (spaces -> underscores) on both tables.
Total_test.columns = Total_test.columns.str.replace(' ', '_')
Total_X.columns = Total_X.columns.str.replace(' ', '_')

# Stratified 80/20 hold-out split with a fixed seed.
_split_kwargs = dict(test_size=0.2, random_state=42, stratify=Total_y)
(
    Total_X_train,
    Total_X_test,
    Total_y_train,
    Total_y_test,
) = train_test_split(Total_X, Total_y, **_split_kwargs)

In [None]:
# NOTE(review): the paths, label and metric below do not match the rest of this
# notebook (which predicts '임신_성공_여부' from ../data/*.csv) — presumably
# carried over from another project; confirm before running.
label = 'critical_temp'
eval_metric = 'mae'
time_limit = 3600 * 5  # five hours, expressed in seconds

train_data = TabularDataset('/content/train.csv')
test_data = TabularDataset('/content/test.csv')

# Build the predictor first, then fit it with the best-quality preset under
# the time budget, requesting one GPU.
predictor = TabularPredictor(label=label, eval_metric=eval_metric)
predictor = predictor.fit(
    train_data,
    presets='best_quality',
    time_limit=time_limit,
    num_gpus=1,
)
predictor.leaderboard(silent=True)

.