## 난임 환자 대상 임신 성공 여부 예측

### LGAimers 6th 온라인 해커톤

Import

In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

### Data Load

In [2]:
# 데이터 로드
IVF_train = pd.read_csv('../data/IVF_train_dataset_31.csv')
IVF_test = pd.read_csv('../data/IVF_test_dataset_31.csv')

DI_train = pd.read_csv('../data/DI_train_dataset_31.csv')
DI_test = pd.read_csv('../data/DI_test_dataset_31.csv')

In [3]:
# ID 열을 제외한 특성과 타겟 변수 분리
IVF_X = IVF_train.drop(['임신_성공_여부', 'ID'], axis=1)
IVF_y = IVF_train['임신_성공_여부']

DI_X = DI_train.drop(['임신_성공_여부', 'ID'], axis=1)
DI_y = DI_train['임신_성공_여부']

In [4]:
print(f"IVF_X shape: {IVF_X.shape}")
print(f"IVF_test shape: {IVF_test.drop('ID', axis=1).shape}")
print(f"DI_X shape: {DI_X.shape}")
print(f"DI_test shape: {DI_test.drop('ID', axis=1).shape}")

IVF_X shape: (250052, 67)
IVF_test shape: (87891, 67)
DI_X shape: (6290, 22)
DI_test shape: (2176, 22)


### 인코딩 

In [5]:
IVF_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "특정_시술_유형",
    "배란_유도_유형",
    "난자_출처",
    "정자_출처",
    "난자_기증자_나이",
    "정자_기증자_나이"
]

In [6]:
DI_categorical_columns = [
    "시술_시기_코드",
    "시술_당시_나이",
    "특정_시술_유형",
    "정자_기증자_나이"
]

In [7]:
# 모든 범주형 변수를 문자열로 변환
IVF_X[IVF_categorical_columns] = IVF_X[IVF_categorical_columns].astype(str)
DI_X[DI_categorical_columns] = DI_X[DI_categorical_columns].astype(str)
IVF_test[IVF_categorical_columns] = IVF_test[IVF_categorical_columns].astype(str)
DI_test[DI_categorical_columns] = DI_test[DI_categorical_columns].astype(str)

# OrdinalEncoder를 사용하여 범주형 변수 인코딩
IVF_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
DI_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

IVF_X[IVF_categorical_columns] = IVF_encoder.fit_transform(IVF_X[IVF_categorical_columns])
DI_X[DI_categorical_columns] = DI_encoder.fit_transform(DI_X[DI_categorical_columns])
IVF_test[IVF_categorical_columns] = IVF_encoder.transform(IVF_test[IVF_categorical_columns])
DI_test[DI_categorical_columns] = DI_encoder.transform(DI_test[DI_categorical_columns])

## Modeling

In [8]:
import h2o
print(h2o.__version__)

3.46.0.6


In [None]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator, H2OGradientBoostingEstimator, H2ODeepLearningEstimator, H2OStackedEnsembleEstimator
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# H2O 클러스터 시작 (메모리 제한 설정)
h2o.init(max_mem_size="3G")  # 3GB 메모리로 시작

# IVF 데이터 로드 및 전처리
IVF_train = pd.read_csv('../data/IVF_train_dataset_31.csv')
IVF_test = pd.read_csv('../data/IVF_test_dataset_31.csv')

IVF_X = IVF_train.drop(['임신_성공_여부', 'ID'], axis=1)
IVF_y = IVF_train['임신_성공_여부']

h2o_IVF_train = h2o.H2OFrame(pd.concat([IVF_X, IVF_y], axis=1))
h2o_IVF_train['임신_성공_여부'] = h2o_IVF_train['임신_성공_여부'].asfactor()

# DI 데이터 로드 및 전처리
DI_train = pd.read_csv('../data/DI_train_dataset_31.csv')
DI_test = pd.read_csv('../data/DI_test_dataset_31.csv')

DI_X = DI_train.drop(['임신_성공_여부', 'ID'], axis=1)
DI_y = DI_train['임신_성공_여부']

h2o_DI_train = h2o.H2OFrame(pd.concat([DI_X, DI_y], axis=1))
h2o_DI_train['임신_성공_여부'] = h2o_DI_train['임신_성공_여부'].asfactor()

# 샘플링하여 데이터 크기를 줄임
sample_fraction = 0.5  # 50% 샘플링

# H2OFrame을 Pandas 데이터프레임으로 변환 후 샘플링
h2o_IVF_train_df = h2o_IVF_train.as_data_frame()
h2o_DI_train_df = h2o_DI_train.as_data_frame

# 모델 초기화
models = {}

# 랜덤 포레스트
rf_model = H2ORandomForestEstimator(max_runtime_secs=300)  # 최대 5분
rf_model.train(x=h2o_IVF_train.columns[:-1], y='임신_성공_여부', training_frame=h2o_IVF_train)
models['Random Forest'] = rf_model

# 그래디언트 부스팅
gbm_model = H2OGradientBoostingEstimator(max_runtime_secs=300)  # 최대 5분
gbm_model.train(x=h2o_IVF_train.columns[:-1], y='임신_성공_여부', training_frame=h2o_IVF_train)
models['Gradient Boosting'] = gbm_model

# 딥러닝
dl_model = H2ODeepLearningEstimator(max_runtime_secs=300)  # 최대 5분
dl_model.train(x=h2o_IVF_train.columns[:-1], y='임신_성공_여부', training_frame=h2o_IVF_train)
models['Deep Learning'] = dl_model

# 스택드 앙상블 모델을 위한 학습
base_models = [rf_model.model_id, gbm_model.model_id, dl_model.model_id]
ensemble_model = H2OStackedEnsembleEstimator(base_models=base_models, max_runtime_secs=600)  # 최대 10분
ensemble_model.train(x=h2o_IVF_train.columns[:-1], y='임신_성공_여부', training_frame=h2o_IVF_train)

# 예측
IVF_X_test = IVF_test.drop(['ID'], axis=1)  # ID 열 제외
DI_X_test = DI_test.drop(['ID'], axis=1)

# 예측 확률
IVF_pred_proba = ensemble_model.predict_proba(h2o.H2OFrame(IVF_X_test))[:, 1]  # Positive class 확률
DI_pred_proba = ensemble_model.predict_proba(h2o.H2OFrame(DI_X_test))[:, 1]  # Positive class 확률

# 예측 결과 병합
IVF_test['probability'] = IVF_pred_proba
DI_test['probability'] = DI_pred_proba

# 최종 제출 파일 생성
submission = pd.concat([IVF_test[['ID', 'probability']], DI_test[['ID', 'probability']]], axis=0)
submission = submission.sort_values(by='ID')

# 제출 파일 저장
submission.to_csv('../submission/code30_submit_lgbm.csv', index=False, encoding='utf-8')

# H2O 클러스터 종료
h2o.shutdown()


Checking whether there is an H2O instance running at http://localhost:54321.

 connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,15 mins 32 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,3 months and 13 days
H2O_cluster_name:,H2O_from_python_juneh_nf7jie
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.205 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%





drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
deeplearning Model Build progress: |█████████████████████████████████████████████| (done) 100%
stackedensemble Model Build progress: | (failed)


OSError: Job with key $03017f00000132d4ffffffff$_8632848be9b7bb86384c0457df149c0 failed with an exception: water.exceptions.H2OIllegalArgumentException: Base model does not use cross-validation: 0
stacktrace: 
water.exceptions.H2OIllegalArgumentException: Base model does not use cross-validation: 0
	at hex.ensemble.StackedEnsemble.checkAndInheritModelProperties(StackedEnsemble.java:483)
	at hex.ensemble.StackedEnsemble$StackedEnsembleDriver.computeImpl(StackedEnsemble.java:767)
	at hex.ModelBuilder$Driver.compute2(ModelBuilder.java:253)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1704)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:976)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1479)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)


.