### Import

In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OrdinalEncoder
from sklearn.ensemble import ExtraTreesClassifier

### Data Load

In [11]:
train = pd.read_csv('./train.csv').drop(columns=['ID'])
test = pd.read_csv('./test.csv').drop(columns=['ID'])

In [12]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

### Data Pre-processing

In [13]:
categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

In [14]:
# 카테고리형 컬럼들을 문자열로 변환
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)

In [15]:
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [16]:
numeric_columns = [
    "임신 시도 또는 마지막 임신 경과 연수",
    "총 생성 배아 수",
    "미세주입된 난자 수",
    "미세주입에서 생성된 배아 수",
    "이식된 배아 수",
    "미세주입 배아 이식 수",
    "저장된 배아 수",
    "미세주입 후 저장된 배아 수",
    "해동된 배아 수",
    "해동 난자 수",
    "수집된 신선 난자 수",
    "저장된 신선 난자 수",
    "혼합된 난자 수",
    "파트너 정자와 혼합된 난자 수",
    "기증자 정자와 혼합된 난자 수",
    "난자 채취 경과일",
    "난자 해동 경과일",
    "난자 혼합 경과일",
    "배아 이식 경과일",
    "배아 해동 경과일"
]

In [17]:
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# 1️⃣ 결측 여부 Feature 추가
for col in numeric_columns:
    X_train_encoded[col + '_missing'] = X_train_encoded[col].isna().astype(int)
    X_test_encoded[col + '_missing'] = X_test_encoded[col].isna().astype(int)

# zero_imputer = SimpleImputer(strategy="constant", fill_value=0)
# X_train_encoded[numeric_columns] = zero_imputer.fit_transform(X_train_encoded[numeric_columns])
# X_test_encoded[numeric_columns] = zero_imputer.transform(X_test_encoded[numeric_columns])

from numpy import log1p
# 🔹 로그 변환 적용 (Skewed Data Handling)
skewed_cols = ['총 생성 배아 수', '수집된 신선 난자 수', '저장된 배아 수', '미세주입된 난자 수']

for col in skewed_cols:
    X_train_encoded[col + '_log'] = log1p(X_train_encoded[col])
    X_test_encoded[col + '_log'] = log1p(X_test_encoded[col])

# 3️⃣ 80% 이상 결측치가 있는 컬럼 제거
missing_ratio = X_train_encoded.isnull().mean()
high_missing_columns = missing_ratio[missing_ratio > 0.8].index.tolist()
X_train_encoded.drop(columns=high_missing_columns, inplace=True)
X_test_encoded.drop(columns=high_missing_columns, inplace=True)

# 4️⃣ Feature Engineering (특성 추가)
X_train_encoded['배아_생성_효율'] = X_train_encoded['총 생성 배아 수'] / (X_train_encoded['수집된 신선 난자 수'] + 1)
X_test_encoded['배아_생성_효율'] = X_test_encoded['총 생성 배아 수'] / (X_test_encoded['수집된 신선 난자 수'] + 1)

X_train_encoded['배아_저장_비율'] = X_train_encoded['저장된 배아 수'] / (X_train_encoded['총 생성 배아 수'] + 1)
X_test_encoded['배아_저장_비율'] = X_test_encoded['저장된 배아 수'] / (X_test_encoded['총 생성 배아 수'] + 1)

X_train_encoded['난자_배아_비율'] = X_train_encoded['미세주입된 난자 수'] / (X_train_encoded['총 생성 배아 수'] + 1)
X_test_encoded['난자_배아_비율'] = X_test_encoded['미세주입된 난자 수'] / (X_test_encoded['총 생성 배아 수'] + 1)


# 5️⃣ 이상치 처리 (Clip 적용)
for col in ['총 생성 배아 수', '수집된 신선 난자 수', '저장된 배아 수']:
    X_train_encoded[col] = X_train_encoded[col].clip(lower=1, upper=50)
    X_test_encoded[col] = X_test_encoded[col].clip(lower=1, upper=50)

# 6️⃣ Feature Scaling 제거 (트리 모델은 불필요)

import seaborn as sns
import matplotlib.pyplot as plt

# 상관 행렬 계산
corr_matrix = X_train_encoded.corr()

# 높은 상관관계(절대값 0.9 이상)를 가지는 변수 찾기
high_corr_features = set()
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > 0.9:
            colname = corr_matrix.columns[i]
            high_corr_features.add(colname)

print("높은 상관관계를 가지는 Feature:", high_corr_features)

# 제거
X_train_encoded.drop(columns=high_corr_features, inplace=True)
X_test_encoded.drop(columns=high_corr_features, inplace=True)


높은 상관관계를 가지는 Feature: {'배아 해동 경과일_missing', '미세주입된 난자 수_log', '해동된 배아 수_missing', '파트너 정자와 혼합된 난자 수_missing', 'IVF 출산 횟수', '파트너 정자와 혼합된 난자 수', '미세주입에서 생성된 배아 수_missing', '총 생성 배아 수_missing', '배란 유도 유형', '난자 채취 경과일_missing', '미세주입에서 생성된 배아 수', '저장된 신선 난자 수_missing', '착상 전 유전 진단 사용 여부', '기증자 정자와 혼합된 난자 수_missing', '미세주입 후 저장된 배아 수_missing', '미세주입 배아 이식 수_missing', '기증 배아 사용 여부', '혼합된 난자 수_missing', '수집된 신선 난자 수_missing', '미세주입된 난자 수_missing', '해동 난자 수_missing', 'IVF 임신 횟수', '저장된 배아 수_missing', 'IVF 시술 횟수', '대리모 여부', '이식된 배아 수_missing', '부부 주 불임 원인'}


### 검증

In [18]:
import numpy as np
import pandas as pd
import catboost as cb
from sklearn.model_selection import StratifiedKFold
import optuna
from sklearn.metrics import roc_auc_score 
from tqdm import tqdm 

# ✅ Optuna 진행 상황을 추적하는 tqdm 콜백 함수
class TQDMCallback:
    def __init__(self, total):
        self.pbar = tqdm(total=total, desc="Optuna Hyperparameter Tuning", position=0, leave=True)

    def __call__(self, study, trial):
        self.pbar.update(1)

    def close(self):
        self.pbar.close()

# ✅ 하이퍼파라미터 튜닝 (Optuna)
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': 0
    }
    
    model = cb.CatBoostClassifier(**params)
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    
    for train_idx, val_idx in cv.split(X_train_encoded, y):
        X_train_fold, X_val_fold = X_train_encoded.iloc[train_idx], X_train_encoded.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=50, verbose=0)
        y_val_pred = model.predict_proba(X_val_fold)[:, 1]
        
        auc_score = roc_auc_score(y_val_fold, y_val_pred)
        cv_scores.append(auc_score)
    
    return np.mean(cv_scores)

# ✅ Optuna 최적화 실행 (시도 횟수: 50번, tqdm 추가)
n_trials = 50
tqdm_callback = TQDMCallback(total=n_trials)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials, callbacks=[tqdm_callback])
tqdm_callback.close()

# ✅ 최적 하이퍼파라미터 출력
best_params = study.best_params
print(f"✅ Optuna 최적화 완료! 최적 하이퍼파라미터: {best_params}")

# ✅ 최적 하이퍼파라미터 적용
model = cb.CatBoostClassifier(**best_params)

# 교차 검증 설정 (5-Fold)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 진행도 표시를 위한 tqdm 적용
cv_scores = []
for fold, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_train_encoded, y), desc="Cross Validation Progress", total=5)):
    X_train_fold, X_val_fold = X_train_encoded.iloc[train_idx], X_train_encoded.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    
    model.fit(X_train_fold, y_train_fold)  # 모델 학습
    y_val_pred = model.predict_proba(X_val_fold)[:, 1]  # ROC-AUC 계산을 위해 확률값 추출
    
    auc_score = roc_auc_score(y_val_fold, y_val_pred)
    cv_scores.append(auc_score)
    
    print(f"Fold {fold+1}: ROC-AUC = {auc_score:.4f}")

# 최종 결과 출력
print(f"\n✅ 5-Fold ROC-AUC 점수 평균: {np.mean(cv_scores):.4f}")
print(f"각 Fold 점수: {cv_scores}")

Optuna Hyperparameter Tuning:   0%|          | 0/50 [00:00<?, ?it/s][I 2025-02-02 22:45:00,544] A new study created in memory with name: no-name-e6a2ccf4-5213-4eb1-820f-8d4fba24a8fb
[I 2025-02-02 22:45:27,630] Trial 0 finished with value: 0.7397835838934037 and parameters: {'iterations': 492, 'depth': 6, 'learning_rate': 0.03503336506044769, 'l2_leaf_reg': 9.97717158625524, 'border_count': 94}. Best is trial 0 with value: 0.7397835838934037.
Optuna Hyperparameter Tuning:   2%|▏         | 1/50 [00:27<22:07, 27.09s/it][I 2025-02-02 22:46:18,319] Trial 1 finished with value: 0.7389211276299619 and parameters: {'iterations': 311, 'depth': 11, 'learning_rate': 0.04215702879927394, 'l2_leaf_reg': 5.121179524325639, 'border_count': 174}. Best is trial 0 with value: 0.7397835838934037.
Optuna Hyperparameter Tuning:   4%|▍         | 2/50 [01:17<32:46, 40.97s/it][I 2025-02-02 22:46:33,615] Trial 2 finished with value: 0.7398294537794093 and parameters: {'iterations': 519, 'depth': 5, 'learning_r

✅ Optuna 최적화 완료! 최적 하이퍼파라미터: {'iterations': 651, 'depth': 6, 'learning_rate': 0.06052823197516312, 'l2_leaf_reg': 8.68043996322241, 'border_count': 60}


Cross Validation Progress:   0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.6597497	total: 9.68ms	remaining: 6.29s
1:	learn: 0.6352558	total: 19.1ms	remaining: 6.2s
2:	learn: 0.6121901	total: 29.2ms	remaining: 6.3s
3:	learn: 0.5947730	total: 38.9ms	remaining: 6.29s
4:	learn: 0.5834538	total: 49.1ms	remaining: 6.34s
5:	learn: 0.5708467	total: 58.5ms	remaining: 6.29s
6:	learn: 0.5604841	total: 68.3ms	remaining: 6.28s
7:	learn: 0.5537174	total: 77.4ms	remaining: 6.22s
8:	learn: 0.5451461	total: 86.8ms	remaining: 6.19s
9:	learn: 0.5402421	total: 96.6ms	remaining: 6.19s
10:	learn: 0.5363418	total: 106ms	remaining: 6.16s
11:	learn: 0.5305442	total: 116ms	remaining: 6.17s
12:	learn: 0.5256571	total: 126ms	remaining: 6.16s
13:	learn: 0.5233608	total: 136ms	remaining: 6.19s
14:	learn: 0.5207267	total: 146ms	remaining: 6.18s
15:	learn: 0.5173400	total: 156ms	remaining: 6.19s
16:	learn: 0.5142972	total: 166ms	remaining: 6.2s
17:	learn: 0.5120739	total: 178ms	remaining: 6.24s
18:	learn: 0.5102722	total: 188ms	remaining: 6.25s
19:	learn: 0.5088347	total: 198ms	

Cross Validation Progress:  20%|██        | 1/5 [00:06<00:25,  6.29s/it]

643:	learn: 0.4788051	total: 6.05s	remaining: 65.7ms
644:	learn: 0.4788050	total: 6.05s	remaining: 56.3ms
645:	learn: 0.4788014	total: 6.06s	remaining: 46.9ms
646:	learn: 0.4787902	total: 6.07s	remaining: 37.5ms
647:	learn: 0.4787892	total: 6.08s	remaining: 28.2ms
648:	learn: 0.4787747	total: 6.09s	remaining: 18.8ms
649:	learn: 0.4787584	total: 6.1s	remaining: 9.38ms
650:	learn: 0.4787364	total: 6.11s	remaining: 0us
Fold 1: ROC-AUC = 0.7379
0:	learn: 0.6596842	total: 9.69ms	remaining: 6.3s
1:	learn: 0.6350651	total: 21.6ms	remaining: 7s
2:	learn: 0.6133397	total: 38.1ms	remaining: 8.22s
3:	learn: 0.5946361	total: 48.3ms	remaining: 7.81s
4:	learn: 0.5831942	total: 58.3ms	remaining: 7.54s
5:	learn: 0.5707644	total: 68.2ms	remaining: 7.34s
6:	learn: 0.5605193	total: 78.2ms	remaining: 7.2s
7:	learn: 0.5537366	total: 88.1ms	remaining: 7.08s
8:	learn: 0.5452189	total: 98.9ms	remaining: 7.05s
9:	learn: 0.5404113	total: 109ms	remaining: 6.96s
10:	learn: 0.5354192	total: 119ms	remaining: 6.91s


Cross Validation Progress:  40%|████      | 2/5 [00:12<00:18,  6.25s/it]

638:	learn: 0.4798878	total: 5.96s	remaining: 112ms
639:	learn: 0.4798876	total: 5.97s	remaining: 103ms
640:	learn: 0.4798871	total: 5.98s	remaining: 93.3ms
641:	learn: 0.4798853	total: 5.99s	remaining: 84ms
642:	learn: 0.4798668	total: 6s	remaining: 74.7ms
643:	learn: 0.4798665	total: 6.01s	remaining: 65.3ms
644:	learn: 0.4798429	total: 6.02s	remaining: 56ms
645:	learn: 0.4798324	total: 6.03s	remaining: 46.7ms
646:	learn: 0.4798321	total: 6.04s	remaining: 37.3ms
647:	learn: 0.4798301	total: 6.04s	remaining: 28ms
648:	learn: 0.4798180	total: 6.05s	remaining: 18.7ms
649:	learn: 0.4798179	total: 6.06s	remaining: 9.33ms
650:	learn: 0.4798073	total: 6.07s	remaining: 0us
Fold 2: ROC-AUC = 0.7424
0:	learn: 0.6593440	total: 10ms	remaining: 6.52s
1:	learn: 0.6351739	total: 20.2ms	remaining: 6.54s
2:	learn: 0.6121241	total: 29.9ms	remaining: 6.45s
3:	learn: 0.5934513	total: 39.4ms	remaining: 6.37s
4:	learn: 0.5804534	total: 49.7ms	remaining: 6.42s
5:	learn: 0.5664662	total: 59.9ms	remaining: 6.

Cross Validation Progress:  60%|██████    | 3/5 [00:18<00:12,  6.25s/it]

635:	learn: 0.4796701	total: 5.94s	remaining: 140ms
636:	learn: 0.4796587	total: 5.95s	remaining: 131ms
637:	learn: 0.4796475	total: 5.96s	remaining: 121ms
638:	learn: 0.4796381	total: 5.97s	remaining: 112ms
639:	learn: 0.4796244	total: 5.98s	remaining: 103ms
640:	learn: 0.4796143	total: 5.99s	remaining: 93.5ms
641:	learn: 0.4796104	total: 6s	remaining: 84.1ms
642:	learn: 0.4796047	total: 6.01s	remaining: 74.8ms
643:	learn: 0.4795892	total: 6.02s	remaining: 65.4ms
644:	learn: 0.4795758	total: 6.03s	remaining: 56.1ms
645:	learn: 0.4795577	total: 6.04s	remaining: 46.7ms
646:	learn: 0.4795402	total: 6.05s	remaining: 37.4ms
647:	learn: 0.4795316	total: 6.05s	remaining: 28ms
648:	learn: 0.4795175	total: 6.07s	remaining: 18.7ms
649:	learn: 0.4795072	total: 6.08s	remaining: 9.35ms
650:	learn: 0.4795000	total: 6.08s	remaining: 0us
Fold 3: ROC-AUC = 0.7398
0:	learn: 0.6595171	total: 10.1ms	remaining: 6.59s
1:	learn: 0.6348934	total: 20.5ms	remaining: 6.64s
2:	learn: 0.6113155	total: 30.2ms	rema

Cross Validation Progress:  80%|████████  | 4/5 [00:24<00:06,  6.20s/it]

649:	learn: 0.4790955	total: 5.97s	remaining: 9.18ms
650:	learn: 0.4790886	total: 5.98s	remaining: 0us
Fold 4: ROC-AUC = 0.7382
0:	learn: 0.6596972	total: 9.72ms	remaining: 6.32s
1:	learn: 0.6348528	total: 18.9ms	remaining: 6.14s
2:	learn: 0.6119196	total: 28.9ms	remaining: 6.24s
3:	learn: 0.5944883	total: 38.9ms	remaining: 6.3s
4:	learn: 0.5832478	total: 48.3ms	remaining: 6.24s
5:	learn: 0.5669994	total: 57.7ms	remaining: 6.21s
6:	learn: 0.5595220	total: 66.9ms	remaining: 6.15s
7:	learn: 0.5490670	total: 76.6ms	remaining: 6.16s
8:	learn: 0.5433483	total: 86.1ms	remaining: 6.14s
9:	learn: 0.5387454	total: 96.5ms	remaining: 6.19s
10:	learn: 0.5343983	total: 107ms	remaining: 6.23s
11:	learn: 0.5312607	total: 117ms	remaining: 6.2s
12:	learn: 0.5278021	total: 127ms	remaining: 6.22s
13:	learn: 0.5237237	total: 137ms	remaining: 6.24s
14:	learn: 0.5212620	total: 148ms	remaining: 6.26s
15:	learn: 0.5164142	total: 158ms	remaining: 6.28s
16:	learn: 0.5142198	total: 168ms	remaining: 6.28s
17:	lea

Cross Validation Progress: 100%|██████████| 5/5 [00:31<00:00,  6.21s/it]

632:	learn: 0.4794448	total: 5.82s	remaining: 166ms
633:	learn: 0.4794405	total: 5.83s	remaining: 156ms
634:	learn: 0.4794315	total: 5.84s	remaining: 147ms
635:	learn: 0.4794169	total: 5.85s	remaining: 138ms
636:	learn: 0.4794098	total: 5.86s	remaining: 129ms
637:	learn: 0.4793966	total: 5.87s	remaining: 120ms
638:	learn: 0.4793940	total: 5.88s	remaining: 110ms
639:	learn: 0.4793819	total: 5.88s	remaining: 101ms
640:	learn: 0.4793737	total: 5.89s	remaining: 91.9ms
641:	learn: 0.4793582	total: 5.9s	remaining: 82.7ms
642:	learn: 0.4793428	total: 5.91s	remaining: 73.5ms
643:	learn: 0.4793271	total: 5.92s	remaining: 64.4ms
644:	learn: 0.4793045	total: 5.93s	remaining: 55.2ms
645:	learn: 0.4792872	total: 5.94s	remaining: 46ms
646:	learn: 0.4792868	total: 5.95s	remaining: 36.8ms
647:	learn: 0.4792706	total: 5.96s	remaining: 27.6ms
648:	learn: 0.4792581	total: 5.96s	remaining: 18.4ms
649:	learn: 0.4792485	total: 5.97s	remaining: 9.19ms
650:	learn: 0.4792358	total: 5.98s	remaining: 0us
Fold 5:




### Train

In [19]:
# model = ExtraTreesClassifier(random_state=42)

model.fit(X_train_encoded, y)

0:	learn: 0.6596650	total: 11.7ms	remaining: 7.61s
1:	learn: 0.6351430	total: 22.8ms	remaining: 7.4s
2:	learn: 0.6122352	total: 34.2ms	remaining: 7.4s
3:	learn: 0.5936365	total: 45.5ms	remaining: 7.36s
4:	learn: 0.5815728	total: 56.9ms	remaining: 7.35s
5:	learn: 0.5692650	total: 67.9ms	remaining: 7.3s
6:	learn: 0.5591170	total: 79.7ms	remaining: 7.33s
7:	learn: 0.5525043	total: 90.5ms	remaining: 7.27s
8:	learn: 0.5439299	total: 101ms	remaining: 7.24s
9:	learn: 0.5392544	total: 113ms	remaining: 7.23s
10:	learn: 0.5353759	total: 123ms	remaining: 7.17s
11:	learn: 0.5297841	total: 134ms	remaining: 7.14s
12:	learn: 0.5266930	total: 145ms	remaining: 7.13s
13:	learn: 0.5242133	total: 156ms	remaining: 7.1s
14:	learn: 0.5211993	total: 175ms	remaining: 7.43s
15:	learn: 0.5188038	total: 196ms	remaining: 7.8s
16:	learn: 0.5164625	total: 209ms	remaining: 7.78s
17:	learn: 0.5141212	total: 220ms	remaining: 7.75s
18:	learn: 0.5122434	total: 237ms	remaining: 7.89s
19:	learn: 0.5106871	total: 249ms	rema

<catboost.core.CatBoostClassifier at 0x16f391400>

### Predict

In [20]:
pred_proba = model.predict_proba(X_test_encoded)[:, 1]

### Submission

In [21]:
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission['probability'] = pred_proba

In [22]:
sample_submission.to_csv('./baseline_submit.csv', index=False)