In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import  OrdinalEncoder
from sklearn.linear_model import *
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, log_loss, r2_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, early_stopping
from lightgbm import LGBMRegressor, early_stopping
from sklearn.model_selection import train_test_split
import optuna

In [2]:
# Read both competition tables and discard the 'ID' column from each of them.
train, test = (
    pd.read_csv(csv_path).drop(columns=['ID'])
    for csv_path in ('./train.csv', './test.csv')
)

## 결측치 최빈값 대체

In [3]:
# Impute missing values with each column's most frequent value.
# The fill values are learned from the training table only and then reused for
# the test table, so a given column is filled with the same value in both
# (previously the test table was filled with its own modes).
mode_by_column = {}
for column_name in train.columns:
    column_modes = train[column_name].mode()
    # mode() is empty for an all-NaN column; leave such a column untouched
    # instead of failing on the first-element lookup.
    if not column_modes.empty:
        mode_by_column[column_name] = column_modes.iloc[0]

# fillna(dict) fills per column label; labels absent from a frame (the target
# column, for the test table) are simply ignored.
train = train.fillna(mode_by_column)
test = test.fillna(mode_by_column)

In [4]:
# Separate the target column from the feature columns.
y = train['임신 성공 여부']
X = train.drop(columns=['임신 성공 여부'])

In [5]:
# Names of the feature columns that this notebook treats as categorical.
# Later cells cast these columns to str, ordinal-encode them, and pass this
# list as CatBoost `cat_features` / LightGBM `categorical_feature`.
# NOTE(review): several entries look like count or age columns by name; they
# are handled as categories here as well — confirm that this is intended.
categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

In [6]:
# Cast every categorical column to str, in the feature frame and in the test frame.
for frame in (X, test):
    for col in categorical_columns:
        frame[col] = frame[col].astype(str)

In [7]:
# Fit a single OrdinalEncoder on the training features; a category that was not
# seen while fitting is encoded as -1 at transform time.
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
ordinal_encoder.fit(X[categorical_columns])

# Work on copies so the un-encoded X / test frames stay available.
X_encoded = X.copy()
test_encoded = test.copy()
X_encoded[categorical_columns] = ordinal_encoder.transform(X[categorical_columns])
test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [7]:
# Stratified 80/20 hold-out split of the un-encoded feature frame X.
# NOTE(review): the following cell assigns the same four names from X_encoded,
# so this split only stays in effect if that cell is not run afterwards.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# Stratified 80/20 hold-out split of the ordinal-encoded feature frame.
# NOTE(review): this reuses the names assigned by the preceding split of the
# un-encoded X and therefore replaces it when both cells are run in order.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## catboost test

In [8]:
def objective(trial):
    """Optuna objective for a CatBoostClassifier (two objectives).

    Samples one hyperparameter set, fits CatBoost on the notebook-level
    X_train / y_train with early stopping monitored on X_valid / y_valid, and
    scores the fitted model on that same validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Tuple (roc_auc, log_loss) computed on the validation split.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform range.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True),
        'iterations': trial.suggest_int('iterations', 100, 500)
    }

    # NOTE(review): cat_features names the columns of `categorical_columns`;
    # CatBoost presumably needs them as str/int values, i.e. the un-encoded
    # split — confirm which split cell produced X_train before running.
    model = CatBoostClassifier(
        **params,
        loss_function='Logloss',
        verbose=0,
        cat_features=categorical_columns,
        random_state=42
    )

    model.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=20
    )

    # Probability of the positive class for every validation row.
    pred_probas = model.predict_proba(X_valid)[:, 1]

    auc = roc_auc_score(y_valid, pred_probas)
    log = log_loss(y_valid, pred_probas)
    return auc, log


In [9]:
# Two-objective search: maximize ROC-AUC and minimize log loss.
# NSGA-II is the sampler Optuna uses by default for multi-objective studies; it
# is passed explicitly only to fix its seed, so a re-run proposes the same trials.
study = optuna.create_study(
    directions=["maximize", "minimize"],
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
study.optimize(objective, n_trials=50)

# study.best_trials holds the Pareto-optimal trials of the study.
print("Pareto front (ROC-AUC) 최적의 트라이얼들:")
for trial in study.best_trials:
    print("Trial values: {} | Params: {}".format(trial.values, trial.params))

[I 2025-02-06 11:50:01,310] A new study created in memory with name: no-name-419a7257-70d5-4fb8-a6aa-45a3043a0211
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-2, 10.0),
[I 2025-02-06 11:51:05,787] Trial 0 finished with values: [0.7363435740788555, 0.4899792742848752] and parameters: {'learning_rate': 0.07056708423093674, 'depth': 10, 'l2_leaf_reg': 2.477038980703661, 'iterations': 441}.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-2, 10.0),
[I 2025-02-06 11:52:12,495] Trial 1 finished with values: [0.7357933249768229, 0.4906227724863067] and parameters: {'learning_rate': 0.01755736124271166, 'depth': 8, 'l2_leaf_reg': 0.2619486206566269, 'iterations': 364}.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-2, 10.0),
[I 2025-02-06 1

Pareto front (ROC-AUC) 최적의 트라이얼들:
Trial values: [0.73681771917698, 0.48971481194801153] | Params: {'learning_rate': 0.06466646841320191, 'depth': 6, 'l2_leaf_reg': 3.811996666965158, 'iterations': 355}
Trial values: [0.7368131200189645, 0.4896964762884252] | Params: {'learning_rate': 0.06993256169940955, 'depth': 7, 'l2_leaf_reg': 0.8938167658096664, 'iterations': 230}


In [11]:
study.best_trials[0]
# NOTE(review): the line below appears to be a log line pasted from an earlier,
# different run: it lists keys (bagging_temperature, random_strength,
# bootstrap_type) that the CatBoost objective in this notebook does not sample.
# It is kept for reference only and does not describe this study.
# parameters: {'learning_rate': 0.19603157179850195, 'depth': 6, 'l2_leaf_reg': 3.1113757733956238, 'iterations': 402, 'bagging_temperature': 0.056808024089444596, 'random_strength': 3.484431922663257, 'bootstrap_type': 'Bayesian'}. Best is trial 23 with value: 0.4863828367708195.


FrozenTrial(number=29, state=TrialState.COMPLETE, values=[0.73681771917698, 0.48971481194801153], datetime_start=datetime.datetime(2025, 2, 6, 12, 8, 7, 940699), datetime_complete=datetime.datetime(2025, 2, 6, 12, 9, 5, 145463), params={'learning_rate': 0.06466646841320191, 'depth': 6, 'l2_leaf_reg': 3.811996666965158, 'iterations': 355}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=True, low=0.01, step=None), 'depth': IntDistribution(high=10, log=False, low=3, step=1), 'l2_leaf_reg': FloatDistribution(high=10.0, log=True, low=0.01, step=None), 'iterations': IntDistribution(high=500, log=False, low=100, step=1)}, trial_id=29, value=None)

In [12]:
import pickle

# Persist the hyperparameters of the first Pareto-front trial for later reuse.
best_params = study.best_trials[0].params
best_params_path = 'best_params_cat.pkl'

with open(best_params_path, 'wb') as f:
    pickle.dump(best_params, f)

# Report the file that was actually written; the message previously named
# 'best_params.pkl', which this cell never creates.
print(f"최적의 파라미터가 {best_params_path} 파일에 저장되었습니다.")

최적의 파라미터가 best_params.pkl 파일에 저장되었습니다.


In [13]:
# Refit CatBoost with the hyperparameters of the first trial on the Pareto front.
best_params = study.best_trials[0].params

# To enter the parameters by hand instead, uncomment and edit the line below.
# best_params = {'learning_rate': 0.19603157179850195, 'depth': 6, 'l2_leaf_reg': 3.1113757733956238, 'iterations': 402, 'bagging_temperature': 0.056808024089444596, 'random_strength': 3.484431922663257, 'bootstrap_type': 'Bayesian'}

final_model = CatBoostClassifier(
    **best_params,
    loss_function='Logloss',
    verbose=0,
    cat_features=categorical_columns,
    random_state=42,
)
# eval_set supplies the data that is monitored for early stopping.
final_model.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=20,
)


<catboost.core.CatBoostClassifier at 0xfffef8018100>

In [None]:
# Positive-class probabilities for the un-encoded test frame, shown in the
# sample-submission layout.
test_class_probas = final_model.predict_proba(test)
pred_probas = test_class_probas[:, 1]
print("예측 확률:", pred_probas)
sample_submission = pd.read_csv('./sample_submission.csv').assign(probability=pred_probas)
display(sample_submission)

예측 확률: [0.00397512 0.0023746  0.15268218 ... 0.45990877 0.17314557 0.00429085]


Unnamed: 0,ID,probability
0,TEST_00000,0.003975
1,TEST_00001,0.002375
2,TEST_00002,0.152682
3,TEST_00003,0.108864
4,TEST_00004,0.506210
...,...,...
90062,TEST_90062,0.003419
90063,TEST_90063,0.329054
90064,TEST_90064,0.459909
90065,TEST_90065,0.173146


: 

In [13]:
# Predict positive-class probabilities for the un-encoded test frame and write
# them out in the sample-submission layout.
test_class_probas = final_model.predict_proba(test)
pred_probas = test_class_probas[:, 1]
print("예측 확률:", pred_probas)
sample_submission = pd.read_csv('./sample_submission.csv').assign(probability=pred_probas)
sample_submission.to_csv('./submit/submit_17.csv', index=False)

예측 확률: [0.00170459 0.00073631 0.14700498 ... 0.25933033 0.15254075 0.00196207]


## lightGBM test

In [9]:
def objective(trial):
    """Optuna objective for an LGBMClassifier (two objectives).

    Samples one hyperparameter set, fits LightGBM on the notebook-level
    X_train / y_train with early stopping (50 rounds) evaluated on
    X_valid / y_valid, and scores the fitted model on that validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Tuple (roc_auc, log_loss) computed on the validation split.
    """
    # suggest_float replaces the deprecated suggest_loguniform (log=True) and
    # suggest_uniform (default linear scale); the sampled ranges are unchanged.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM presumably applies `subsample` only when
        # `subsample_freq` > 0, which is not set here — confirm against the
        # LightGBM parameter docs whether this value has any effect.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        # Optional: regularisation parameters such as reg_lambda / reg_alpha
        # could be sampled here as well.
    }

    model = LGBMClassifier(
        **params,
        random_state=42,
        n_jobs=-1
    )

    # Fit with early stopping evaluated on the validation split.
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[early_stopping(stopping_rounds=50)]
    )

    # Probability of the positive class for every validation row.
    pred_probas = model.predict_proba(X_valid)[:, 1]

    auc = roc_auc_score(y_valid, pred_probas)
    loss = log_loss(y_valid, pred_probas)

    # First value (ROC-AUC) is maximized, second (log loss) minimized by the study.
    return auc, loss

In [None]:
# 3. Create the multi-objective Optuna study and run the search.
# NSGA-II is the sampler Optuna uses by default for multi-objective studies; it
# is passed explicitly only to fix its seed, so a re-run proposes the same trials.
study = optuna.create_study(
    directions=["maximize", "minimize"],
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
study.optimize(objective, n_trials=50)

# 4. Print the Pareto front (the list of non-dominated trials).
print("Pareto Front (ROC-AUC, Log Loss) 최적의 트라이얼들:")
for trial in study.best_trials:
    print("Trial values (ROC-AUC, Log Loss): {} | Params: {}".format(trial.values, trial.params))

In [15]:
import pickle

# Take the hyperparameters of the first Pareto-front trial.
best_params = study.best_trials[0].params
best_params_path = 'best_params_lightgbm.pkl'

# Persist them as a pickle file for later reuse.
with open(best_params_path, 'wb') as f:
    pickle.dump(best_params, f)

# Report the file that was actually written; the message previously named
# 'best_params.pkl', which this cell never creates.
print(f"최적의 파라미터가 {best_params_path} 파일에 저장되었습니다.")

최적의 파라미터가 best_params.pkl 파일에 저장되었습니다.


In [12]:
# 5. Build the final model from the first Pareto-front trial's parameters
#    (it could also be refit on the full data).
best_params = study.best_trials[0].params

final_model = LGBMClassifier(**best_params, random_state=42, n_jobs=-1)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=50)],
)

# Score the final model on the validation split (example evaluation).
pred_probas_final = final_model.predict_proba(X_valid)[:, 1]
final_auc = roc_auc_score(y_valid, pred_probas_final)
final_loss = log_loss(y_valid, pred_probas_final)

print("\n최종 모델 평가:")
for report_template, metric_value in (
    ("ROC-AUC: {:.4f}", final_auc),
    ("Log Loss: {:.4f}", final_loss),
):
    print(report_template.format(metric_value))

[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 702
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 59
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[253]	valid_0's auc: 0.739286	valid_0's binary_logloss: 0.485829

최종 모델 평가:
ROC-AUC: 0.7393
Log Loss: 0.4858


In [13]:
# Positive-class probabilities for the ordinal-encoded test frame.
test_class_probas = final_model.predict_proba(test_encoded)
pred_probas = test_class_probas[:, 1]
print("예측 확률:", pred_probas)

예측 확률: [0.00133422 0.000832   0.14547465 ... 0.42629758 0.17508381 0.00137922]


In [14]:
# Put the predictions into the sample-submission layout and write the file.
sample_submission = pd.read_csv('./sample_submission.csv').assign(probability=pred_probas)
sample_submission.to_csv('./submit/submit_10.csv', index=False)

## gradient boosting test

In [16]:
def objective(trial):
    """Optuna objective for a GradientBoostingClassifier (two objectives).

    Samples one hyperparameter set, fits the classifier on the notebook-level
    X_train / y_train, and scores it on X_valid / y_valid.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Tuple (roc_auc, log_loss) computed on the validation split.
    """
    # suggest_float replaces the deprecated suggest_loguniform (log=True) and
    # suggest_uniform (default linear scale); the sampled ranges are unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20)
    }

    model = GradientBoostingClassifier(**params, random_state=42)
    model.fit(X_train, y_train)

    # Probability of the positive class for every validation row.
    pred_probas = model.predict_proba(X_valid)[:, 1]

    auc = roc_auc_score(y_valid, pred_probas)
    loss = log_loss(y_valid, pred_probas)

    # First value (ROC-AUC) is maximized, second (log loss) minimized by the study.
    return auc, loss

In [17]:
# 3. Create the multi-objective Optuna study and run the search.
# NSGA-II is the sampler Optuna uses by default for multi-objective studies; it
# is passed explicitly only to fix its seed, so a re-run proposes the same trials.
study = optuna.create_study(
    directions=["maximize", "minimize"],
    sampler=optuna.samplers.NSGAIISampler(seed=42),
)
study.optimize(objective, n_trials=50)

# 4. Print the Pareto front (the list of non-dominated trials).
print("Pareto Front (ROC-AUC, Log Loss) 최적의 트라이얼들:")
for trial in study.best_trials:
    print("Trial values (ROC-AUC, Log Loss): {} | Params: {}".format(trial.values, trial.params))

[I 2025-02-05 05:05:40,969] A new study created in memory with name: no-name-01c75b3a-8865-4536-b2c7-6d0e143ea7bf
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
[I 2025-02-05 05:06:01,439] Trial 0 finished with values: [0.7378990630293993, 0.48705958360291896] and parameters: {'n_estimators': 101, 'learning_rate': 0.16569422918719784, 'max_depth': 3, 'subsample': 0.8388700939952699, 'min_samples_split': 15, 'min_samples_leaf': 18}.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
[I 2025-02-05 05:08:55,980] Trial 1 finished with values: [0.7378535100082733, 0.4867561439659727] and parameters: {'n_estimators': 443, 'learning_rate': 0.050171133927288, 'max_depth': 6, 'subsample': 0.8753963978433514, 'min_samples_split': 9, 'min_samples_leaf': 18}.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),

Pareto Front (ROC-AUC, Log Loss) 최적의 트라이얼들:
Trial values (ROC-AUC, Log Loss): [0.7389805977643669, 0.4862722601745465] | Params: {'n_estimators': 290, 'learning_rate': 0.0709328082138412, 'max_depth': 4, 'subsample': 0.821858633651255, 'min_samples_split': 5, 'min_samples_leaf': 13}
Trial values (ROC-AUC, Log Loss): [0.7391631453849794, 0.48669393874453504] | Params: {'n_estimators': 397, 'learning_rate': 0.012632650612676068, 'max_depth': 7, 'subsample': 0.6506781530488497, 'min_samples_split': 12, 'min_samples_leaf': 17}
Trial values (ROC-AUC, Log Loss): [0.7390804137647538, 0.48639097342043897] | Params: {'n_estimators': 625, 'learning_rate': 0.017158810785078856, 'max_depth': 6, 'subsample': 0.5342409167839189, 'min_samples_split': 10, 'min_samples_leaf': 4}


In [19]:
# 5. Build the final model from the first Pareto-front trial's parameters
#    (it could also be refit on the full data).
best_params = study.best_trials[0].params

final_model = GradientBoostingClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)

# Score the final model on the validation split (example evaluation).
pred_probas_final = final_model.predict_proba(X_valid)[:, 1]
final_auc = roc_auc_score(y_valid, pred_probas_final)
final_loss = log_loss(y_valid, pred_probas_final)

print("\n최종 모델 평가:")
for report_template, metric_value in (
    ("ROC-AUC: {:.4f}", final_auc),
    ("Log Loss: {:.4f}", final_loss),
):
    print(report_template.format(metric_value))


최종 모델 평가:
ROC-AUC: 0.7390
Log Loss: 0.4863


In [20]:
# Positive-class probabilities for the ordinal-encoded test frame.
test_class_probas = final_model.predict_proba(test_encoded)
pred_probas = test_class_probas[:, 1]
print("예측 확률:", pred_probas)

예측 확률: [0.00362349 0.00309804 0.14509176 ... 0.44894606 0.22129401 0.00267728]


In [21]:
# Put the predictions into the sample-submission layout and write the file.
sample_submission = pd.read_csv('./sample_submission.csv').assign(probability=pred_probas)
sample_submission.to_csv('./submit/submit_11.csv', index=False)

## lightGBM Regressor test

In [9]:
def objective(trial):
    """Optuna objective for an LGBMRegressor fitted on the binary target.

    Samples one hyperparameter set, fits the regressor on the notebook-level
    X_train / y_train with `categorical_columns` declared as categorical
    features, and scores its predictions on X_valid / y_valid.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        R² score on the validation split (higher is better, so the study
        running this objective must use direction='maximize').
    """
    # suggest_float replaces the deprecated suggest_loguniform (log=True) and
    # suggest_uniform (default linear scale); the sampled ranges are unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
    }

    model = LGBMRegressor(
        **params,
        random_state=42,
        n_jobs=-1
    )
    model.fit(
        X_train, y_train,
        categorical_feature=categorical_columns
    )

    # Predict the validation split.
    y_pred = model.predict(X_valid)

    # The score is R², not RMSE as the earlier comments stated.
    r2 = r2_score(y_valid, y_pred)

    return r2

In [None]:
# 3. Create a single-objective Optuna study that maximizes the validation R²
#    returned by objective, and run the search.
# TPE is the sampler Optuna uses by default for single-objective studies; it is
# passed explicitly only to fix its seed, so a re-run proposes the same trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# 4. Print the best hyperparameters and their R² score.
print("Best trial:")
trial = study.best_trial
print("  R² Score: {:.4f}".format(trial.value))
print("  Best hyperparameters:")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [12]:
# Show the hyperparameters of the first entry in study.best_trials.
first_best_trial = study.best_trials[0]
best_params = first_best_trial.params
best_params

{'n_estimators': 483,
 'learning_rate': 0.019050943579556338,
 'max_depth': 10,
 'num_leaves': 32,
 'min_child_samples': 59,
 'subsample': 0.6535773088799172,
 'colsample_bytree': 0.7581013800662908,
 'reg_alpha': 5.8445306660077625,
 'reg_lambda': 0.18197863515493562}

In [16]:
# 5. Build the final regressor from the best parameters
#    (it could also be refit on the full data).
best_params = study.best_trials[0].params

final_model = LGBMRegressor(**best_params, random_state=42, n_jobs=-1)
final_model.fit(
    X_train,
    y_train,
    categorical_feature=categorical_columns,
)

# Score the final model on the validation split (example evaluation).
pred_probas_final = final_model.predict(X_valid)
final_r2 = r2_score(y_valid, pred_probas_final)
print("\n최종 모델 평가:")
print("  R² Score: {:.4f}".format(final_r2))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011899 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 725
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 59
[LightGBM] [Info] Start training from score 0.258933

최종 모델 평가:
  R² Score: 0.1328


In [18]:
# final_model is a regressor here, so predict() returns regression scores that
# can fall outside [0, 1]; clip them before writing them to the 'probability'
# column of the submission.
pred_probas = np.clip(final_model.predict(test_encoded), 0.0, 1.0)
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission['probability'] = pred_probas
sample_submission.to_csv('./submit/submit_16.csv', index=False)

## 앙상블 테스트

In [10]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# 1. Data: this cell reuses the X_train / X_valid / y_train / y_valid split
#    created earlier in the notebook.

# 2. Base (level-1) models
# - CatBoostClassifier (verbose=0 silences its training log)
cat_model = CatBoostClassifier(iterations=200, learning_rate=0.1, depth=6, verbose=0, random_state=42)
# - RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# - LGBMClassifier
lgbm_model = LGBMClassifier(n_estimators=200, learning_rate=0.1, random_state=42)

# 3. Fit each base model on the training split.
cat_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
lgbm_model.fit(X_train, y_train)

# Positive-class probability of every base model on the validation split.
cat_pred = cat_model.predict_proba(X_valid)[:, 1]
rf_pred = rf_model.predict_proba(X_valid)[:, 1]
lgbm_pred = lgbm_model.predict_proba(X_valid)[:, 1]

# 4. Stack the three probability vectors column-wise: one row per validation
#    sample, one column per base model.
stacked_features = np.column_stack([cat_pred, rf_pred, lgbm_pred])

# 5. Meta (level-2) model: a logistic regression over the base-model outputs,
#    fitted on all validation rows so that a fitted meta_model is available.
meta_model = LogisticRegression(random_state=42)
meta_model.fit(stacked_features, y_valid)

# 6. Evaluate the stacked ensemble.
# The meta-model is fitted on the validation rows, so scoring its predictions on
# those same rows is an in-sample, optimistic estimate. Use out-of-fold
# predictions instead: every validation row is predicted by a clone of the
# meta-model that was fitted without that row.
meta_pred = cross_val_predict(
    meta_model,
    stacked_features,
    y_valid,
    cv=5,
    method='predict_proba',
)[:, 1]

# ROC-AUC of each base model and of the stacked ensemble on the validation split.
auc_cat = roc_auc_score(y_valid, cat_pred)
auc_rf = roc_auc_score(y_valid, rf_pred)
auc_lgbm = roc_auc_score(y_valid, lgbm_pred)
auc_stacking = roc_auc_score(y_valid, meta_pred)

print("개별 모델 ROC-AUC:")
print(f"  CatBoost:    {auc_cat:.4f}")
print(f"  RandomForest:{auc_rf:.4f}")
print(f"  LightGBM:    {auc_lgbm:.4f}")
print("\n스태킹 앙상블 ROC-AUC:")
print(f"  Meta (Logistic Regression): {auc_stacking:.4f}")



: 

: 

: 

## 파일 제출

In [9]:
# The encoded test frame built earlier in this notebook is named `test_encoded`;
# `X_test_encoded` is never defined and raised a NameError here.
# NOTE(review): predict_proba needs final_model to be one of the classifiers;
# confirm which model cell ran last before executing this cell.
pred_probas = final_model.predict_proba(test_encoded)[:, 1]
print("예측 확률:", pred_probas)

In [10]:
# Put the predictions into the sample-submission layout and write the file.
# NOTE(review): an earlier cell in this notebook writes the same output path.
sample_submission = pd.read_csv('./sample_submission.csv').assign(probability=pred_probas)
sample_submission.to_csv('./submit/submit_10.csv', index=False)