In [299]:
# 라이브러리
import pandas as pd
import numpy as np
import os, random

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


In [300]:
# 시드고정

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정


In [301]:
# 데이터 로드
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")



In [302]:
# X,y 분리 ID, label 제거
X = train.drop(['ID', 'label'], axis=1)
y = train['label']

X_test = test.drop(['ID'], axis=1)



In [303]:
# train/ valid 분리
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42)

In [328]:
# 1) 랜덤 포레스트
rf = RandomForestClassifier(
    n_estimators=400,          
    min_samples_split=4,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42
)

# 2) GradientBoosting 
gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.3,
    max_depth=2,
    random_state=42
)

# 3) XGBoost 
xgb = XGBClassifier(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    )

models = [
    ("RandomForest", rf),
    ("GradientBoosting", gb),
    ("XGBoost", xgb)
]

In [329]:
proba_valid_list = []   
proba_test_list = []


print("개별 모델 성능 ")

for name, model in models:
    print(f" {name} 학습 중")
    model.fit(X_train, y_train)

    
    p_valid = model.predict_proba(X_valid)[:, 1]
    p_test = model.predict_proba(X_test)[:, 1]

    proba_valid_list.append(p_valid)
    proba_test_list.append(p_test)


    # 기본 threshold=0.5로 accuracy 확인
    y_pred_05 = (p_valid >= 0.5).astype(int)
    acc_05 = accuracy_score(y_valid, y_pred_05)

    print(f"{name}  |  Valid Accuracy @0.5 = {acc_05:.4f}")
    


개별 모델 성능 
 RandomForest 학습 중
RandomForest  |  Valid Accuracy @0.5 = 0.7314
 GradientBoosting 학습 중
GradientBoosting  |  Valid Accuracy @0.5 = 0.6936
 XGBoost 학습 중
XGBoost  |  Valid Accuracy @0.5 = 0.7157


In [324]:
proba_valid_ens = np.mean(proba_valid_list, axis=0)
proba_test_ens  = np.mean(proba_test_list , axis=0)

best_thr = 0.5
best_acc = 0.0


print("앙상블(3모델 평균) Threshold 탐색")

# 0.40 ~ 0.60 사이를 촘촘하게 탐색
for thr in np.linspace(0.40, 0.60, 41):  
    y_pred = (proba_valid_ens >= thr).astype(int)
    acc = accuracy_score(y_valid, y_pred)

    if acc > best_acc:
        best_acc = acc
        best_thr = thr

   
print(f"thr={thr:.3f}  acc={acc:.4f}")

print(f"\n▶ 앙상블 최적 Threshold: {best_thr:.4f}")
print(f"▶ 앙상블 최고 Valid Accuracy: {best_acc:.4f}")


# submit = pd.read_csv("sample_submission.csv")

# submit['label'] = (proba_test_ens >= best_thr).astype(int)

# print("\nsubmission 미리보기:")
# print(submit.head())

# submit.to_csv("submission_ensemble_xgb_rf_gb.csv", index=False)
# print("\n✅ submission_ensemble_xgb_rf_gb.csv 저장 완료")

앙상블(3모델 평균) Threshold 탐색
thr=0.600  acc=0.7036

▶ 앙상블 최적 Threshold: 0.5450
▶ 앙상블 최고 Valid Accuracy: 0.7179
