In [16]:
# ============================================
# 1. 라이브러리 & 시드 고정
# ============================================
import os, random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

def seed_everything(seed=42):
    """Seed the global RNGs used by this script and export PYTHONHASHSEED.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    # Standard-library RNG first, then NumPy's global RNG.
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here presumably only affects child processes - confirm.
    os.environ.update(PYTHONHASHSEED=str(seed))


seed_everything(seed=42)


# ============================================
# 2. Load data
# ============================================
# All three CSVs are read from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Quick sanity check: frame shapes and the first few training rows.
for caption, frame in (("train shape:", train), ("test  shape:", test)):
    print(caption, frame.shape)
print(train.head(3))


# ============================================
# 3. Split features / label (original 16 features only)
# ============================================
# Everything except the row identifier and the target is used as a feature.
X = train.drop(["ID", "label"], axis=1)
y = train["label"].values

# The test frame is only stripped of its identifier column.
X_test_final = test.drop("ID", axis=1)

print("\n사용 피처:", list(X.columns))
print("X shape:", X.shape, "/ y shape:", y.shape)


# ============================================
# 4. Train / valid split (same recipe as always)
# ============================================
# 80/20 hold-out, stratified on the label, with a fixed random_state so the
# split is repeatable.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

print("\nX_train:", X_train.shape, "/ X_valid:", X_valid.shape)


# ============================================
# 5. Model definitions (centred on the validated setup)
#    - this combination scored around 0.7586
# ============================================
rf_params = {
    "n_estimators": 1100,
    "max_depth": None,  # no depth limit (worked best on the current data)
    "min_samples_split": 4,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "bootstrap": True,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)

xgb_params = {
    "n_estimators": 700,
    "max_depth": 6,
    "learning_rate": 0.07,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1,
}
xgb = XGBClassifier(**xgb_params)


# ============================================
# 6. Fit on the train split / score on the valid split
# ============================================
# Fit the RandomForest first, then XGBoost, announcing each one beforehand.
for banner, estimator in (
    ("\n=== RandomForest 학습 ===", rf),
    ("\n=== XGBoost 학습 ===", xgb),
):
    print(banner)
    estimator.fit(X_train, y_train)

# Column 1 of predict_proba is kept as the score for each model.
rf_valid_proba = rf.predict_proba(X_valid)[:, 1]
xgb_valid_proba = xgb.predict_proba(X_valid)[:, 1]

# Per-model performance at a plain 0.5 cut-off (for reference only).
rf_pred_05 = (rf_valid_proba >= 0.5).astype(int)
xgb_pred_05 = (xgb_valid_proba >= 0.5).astype(int)

print("\n=== 단일 모델 Valid 성능 (thr = 0.5) ===")
rf_valid_acc = accuracy_score(y_valid, rf_pred_05)
print(f"RF  Valid Acc : {rf_valid_acc:.4f}")
xgb_valid_acc = accuracy_score(y_valid, xgb_pred_05)
print(f"XGB Valid Acc : {xgb_valid_acc:.4f}")


# ============================================
# 7. Soft voting + fine threshold tuning
#    - RF weight 0.70 ~ 0.85
#    - threshold 0.44 ~ 0.50
#    - only the range that worked well before is searched
#      (saves time + avoids over-searching)
# ============================================
# Fallback values; they are kept only if no grid point scores above 0.0.
best_acc   = 0.0
best_thr   = 0.47
best_w_rf  = 0.75
best_w_xgb = 0.25

print("\n=== Soft Voting + Threshold 튜닝 시작 ===")

# np.linspace is used instead of np.arange: with a fractional step, arange
# derives its length from ceil((stop - start) / step), and floating-point
# rounding can add an element past the intended end (an extra weight of
# about 0.86 for the 0.70..0.85 grid). linspace pins both endpoints and the
# number of points explicitly.
rf_weights = np.linspace(0.70, 0.85, 16)  # 0.70, 0.71, ..., 0.85
thr_list   = np.linspace(0.44, 0.50, 31)  # 0.44, 0.442, ..., 0.50

# NOTE: the weight/threshold pair is chosen on the same validation split whose
# accuracy is reported below, so "Best Valid Acc" is an optimistic estimate.
for w_rf in rf_weights:
    # The two weights always sum to 1.
    w_xgb = 1.0 - w_rf
    ens_proba = rf_valid_proba * w_rf + xgb_valid_proba * w_xgb

    for thr in thr_list:
        pred = (ens_proba >= thr).astype(int)
        acc  = accuracy_score(y_valid, pred)

        # Strict ">" keeps the first grid point that reaches the best score.
        if acc > best_acc:
            best_acc   = acc
            best_thr   = thr
            best_w_rf  = w_rf
            best_w_xgb = w_xgb

print("\n=== Soft Voting + Threshold 최종 결과 ===")
print(f"Best RF weight : {best_w_rf:.3f}")
print(f"Best XGB weight: {best_w_xgb:.3f}")
print(f"Best threshold : {best_thr:.3f}")
print(f"Best Valid Acc : {best_acc:.4f}")


# ============================================
# 8. Final fit on the full training data
# ============================================
print("\n=== 전체 데이터로 최종 학습 ===")
rf_final_params = {
    "n_estimators": 1100,
    "max_depth": None,
    "min_samples_split": 4,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "bootstrap": True,
    "random_state": 42,
    "n_jobs": -1,
}
xgb_final_params = {
    "n_estimators": 700,
    "max_depth": 6,
    "learning_rate": 0.07,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1,
}

# Fresh estimators, fitted on every labelled row (train + valid portions).
rf_final = RandomForestClassifier(**rf_final_params)
xgb_final = XGBClassifier(**xgb_final_params)

rf_final.fit(X, y)
xgb_final.fit(X, y)

# Column 1 of predict_proba is kept as the score for each model.
rf_test_proba = rf_final.predict_proba(X_test_final)[:, 1]
xgb_test_proba = xgb_final.predict_proba(X_test_final)[:, 1]

# Blend with the weights found in the grid search, then cut at its threshold.
final_proba = best_w_rf * rf_test_proba + best_w_xgb * xgb_test_proba
test_pred = (final_proba >= best_thr).astype(int)

# ============================================
# 9. Create the submission file
# ============================================
# Store the thresholded ensemble predictions in the "label" column of the
# sample-submission frame.
# NOTE(review): assumes sample_submission rows are in the same order as the
# rows of test.csv - confirm.
sample_submission["label"] = test_pred
# index=False keeps the DataFrame index out of the written CSV.
sample_submission.to_csv("submission_KTE_6.csv", index=False)
print("\n✅ submission_KTE_6.csv 저장 완료!")

train shape: (7000, 18)
test  shape: (3000, 17)
           ID  나이  키(cm)  몸무게(kg)    BMI    시력  충치  공복 혈당  혈압  중성 지방  \
0  TRAIN_0000  35    170       70  24.22  1.10   1     98  40     80   
1  TRAIN_0001  40    150       55  24.44  1.00   0    173  39    104   
2  TRAIN_0002  60    170       50  17.30  0.75   0     96  40     61   

   혈청 크레아티닌  콜레스테롤  고밀도지단백  저밀도지단백  헤모글로빈  요 단백  간 효소율  label  
0       1.3    211      75     120   15.9     1   1.53      1  
1       0.6    251      46     184   11.8     1   1.45      0  
2       0.8    144      43      89   15.3     1   1.04      0  

사용 피처: ['나이', '키(cm)', '몸무게(kg)', 'BMI', '시력', '충치', '공복 혈당', '혈압', '중성 지방', '혈청 크레아티닌', '콜레스테롤', '고밀도지단백', '저밀도지단백', '헤모글로빈', '요 단백', '간 효소율']
X shape: (7000, 16) / y shape: (7000,)

X_train: (5600, 16) / X_valid: (1400, 16)

=== RandomForest 학습 ===

=== XGBoost 학습 ===

=== 단일 모델 Valid 성능 (thr = 0.5) ===
RF  Valid Acc : 0.7471
XGB Valid Acc : 0.7350

=== Soft Voting + Threshold 튜닝 시작 ===

=== Soft Voti