In [11]:

# 라이브러리 & 시드 고정

import os, random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


def seed_everything(seed=42):
    """Seed the Python and NumPy global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in ``os.environ["PYTHONHASHSEED"]``.

    Note:
        PYTHONHASHSEED is read by the interpreter at startup, so setting it
        here influences only processes launched afterwards, not the hash
        randomization of the interpreter that is already running.
    """
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    os.environ.update({"PYTHONHASHSEED": str(seed)})

seed_everything(42)



# 2. Load the data

train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

print(f"train shape: {train.shape}")
print(f"test  shape: {test.shape}")
print(train.head(3))



# 3. Split features / label (drop ID and label from the feature matrix)

non_feature_cols = ["ID", "label"]
X = train.drop(columns=non_feature_cols)
y = train["label"].values

# The test frame is only stripped of its ID column here.
X_test_final = test.drop(columns="ID")

print(f"\n사용 피처: {X.columns.tolist()}")
print(f"X shape: {X.shape} / y shape: {y.shape}")



# 4. Train / validation split

# 80/20 hold-out, stratified on the label so both parts keep the same class
# ratio; the fixed random_state makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

print(f"\nX_train: {X_train.shape} / X_valid: {X_valid.shape}")




train shape: (7000, 18)
test  shape: (3000, 17)
           ID  나이  키(cm)  몸무게(kg)    BMI    시력  충치  공복 혈당  혈압  중성 지방  \
0  TRAIN_0000  35    170       70  24.22  1.10   1     98  40     80   
1  TRAIN_0001  40    150       55  24.44  1.00   0    173  39    104   
2  TRAIN_0002  60    170       50  17.30  0.75   0     96  40     61   

   혈청 크레아티닌  콜레스테롤  고밀도지단백  저밀도지단백  헤모글로빈  요 단백  간 효소율  label  
0       1.3    211      75     120   15.9     1   1.53      1  
1       0.6    251      46     184   11.8     1   1.45      0  
2       0.8    144      43      89   15.3     1   1.04      0  

사용 피처: ['나이', '키(cm)', '몸무게(kg)', 'BMI', '시력', '충치', '공복 혈당', '혈압', '중성 지방', '혈청 크레아티닌', '콜레스테롤', '고밀도지단백', '저밀도지단백', '헤모글로빈', '요 단백', '간 효소율']
X shape: (7000, 16) / y shape: (7000,)

X_train: (5600, 16) / X_valid: (1400, 16)


train shape: (7000, 18)
test  shape: (3000, 17)
           ID  나이  키(cm)  몸무게(kg)    BMI    시력  충치  공복 혈당  혈압  중성 지방  \
0  TRAIN_0000  35    170       70  24.22  1.10   1     98  40     80   
1  TRAIN_0001  40    150       55  24.44  1.00   0    173  39    104   
2  TRAIN_0002  60    170       50  17.30  0.75   0     96  40     61   

   혈청 크레아티닌  콜레스테롤  고밀도지단백  저밀도지단백  헤모글로빈  요 단백  간 효소율  label  
0       1.3    211      75     120   15.9     1   1.53      1  
1       0.6    251      46     184   11.8     1   1.45      0  
2       0.8    144      43      89   15.3     1   1.04      0  

사용 피처: ['나이', '키(cm)', '몸무게(kg)', 'BMI', '시력', '충치', '공복 혈당', '혈압', '중성 지방', '혈청 크레아티닌', '콜레스테롤', '고밀도지단백', '저밀도지단백', '헤모글로빈', '요 단백', '간 효소율']
X shape: (7000, 16) / y shape: (7000,)

X_train: (5600, 16) / X_valid: (1400, 16)

=== RandomForest 학습 ===

=== XGBoost 학습 ===

=== 개별 모델 성능 (thr = 0.5) ===
RF  Train ACC : 0.9902 / Valid ACC : 0.7450 / Gap : 0.2452
XGB Train ACC : 0.9738 / Valid ACC : 0.7329 / 

In [None]:
# 5. Model definitions

# Single source of truth for the hyperparameters: the validation models and
# the final full-data models are both built from these dictionaries, so the
# configuration that was validated is exactly the one used for the submission.
RF_PARAMS = dict(
    n_estimators=1100,
    max_depth=None,
    min_samples_split=4,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)

XGB_PARAMS = dict(
    n_estimators=700,
    max_depth=6,
    learning_rate=0.07,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

# Soft-voting configuration (w_rf=0.75, w_xgb=0.25, thr=0.47).
# The weights sum to 1, so the blended score stays on the probability scale.
# NOTE(review): these look like tuned values; if they were selected on the
# same validation split used below, the reported ensemble accuracy is
# optimistic -- confirm how they were chosen.
w_rf = 0.75
w_xgb = 0.25
best_thr = 0.47


def _soft_vote(rf_proba, xgb_proba, rf_weight, xgb_weight, threshold):
    """Blend two probability vectors and threshold the result.

    Args:
        rf_proba: Positive-class probabilities from the random forest.
        xgb_proba: Positive-class probabilities from XGBoost.
        rf_weight: Weight applied to ``rf_proba``.
        xgb_weight: Weight applied to ``xgb_proba``.
        threshold: Blended scores ``>= threshold`` are labelled 1, others 0.

    Returns:
        A ``(blended_proba, hard_labels)`` tuple of arrays.
    """
    blended = rf_proba * rf_weight + xgb_proba * xgb_weight
    return blended, (blended >= threshold).astype(int)


rf = RandomForestClassifier(**RF_PARAMS)
xgb = XGBClassifier(**XGB_PARAMS)



# 6. Check performance on the train / validation split

print("\n=== Train/Valid에서 학습 ===")
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class
# (label 1 for the 0/1 labels used here).
rf_valid_proba  = rf.predict_proba(X_valid)[:, 1]
xgb_valid_proba = xgb.predict_proba(X_valid)[:, 1]

# Single models at the default 0.5 threshold
rf_pred_05  = (rf_valid_proba >= 0.5).astype(int)
xgb_pred_05 = (xgb_valid_proba >= 0.5).astype(int)

print("\n=== 단일 모델 Valid 성능 (thr = 0.5) ===")
print(f"RF  Valid ACC : {accuracy_score(y_valid, rf_pred_05):.4f}")
print(f"XGB Valid ACC : {accuracy_score(y_valid, xgb_pred_05):.4f}")

# Weighted soft-voting ensemble on the validation split
ens_valid_proba, ens_valid_pred = _soft_vote(
    rf_valid_proba, xgb_valid_proba, w_rf, w_xgb, best_thr
)

valid_acc = accuracy_score(y_valid, ens_valid_pred)

print("\n=== 최종 앙상블 Valid 성능 ===")
print(f"Ensemble Valid ACC : {valid_acc:.4f}")
print(f"(w_rf={w_rf}, w_xgb={w_xgb}, thr={best_thr})")



# 7. Retrain on the full training data, then predict the test set

print("\n=== 전체 train 데이터(7000개)로 최종 학습 ===")

# Fresh, unfitted estimators built from the same hyperparameters as above.
rf_final = RandomForestClassifier(**RF_PARAMS)
xgb_final = XGBClassifier(**XGB_PARAMS)

rf_final.fit(X, y)
xgb_final.fit(X, y)

rf_test_proba  = rf_final.predict_proba(X_test_final)[:, 1]
xgb_test_proba = xgb_final.predict_proba(X_test_final)[:, 1]

# Same weights and threshold as the validated ensemble.
final_proba, test_pred = _soft_vote(
    rf_test_proba, xgb_test_proba, w_rf, w_xgb, best_thr
)



# 8. Write the submission file

submission = pd.read_csv("sample_submission.csv")

# Predictions are assigned by position, so the template must line up with the
# test rows. Fail loudly instead of writing a silently misaligned file.
if len(submission) != len(test_pred):
    raise ValueError(
        f"sample_submission.csv has {len(submission)} rows but "
        f"{len(test_pred)} predictions were produced"
    )
# The ID comparison only runs when both frames actually carry an ID column.
if "ID" in submission.columns and "ID" in test.columns:
    if submission["ID"].tolist() != test["ID"].tolist():
        raise ValueError(
            "ID order in sample_submission.csv does not match test.csv"
        )

submission["label"] = test_pred
submission.to_csv("submission_kte_7.csv", index=False)
print("\nsubmission_kte_7.csv 저장 완료!")


=== Train/Valid에서 학습 ===

=== 단일 모델 Valid 성능 (thr = 0.5) ===
RF  Valid ACC : 0.7471
XGB Valid ACC : 0.7350

=== 최종 앙상블 Valid 성능 ===
Ensemble Valid ACC : 0.7586
(w_rf=0.75, w_xgb=0.25, thr=0.47)

=== 전체 train 데이터(7000개)로 최종 학습 ===

✅ submission_kte_7.csv 저장 완료!
