In [5]:
# 라이브러리

import os
import random

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from xgboost import XGBClassifier


In [None]:
# Load the data: read train.csv and test.csv from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Show (rows, columns) of each table as a quick sanity check.
print("train shape:", train.shape)
print("test  shape:", test.shape)

train shape: (7000, 18)
test  shape: (3000, 17)


In [8]:
# Fix the random seeds
def seed_everything(seed: int) -> None:
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to each of the sources above.

    Note:
        Writing PYTHONHASHSEED into ``os.environ`` only influences Python
        processes started afterwards; the hash randomization of the kernel
        that is already running was fixed at interpreter start-up.
    """
    random.seed(seed)                          # stdlib RNG
    os.environ['PYTHONHASHSEED'] = str(seed)   # inherited by child processes
    np.random.seed(seed)                       # NumPy legacy global RNG


seed_everything(42)  # fix the seed once for the whole notebook

In [7]:
# Names of the 16 input columns used as model features.
feature_cols = [
    '나이',
    '키(cm)',
    '몸무게(kg)',
    'BMI',
    '시력',
    '충치',
    '공복 혈당',
    '혈압',
    '중성 지방',
    '혈청 크레아티닌',
    '콜레스테롤',
    '고밀도지단백',
    '저밀도지단백',
    '헤모글로빈',
    '요 단백',
    '간 효소율',
]

# Target and feature matrix come from the training table; the test table is
# restricted to the same columns so both share one layout.
y = train['label']
X = train[feature_cols]
X_test_final = test[feature_cols]

In [9]:
# Split into train / validation sets
# Hold out 20% of the rows for validation. `stratify=y` asks for a split
# stratified on the label, and `random_state` pins the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Report the size of each partition.
print("X_train:", X_train.shape, " / X_valid:", X_valid.shape)

X_train: (5600, 16)  / X_valid: (1400, 16)


In [26]:
# RandomForest: 1000 trees with no depth limit, sqrt(n_features) candidates
# per split, and no class re-weighting.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_features="sqrt",
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=4,
    class_weight=None,
    n_jobs=-1,
    random_state=42,
)

# XGBoost: 700 boosting rounds of depth-6 trees, binary logistic objective
# evaluated with log-loss; 90% row / column subsampling per tree.
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=700,
    learning_rate=0.07,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.0,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42,
)

In [27]:
# --- RandomForest: fit on the training split, then score both splits ---
rf.fit(X_train, y_train)
rf_train_pred = rf.predict(X_train)
rf_valid_pred = rf.predict(X_valid)
rf_train_acc = accuracy_score(y_train, rf_train_pred)
rf_valid_acc = accuracy_score(y_valid, rf_valid_pred)

# --- XGBoost: same procedure ---
xgb.fit(X_train, y_train)
xgb_train_pred = xgb.predict(X_train)
xgb_valid_pred = xgb.predict(X_valid)
xgb_train_acc = accuracy_score(y_train, xgb_train_pred)
xgb_valid_acc = accuracy_score(y_valid, xgb_valid_pred)

# Side-by-side train / validation accuracy of the individual models.
print("\n=== 단일 모델 성능 (threshold = 0.5 기준) ===")
print(f"RandomForest  | Train: {rf_train_acc:.4f} / Valid: {rf_valid_acc:.4f}")
print(f"XGBoost       | Train: {xgb_train_acc:.4f} / Valid: {xgb_valid_acc:.4f}")


=== 단일 모델 성능 (threshold = 0.5 기준) ===
RandomForest  | Train: 0.9982 / Valid: 0.7486
XGBoost       | Train: 0.9996 / Valid: 0.7350


In [33]:

# 5. Soft voting + threshold tuning --------------------------------
# Column 1 of predict_proba is taken as the positive-class probability
# (assumes the label is encoded as 0/1 — TODO confirm).
rf_valid_proba  = rf.predict_proba(X_valid)[:, 1]
xgb_valid_proba = xgb.predict_proba(X_valid)[:, 1]

# Blend weights for the two models
w_rf  = 0.7
w_xgb = 0.3

ensemble_proba_valid = (rf_valid_proba * w_rf) + (xgb_valid_proba * w_xgb)

# Scan cut-offs from 0.30 to 0.70 in steps of 0.01 and keep the first one
# that reaches the highest validation accuracy (strict ">" keeps the lowest
# threshold on ties).
best_acc = 0
best_thr = 0.5

for thr in np.arange(0.30, 0.71, 0.01):
    pred = (ensemble_proba_valid >= thr).astype(int)
    acc = accuracy_score(y_valid, pred)
    if acc > best_acc:
        best_acc = acc
        best_thr = thr

# NOTE: best_acc is measured on the same validation rows the threshold was
# selected on, so it is an optimistic estimate of held-out accuracy.
print("\n Soft Voting + Threshold 튜닝 결과 ")
print(f"Best threshold : {best_thr:.2f}")
print(f"Best Valid Acc : {best_acc:.4f}")


# 6. Refit on the full training data, then predict the test set -------------
# clone() builds unfitted copies carrying exactly the hyperparameters of the
# models the weights and threshold were tuned on, instead of re-typing every
# parameter by hand (which lets the two definitions silently drift apart).
rf_final = clone(rf)
xgb_final = clone(xgb)

rf_final.fit(X, y)
xgb_final.fit(X, y)

rf_test_proba  = rf_final.predict_proba(X_test_final)[:, 1]
xgb_test_proba = xgb_final.predict_proba(X_test_final)[:, 1]

# Same blend and cut-off as on the validation split.
ensemble_proba_test = (rf_test_proba * w_rf) + (xgb_test_proba * w_xgb)
test_pred = (ensemble_proba_test >= best_thr).astype(int)


 Soft Voting + Threshold 튜닝 결과 
Best threshold : 0.47
Best Valid Acc : 0.7507


In [15]:
# Export of the test predictions to a submission CSV. Every line below is
# commented out, so running this cell currently writes nothing.
# NOTE(review): presumably sample_submission.csv has one row per test row,
# in the same order as `test_pred` — confirm before re-enabling.
# submission = pd.read_csv("sample_submission.csv")
# submission["label"] = test_pred
# submission.to_csv("submission_rf_xgb_aggresive.csv", index=False)
