In [40]:
# 라이브러리
import pandas as pd
import numpy as np
import os, random

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier


In [41]:
# Fix random seeds for reproducibility
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, exports ``PYTHONHASHSEED`` with the
    string form of ``seed``, and seeds NumPy's global legacy generator.

    Args:
        seed: Value passed unchanged to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # startup, so this likely affects child processes rather than the running
    # kernel -- confirm against the Python docs.
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)


seed_everything(42)  # one fixed seed for the whole notebook


In [42]:
# Load the data, then separate features from the label.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# The target is the "label" column; "ID" is excluded from both feature frames.
y = train["label"]
X = train.drop(columns=["ID", "label"])
X_test_final = test.drop(columns="ID")


In [43]:
# Train / validation split: 20% of the rows are held out, stratified on y,
# with a fixed random_state so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [47]:

# ---------------------------------------------------------------------------
# Ensemble configuration and helpers
# ---------------------------------------------------------------------------

# Soft-voting weights applied to each model's class-1 probability.
RF_WEIGHT = 0.6
XGB_WEIGHT = 0.4

# Candidate decision thresholds: 0.30, 0.31, ..., 0.70.
# Built from integers so every value is the exact nearest float (0.57 rather
# than 0.5700000000000003) and the endpoint does not depend on float rounding,
# which np.arange with a fractional step cannot guarantee.
THRESHOLD_GRID = np.arange(30, 71) / 100


def _make_rf():
    """Return an unfitted RandomForestClassifier with the notebook's settings.

    Used for both the validation model and the final model so the two can
    never drift apart.
    """
    return RandomForestClassifier(
        n_estimators=800,
        max_depth=7,
        min_samples_leaf=2,
        min_samples_split=4,
        class_weight="balanced_subsample",
        random_state=42,
        n_jobs=-1,
    )


def _make_xgb():
    """Return an unfitted XGBClassifier with the notebook's settings.

    Used for both the validation model and the final model so the two can
    never drift apart.
    """
    return XGBClassifier(
        random_state=42,
        n_estimators=800,
        max_depth=7,
        learning_rate=0.08,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
    )


def _soft_vote(rf_proba, xgb_proba):
    """Blend two probability arrays using RF_WEIGHT and XGB_WEIGHT."""
    return (rf_proba * RF_WEIGHT) + (xgb_proba * XGB_WEIGHT)


# ---------------------------------------------------------------------------
# Validation models (fit on the training split only)
# ---------------------------------------------------------------------------

# RandomForest
rf = _make_rf()
rf.fit(X_train, y_train)

# Column 1 of predict_proba is used as the positive-class probability.
# NOTE(review): assumes a binary label whose positive class sorts second
# (e.g. 0/1) -- confirm against the data.
rf_valid_proba = rf.predict_proba(X_valid)[:, 1]

# XGBoost
xgb = _make_xgb()
xgb.fit(X_train, y_train)

xgb_valid_proba = xgb.predict_proba(X_valid)[:, 1]

# Soft voting on the validation split
ensemble_proba = _soft_vote(rf_valid_proba, xgb_valid_proba)


# ---------------------------------------------------------------------------
# Threshold search
# ---------------------------------------------------------------------------

# Keep the first threshold that reaches the highest validation accuracy
# (strict ">" means ties go to the lowest threshold).
# NOTE(review): the threshold is tuned on the same validation split that
# best_acc is measured on, so best_acc is an optimistic estimate.
best_acc = 0
best_thr = 0.5

for thr in THRESHOLD_GRID:
    pred = (ensemble_proba >= thr).astype(int)
    acc = accuracy_score(y_valid, pred)
    if acc > best_acc:
        best_acc = acc
        best_thr = float(thr)


print(" Best threshold:", best_thr)
print(" Valid Accuracy:", best_acc)


# ---------------------------------------------------------------------------
# Final models: same configurations, refit on the full training data
# ---------------------------------------------------------------------------

rf_final = _make_rf()
rf_final.fit(X, y)

xgb_final = _make_xgb()
xgb_final.fit(X, y)

# Soft voting on the test set, with the same weights as validation
rf_test = rf_final.predict_proba(X_test_final)[:, 1]
xgb_test = xgb_final.predict_proba(X_test_final)[:, 1]

test_ensemble = _soft_vote(rf_test, xgb_test)

# Apply the threshold selected on the validation split.
test_pred = (test_ensemble >= best_thr).astype(int)


# Save submission (left disabled, as in the original cell).
# NOTE(review): `sub` is not defined in any visible cell; it must be created
# before these lines are enabled, e.g.
# sub = pd.DataFrame({"ID": test["ID"], "label": test_pred})
# sub["label"] = test_pred
# sub.to_csv("submission_best.csv", index=False)


 Best threshold: 0.5700000000000003
 Valid Accuracy: 0.7557142857142857


In [48]:
# Overfitting check: compare each model's accuracy on the training split
# with its accuracy on the validation split.
def _train_valid_accuracy(model):
    """Return (train accuracy, validation accuracy) for a fitted classifier."""
    train_score = accuracy_score(y_train, model.predict(X_train))
    valid_score = accuracy_score(y_valid, model.predict(X_valid))
    return train_score, valid_score


rf_train_acc, rf_valid_acc = _train_valid_accuracy(rf)
xgb_train_acc, xgb_valid_acc = _train_valid_accuracy(xgb)

print("=== 과적합 체크 ===")
print(f"RF Train: {rf_train_acc:.4f} / Valid: {rf_valid_acc:.4f}")
print(f"XGB Train: {xgb_train_acc:.4f} / Valid: {xgb_valid_acc:.4f}")

=== 과적합 체크 ===
RF Train: 0.7473 / Valid: 0.7207
XGB Train: 1.0000 / Valid: 0.7407
