In [158]:
import numpy as np
import pandas as pd
import random, os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier


# Fix the random seeds so repeated runs start from the same RNG state.
def set_seed(seed=42):
    """Seed Python's `random` module and NumPy's global RNG, and set PYTHONHASHSEED.

    Args:
        seed: Value handed to both seeding functions and written (as a string)
            to the PYTHONHASHSEED environment variable. Defaults to 42.

    Note:
        PYTHONHASHSEED is read when an interpreter starts, so the assignment
        below influences processes launched afterwards rather than the hash
        randomization of the kernel that is already running.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)


set_seed(42)

# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Every column except "ID" and "label" is used as a model input.
non_feature_cols = ("ID", "label")
features = [col for col in train.columns if col not in non_feature_cols]

X, y = train[features], train["label"]
X_test = test[features]

# Train / validation split: hold out 20% of the rows, stratified on the label,
# with a fixed random_state so the split is the same on every run.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [160]:

# 1) RandomForest -- fitted on the training split only.
rf_params = {
    "n_estimators": 800,
    "max_depth": 8,
    "min_samples_split": 4,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_tr, y_tr)

# 2) CatBoost -- fitted on the same training split, with training logs silenced.
cat_params = {
    "iterations": 500,
    "depth": 6,
    "learning_rate": 0.04,
    "loss_function": "Logloss",
    "random_seed": 42,
    "verbose": 0,
}
cat = CatBoostClassifier(**cat_params)
cat.fit(X_tr, y_tr)

# Baseline validation accuracy from each model's own `.predict()` labels
# (the original note describes this as threshold=0.5).
rf_val = rf.predict(X_val)
cat_val = cat.predict(X_val)

rf_val_acc = accuracy_score(y_val, rf_val)
cat_val_acc = accuracy_score(y_val, cat_val)
print("RF acc  :", rf_val_acc)
print("Cat acc :", cat_val_acc)


RF acc  : 0.74
Cat acc : 0.745


In [161]:
# 3) Soft voting: search the RF/CatBoost blend weight and the decision
#    threshold that maximise accuracy on the validation split.

# Column 1 of predict_proba is taken as the positive-class probability.
rf_proba  = rf.predict_proba(X_val)[:, 1]
cat_proba = cat.predict_proba(X_val)[:, 1]

# Build the grids with linspace + round rather than a float-step np.arange:
# the accumulated floating-point error of arange produced grid values such as
# 0.4600000000000001 and makes the number of grid points depend on rounding.
rf_weights = np.round(np.linspace(0.30, 0.80, 11), 2)   # 0.30, 0.35, ..., 0.80
thresholds = np.round(np.linspace(0.40, 0.60, 21), 2)   # 0.40, 0.41, ..., 0.60

# Starting values; they are kept only if no grid point scores above 0.
best_acc = 0
best_w = 0.5
best_thr = 0.5

for w in rf_weights:
    # Weighted average of the two probability vectors (w for RF, 1 - w for CatBoost).
    blend = rf_proba * w + cat_proba * (1 - w)

    for thr in thresholds:
        pred = (blend >= thr).astype(int)
        acc = accuracy_score(y_val, pred)

        # Strict ">" keeps the first grid point reached among ties, i.e. the
        # smallest RF weight and then the smallest threshold.
        if acc > best_acc:
            best_acc = acc
            best_w = float(w)
            best_thr = float(thr)

# NOTE(review): the weight and threshold are selected on the same validation
# rows that produce "Best Valid Acc", so that figure is an optimistic estimate.
print("\n=== Soft Voting 결과 ===")
print("Best RF weight :", round(best_w, 2))
print("Best Cat weight:", round(1 - best_w, 2))
print("Best threshold :", round(best_thr, 2))
print("Best Valid Acc :", best_acc)




=== Soft Voting 결과 ===
Best RF weight : 0.7
Best Cat weight: 0.3
Best threshold : 0.4600000000000001
Best Valid Acc : 0.755


In [165]:
# --------------------------------
# 4) Refit on the full training data, then predict the test set
# --------------------------------
# Build fresh, unfitted models from the hyperparameters of the validated
# `rf` / `cat` estimators instead of retyping them, so the final models cannot
# drift from the configuration that the blend weight and threshold were tuned on.
rf_final = RandomForestClassifier(**rf.get_params())
cat_final = CatBoostClassifier(**cat.get_params())

rf_final.fit(X, y)
cat_final.fit(X, y)

# Column 1 of predict_proba is taken as the positive-class probability.
rf_test  = rf_final.predict_proba(X_test)[:, 1]
cat_test = cat_final.predict_proba(X_test)[:, 1]

# Apply the blend weight and threshold chosen during the validation search.
final_proba = rf_test * best_w + cat_test * (1 - best_w)
final_pred  = (final_proba >= best_thr).astype(int)

# Save
sub = pd.read_csv("sample_submission.csv")

# Predictions are written positionally, so the submission template must have
# one row per test row, in the same order.
if len(sub) != len(final_pred):
    raise ValueError(
        f"sample_submission.csv has {len(sub)} rows but {len(final_pred)} predictions were made"
    )
# NOTE(review): assumes "ID" identifies rows in both files -- confirm. The check
# only runs when both tables actually have that column.
if "ID" in sub.columns and "ID" in test.columns:
    if not np.array_equal(sub["ID"].to_numpy(), test["ID"].to_numpy()):
        raise ValueError("ID order in sample_submission.csv does not match test.csv")

sub["label"] = final_pred
sub.to_csv("submission_rf_cat_ensemble.csv", index=False)