In [1]:
# 라이브러리
import pandas as pd
import numpy as np
import os, random

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


In [2]:
# 시드고정

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정


In [3]:
# 데이터 로드
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")



In [4]:
# X,y 분리 ID, label 제거
X = train.drop(['ID', 'label'], axis=1)
y = train['label']

X_test = test.drop(['ID'], axis=1)



In [5]:
# train/ valid 분리
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42)

In [6]:
# 1) 랜덤 포레스트
rf = RandomForestClassifier(
    n_estimators=400,          
    min_samples_split=4,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42
)

# 2) GradientBoosting 
gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.3,
    max_depth=2,
    random_state=42
)

# 3) XGBoost 
xgb = XGBClassifier(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    )

models = [
    ("RandomForest", rf),
    ("GradientBoosting", gb),
    ("XGBoost", xgb)
]

In [7]:
proba_valid_list = []   
proba_test_list = []


print("개별 모델 성능 ")

for name, model in models:
    print(f" {name} 학습 중")
    model.fit(X_train, y_train)

    
    p_valid = model.predict_proba(X_valid)[:, 1]
    p_test = model.predict_proba(X_test)[:, 1]

    proba_valid_list.append(p_valid)
    proba_test_list.append(p_test)


    # 기본 threshold=0.5로 accuracy 확인
    y_pred_05 = (p_valid >= 0.5).astype(int)
    acc_05 = accuracy_score(y_valid, y_pred_05)

    print(f"{name}  |  Valid Accuracy @0.5 = {acc_05:.4f}")
    


개별 모델 성능 
 RandomForest 학습 중
RandomForest  |  Valid Accuracy @0.5 = 0.7314
 GradientBoosting 학습 중
GradientBoosting  |  Valid Accuracy @0.5 = 0.6936
 XGBoost 학습 중
XGBoost  |  Valid Accuracy @0.5 = 0.7157


In [8]:
proba_valid_ens = np.mean(proba_valid_list, axis=0)
proba_test_ens  = np.mean(proba_test_list , axis=0)

best_thr = 0.5
best_acc = 0.0


print("앙상블(3모델 평균) Threshold 탐색")

# 0.40 ~ 0.60 사이를 촘촘하게 탐색
for thr in np.linspace(0.40, 0.60, 41):  
    y_pred = (proba_valid_ens >= thr).astype(int)
    acc = accuracy_score(y_valid, y_pred)

    if acc > best_acc:
        best_acc = acc
        best_thr = thr

   
print(f"thr={thr:.3f}  acc={acc:.4f}")

print(f"\n▶ 앙상블 최적 Threshold: {best_thr:.4f}")
print(f"▶ 앙상블 최고 Valid Accuracy: {best_acc:.4f}")


# submit = pd.read_csv("sample_submission.csv")

# submit['label'] = (proba_test_ens >= best_thr).astype(int)

# print("\nsubmission 미리보기:")
# print(submit.head())

# submit.to_csv("submission_ensemble_xgb_rf_gb.csv", index=False)
# print("\n✅ submission_ensemble_xgb_rf_gb.csv 저장 완료")

앙상블(3모델 평균) Threshold 탐색
thr=0.600  acc=0.7143

▶ 앙상블 최적 Threshold: 0.5400
▶ 앙상블 최고 Valid Accuracy: 0.7186


In [9]:
import pandas as pd
import numpy as np
import os, random

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

seed_everything(42)


train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")


print("train shape :", train.shape)
print("test shape  :", test.shape)
print(train.head(3))


X = train.drop(["ID", "label"], axis=1)
y = train["label"]

X_test_final = test.drop(["ID"], axis=1)   

print("\n[컬럼 확인]")
print(X.columns)



train shape : (7000, 18)
test shape  : (3000, 17)
           ID  나이  키(cm)  몸무게(kg)    BMI    시력  충치  공복 혈당  혈압  중성 지방  \
0  TRAIN_0000  35    170       70  24.22  1.10   1     98  40     80   
1  TRAIN_0001  40    150       55  24.44  1.00   0    173  39    104   
2  TRAIN_0002  60    170       50  17.30  0.75   0     96  40     61   

   혈청 크레아티닌  콜레스테롤  고밀도지단백  저밀도지단백  헤모글로빈  요 단백  간 효소율  label  
0       1.3    211      75     120   15.9     1   1.53      1  
1       0.6    251      46     184   11.8     1   1.45      0  
2       0.8    144      43      89   15.3     1   1.04      0  

[컬럼 확인]
Index(['나이', '키(cm)', '몸무게(kg)', 'BMI', '시력', '충치', '공복 혈당', '혈압', '중성 지방',
       '혈청 크레아티닌', '콜레스테롤', '고밀도지단백', '저밀도지단백', '헤모글로빈', '요 단백', '간 효소율'],
      dtype='object')


In [10]:
# 분리 
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42)
print("\nTrain / Valid shape")
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)



Train / Valid shape
(5600, 16) (1400, 16) (5600,) (1400,)


In [11]:

rf_settings = [
    {
        "name": "RF_basic",
        "params": dict(
            n_estimators=300,
            random_state=42,
            n_jobs=-1
        )
    },
    {
        "name": "RF_balanced_depth8",
        "params": dict(
            n_estimators=500,
            max_depth=8,
            min_samples_split=4,
            min_samples_leaf=2,
            class_weight="balanced_subsample",  # 불균형 보정
            random_state=42,
            n_jobs=-1
        )
    },
    {
        "name": "RF_balanced_depth12",
        "params": dict(
            n_estimators=600,
            max_depth=12,
            min_samples_split=4,
            min_samples_leaf=2,
            class_weight="balanced_subsample",
            random_state=42,
            n_jobs=-1
        )
    },
]

results = []  # 각 세팅별 성능 저장

for setting in rf_settings:
    name = setting["name"]
    params = setting["params"]

    print("\n===================================")
    print(f"▶ {name} 학습 시작")
    print("  하이퍼파라미터:", params)

    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    # 1) 기본 predict() → Hard Voting 결과
    y_pred_hard = model.predict(X_valid)
    acc_hard = accuracy_score(y_valid, y_pred_hard)
    print(f"  Hard Voting Accuracy (model.predict) : {acc_hard:.4f}")

    # 2) 확률 기반 threshold 튜닝 (Soft 방식)
    #    - predict_proba 로 1일 확률 가져온 뒤
    #    - 여러 threshold 를 돌려 가장 Accuracy 높은 점 찾기
    proba_valid = model.predict_proba(X_valid)[:, 1]

    best_thr = 0.5
    best_acc = acc_hard  # 최소한 hard voting 만큼은 나오도록 시작

    # 0.30 ~ 0.70 사이를 0.01 간격으로 돌리면서 최고값 탐색
    thresholds = np.arange(0.30, 0.71, 0.01)

    for thr in thresholds:
        y_pred_thr = (proba_valid >= thr).astype(int)
        acc_thr = accuracy_score(y_valid, y_pred_thr)
        if acc_thr > best_acc:
            best_acc = acc_thr
            best_thr = thr

    print(f"  Soft(확률) 기반 최적 threshold : {best_thr:.2f}")
    print(f"  Soft 방식 최고 Valid Accuracy  : {best_acc:.4f}")

    results.append({
        "name": name,
        "params": params,
        "acc_hard": acc_hard,
        "best_thr": best_thr,
        "best_acc": best_acc
    })

# =====================================================
# 5. 어떤 세팅이 제일 좋은지 정리
# =====================================================
print("\n=======================")
print("RandomForest 세팅별 결과")
print("=======================")
for r in results:
    print(
        f"{r['name']:20s} | "
        f"Hard: {r['acc_hard']:.4f}  | "
        f"Best_thr: {r['best_thr']:.2f}  | "
        f"Best_acc: {r['best_acc']:.4f}"
    )

# Validation 기준으로 제일 좋은 세팅 선택
best_setting = max(results, key=lambda x: x["best_acc"])

print("\n=======================")
print("최종 선택된 세팅 (Valid 기준)")
print("=======================")
print("이름         :", best_setting["name"])
print("파라미터     :", best_setting["params"])
print("Hard Acc     :", best_setting["acc_hard"])
print("Best thr     :", best_setting["best_thr"])
print("Best val Acc :", best_setting["best_acc"])

# =====================================================
# 6. 최종 모델 다시 학습 (전체 train 7000행 사용)
#    - 위에서 고른 최적 파라미터 + threshold 로 확정
# =====================================================

final_params = best_setting["params"]
final_thr = best_setting["best_thr"]

final_model = RandomForestClassifier(**final_params)
final_model.fit(X, y)  # 전체 train 데이터 사용해서 다시 학습

# # =====================================================
# # 7. test.csv 예측 → submission.csv 저장
# #    - 확률 기반 threshold 적용
# # =====================================================
# test_proba = final_model.predict_proba(X_test_final)[:, 1]
# test_pred  = (test_proba >= final_thr).astype(int)

# submission = sample_submission.copy()
# submission["label"] = test_pred

# print("\n제출 파일 미리보기:")
# print(submission.head())

# submission.to_csv("submission_rf_best.csv", index=False)
# print("\n✅ 'submission_rf_best.csv' 저장 완료")


▶ RF_basic 학습 시작
  하이퍼파라미터: {'n_estimators': 300, 'random_state': 42, 'n_jobs': -1}
  Hard Voting Accuracy (model.predict) : 0.7200
  Soft(확률) 기반 최적 threshold : 0.46
  Soft 방식 최고 Valid Accuracy  : 0.7271

▶ RF_balanced_depth8 학습 시작
  하이퍼파라미터: {'n_estimators': 500, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 2, 'class_weight': 'balanced_subsample', 'random_state': 42, 'n_jobs': -1}
  Hard Voting Accuracy (model.predict) : 0.6879
  Soft(확률) 기반 최적 threshold : 0.65
  Soft 방식 최고 Valid Accuracy  : 0.7236

▶ RF_balanced_depth12 학습 시작
  하이퍼파라미터: {'n_estimators': 600, 'max_depth': 12, 'min_samples_split': 4, 'min_samples_leaf': 2, 'class_weight': 'balanced_subsample', 'random_state': 42, 'n_jobs': -1}
  Hard Voting Accuracy (model.predict) : 0.7071
  Soft(확률) 기반 최적 threshold : 0.56
  Soft 방식 최고 Valid Accuracy  : 0.7264

RandomForest 세팅별 결과
RF_basic             | Hard: 0.7200  | Best_thr: 0.46  | Best_acc: 0.7271
RF_balanced_depth8   | Hard: 0.6879  | Best_thr: 0.65  | Best_acc:

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True
