In [4]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.metrics import mean_squared_error
plt.rcParams['font.family'] = 'Apple SD Gothic Neo'
plt.rcParams['axes.unicode_minus'] = False
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정


# pd.read_csv() 함수를 사용해서 데이터를 읽어오는 코드입니다.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [5]:
import numpy as np
from sklearn.metrics import (
    log_loss,
    accuracy_score,
    classification_report,
    mean_squared_error,
    r2_score,
)

def evaluate_model(model, X_valid, y_valid, name="model", verbose=True):
    y_pred = model.predict(X_valid)

    # Log Loss (로스 로스, log_loss)
    loss = None
    if hasattr(model, "predict_proba"):
        try:
            y_proba = model.predict_proba(X_test)
            loss = log_loss(X_valid, y_proba)
        except Exception:
            loss = None

    # R² (결정계수, R-squared)
    r2 = r2_score(y_valid, y_pred)

    # 정확도(Accuracy)
    acc = None
    cls_report = None
    try:
        acc = accuracy_score(y_valid, y_pred)
        cls_report = classification_report(y_valid, y_pred)
    except Exception:
        # 회귀모델(continuous target)일 때는 여기로 옴
        pass

    # MSE / RMSE
    mse = mean_squared_error(y_valid, y_pred)
    rmse = np.sqrt(mse)

    # ------------ 출력 (verbose=True일 때만) ------------
    if verbose:
        print("=" * 40)
        print(f"[{name}] 모델 성능 평가")
        print("=" * 40)

        if loss is not None:
            print("Log Loss:", loss)

        print(f"{name:15s} R score(): {r2:.4f}")

        if acc is not None:
            print("정확도:", acc)
            if cls_report is not None:
                print(cls_report)

        print("-" * 40)
        print(f"MSE (평균 제곱 오차): {mse:.3f}")
        print(f"RMSE (평균 제곱근 오차): {rmse:.3f}")
        print(f"R² Score (결정계수): {r2:.3f}")
        print("=" * 40)

    # 핵심: 정확도(없으면 None)를 리턴
    return acc

In [6]:
# 선형 모델 (Linear Models)
from sklearn.linear_model import LogisticRegression  # 로지스틱 회귀
from sklearn.model_selection import StratifiedKFold
# 거리 기반 (Distance-based)
from sklearn.neighbors import KNeighborsClassifier   # k-최근접 이웃

# 트리 기반 (Tree-based)
from sklearn.tree import DecisionTreeClassifier      # 결정나무
from sklearn.ensemble import RandomForestClassifier  # 랜덤 포레스트
from sklearn.ensemble import GradientBoostingClassifier  # 그래디언트 부스팅

# 서포트 벡터 머신 (SVM)
from sklearn.svm import SVC                          # 서포트 벡터 분류
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
plt.rcParams['font.family'] = 'Apple SD Gothic Neo'
plt.rcParams['axes.unicode_minus'] = False

from sklearn.model_selection import KFold

In [76]:
train.rename(columns={'혈압': '맥압'}, inplace=True)
test.rename(columns={'혈압': '맥압'}, inplace=True)

train.loc[train['시력'] > 2, '시력'] = 2.0
train.loc[train['혈청 크레아티닌'] > 1.5, '혈청 크레아티닌'] = 1.5
train.loc[train['요 단백'] > 4, '요 단백'] = 4.0
train.loc[train['저밀도지단백'] > 150, '저밀도지단백'] = 150
train.loc[train['고밀도지단백'] > 100, '고밀도지단백'] = 100
train.loc[train['중성 지방'] > 200, '중성 지방'] = 200
train.loc[train['공복 혈당'] > 200, '공복 혈당'] = 200

train.loc[train['저밀도지단백'] < 22, '저밀도지단백'] = 22
train.loc[train['고밀도지단백'] < 22, '고밀도지단백'] = 22


test.loc[test['시력'] > 2, '시력'] = 2.0
test.loc[test['혈청 크레아티닌'] > 1.5, '혈청 크레아티닌'] = 1.5
test.loc[test['요 단백'] > 4, '요 단백'] = 4.0
test.loc[test['저밀도지단백'] > 150, '저밀도지단백'] = 150
test.loc[test['고밀도지단백'] > 100, '고밀도지단백'] = 100
test.loc[test['중성 지방'] > 200, '중성 지방'] = 200
test.loc[test['공복 혈당'] > 200, '공복 혈당'] = 200
test.loc[test['간 효소율'] > 10, '간 효소율'] = 10

test.loc[test['저밀도지단백'] < 22, '저밀도지단백'] = 22
test.loc[test['고밀도지단백'] < 22, '고밀도지단백'] = 22


train['저밀도지단백_고밀도지단백_낮음'] = train['저밀도지단백'] / train['고밀도지단백']
train['고밀도지단백_저밀도지단백_높음'] = train['고밀도지단백'] / train['저밀도지단백']
train['중성지방_고밀도지단백_2이하'] = train['중성 지방'] / train['고밀도지단백']
train['true_BMI'] = train['몸무게(kg)'] / ((train['키(cm)'] / 100) ** 2)
train['총 콜레스테롤'] = train['저밀도지단백'] +train['고밀도지단백'] + (train['중성 지방'] / 5)
train['나이_BMI'] = train['나이'] * train['true_BMI']
train['키(m)'] = train['키(cm)'] * 0.01
train['고밀도지단백-저밀도지단백'] = train['고밀도지단백'] - train['저밀도지단백']
train['나이/간 효소율'] = train['나이'] / train['간 효소율']
train['헤모글로빈*나이'] = train['헤모글로빈'] * train['나이']
train["age_bin"] = (train["나이"] // 10) * 10
train['BMI_bin'] = (train['BMI']// 10) * 10
train['맥압/나이'] = train['맥압'] / train['나이']
train['헤모글로빈_키구간'] = train['키(m)'] * train['헤모글로빈']
train["헤모글로빈_나이구간"] = train["age_bin"] * train['헤모글로빈']
train['헤마토크릿'] = train['헤모글로빈'] * 3


test['저밀도지단백_고밀도지단백_낮음'] = test['저밀도지단백'] / test['고밀도지단백']
test['고밀도지단백_저밀도지단백_높음'] = test['고밀도지단백'] / test['저밀도지단백']
test['중성지방_고밀도지단백_2이하'] = test['중성 지방'] / test['고밀도지단백']
test['true_BMI'] = test['몸무게(kg)'] / ((test['키(cm)'] / 100) ** 2)
test['총 콜레스테롤'] = test['저밀도지단백'] +test['고밀도지단백'] + (test['중성 지방'] / 5)
test['나이_BMI'] = test['나이'] * test['true_BMI']
test['키(m)'] = test['키(cm)'] * 0.01
test['고밀도지단백-저밀도지단백'] = test['고밀도지단백'] - test['저밀도지단백']
test['나이/간 효소율'] = test['나이'] / test['간 효소율']
test['헤모글로빈*나이'] = test['헤모글로빈'] * test['나이']
test["age_bin"] = (test["나이"] // 10) * 10
test['BMI_bin'] = (test['BMI']// 10) * 10
test['맥압/나이'] = test['맥압'] / test['나이']
test['헤모글로빈_키구간'] = test['키(m)'] * test['헤모글로빈']
test["헤모글로빈_나이구간"] = test["age_bin"] * test['헤모글로빈']
test['헤마토크릿'] = test['헤모글로빈'] * 3

cols = [
    "나이",
    "키(cm)",
    "몸무게(kg)",
    "시력",
    #"충치",
    "공복 혈당",
    "맥압",
    #"중성 지방",
    "혈청 크레아티닌",
    #"고밀도지단백",
    "저밀도지단백",
    "헤모글로빈",
    "요 단백",
    "간 효소율",
    #"저밀도지단백_고밀도지단백_낮음",
    "고밀도지단백_저밀도지단백_높음",
    "중성지방_고밀도지단백_2이하",
    #"true_BMI",
    "총 콜레스테롤",
    "나이_BMI",
    #"키(m)",
    "고밀도지단백-저밀도지단백",
    "나이/간 효소율",
    #"헤모글로빈*나이",
    '콜레스테롤',
    'BMI',
    'age_bin',
    'BMI_bin',
    #'맥압/나이',
    #'헤모글로빈_키구간'
    #'헤모글로빈_나이구간',
    '헤마토크릿',
    #'평균적혈구혈색농도'
    ]

X = train[cols]
y = train['label']
X_test = test[cols]
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [80]:
X = train.drop(['ID', 'label',], axis = 1)
y = train['label']

x_test = test.drop('ID', axis = 1)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [81]:
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
    
)


In [91]:


param_grids = {
        'GradientBoostingClassifier': {
        'loss':['exponential'],
        'n_estimators':[100],

    }
}
models = [
    ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=42)),
]

results = []

In [92]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.base import clone


results = []
trained_models = {}  # 나중에 test.csv 예측에 사용할 학습 완료 모델 저장

for name, base_model in models:
    print(f"\n====== {name} ======")

    # (A) GridSearchCV 적용
    if name in param_grids:
        grid = GridSearchCV(
            estimator=base_model,
            param_grid=param_grids[name],
            scoring='roc_auc',
            cv=skf,
            n_jobs=-1,
            verbose=1
        )
        grid.fit(X_train, y_train)
       
        print("Best params:", grid.best_params_)
        print("Best CV score (inner):", grid.best_score_)

        model = grid.best_estimator_
        best_params = grid.best_params_
        best_grid_cv_mean = grid.best_score_
    else:
        model = clone(base_model)
        model.fit(X_train, y_train)
        best_params = None
        best_grid_cv_mean = None

    # 학습된 모델 저장 (나중에 test.csv에 사용)
    trained_models[name] = model

    # (B-1) 기본 predict 기준 Train / Valid 성능
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)

    train_acc = accuracy_score(y_train, y_train_pred)
    valid_acc = accuracy_score(y_valid, y_valid_pred)

    print("Base Valid Accuracy:", valid_acc)
    print("Classification Report (Base Predict):")
    print(classification_report(y_valid, y_valid_pred))

    # (B-2) Threshold 튜닝 (predict_proba 사용 가능할 때만)
    best_thr = None
    best_thr_acc = None

    if hasattr(model, "predict_proba"):
        probs_valid = model.predict_proba(X_valid)[:, 1]

        thresholds = np.linspace(0.0, 1.0, 101)
        best_thr = 0.5
        best_thr_acc = 0.0

        for thr in thresholds:
            y_pred_thr = (probs_valid >= thr).astype(int)
            acc = accuracy_score(y_valid, y_pred_thr)
            if acc > best_thr_acc:
                best_thr_acc = acc
                best_thr = thr

        print(f"\n[Threshold Tuning] Best Threshold (by Accuracy): {best_thr:.2f}")
        print(f"[Threshold Tuning] Best Accuracy on Valid: {best_thr_acc:.4f}")

        y_best_pred = (probs_valid >= best_thr).astype(int)
        print("Classification Report (Best Threshold):")
        print(classification_report(y_valid, y_best_pred))
    else:
        print("\n[Threshold Tuning] 이 모델은 predict_proba를 지원하지 않아 threshold 튜닝을 생략했습니다.")

    # (C) 전체 데이터에 대한 Cross-Validation
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    cv_mean = cv_scores.mean()
    cv_std  = cv_scores.std()

    results.append({
        "name": name,
        "train_acc": train_acc,
        "valid_acc": valid_acc,             # 기본 predict 기준
        "cv_mean": cv_mean,
        "cv_std": cv_std,
        "best_params": best_params,
        "best_grid_cv_mean": best_grid_cv_mean,
        "best_thr": best_thr,               # threshold 튜닝이 된 모델만 값 존재
        "best_thr_acc": best_thr_acc,       # 튜닝 후 Valid Accuracy
    })

# (D) 정리 출력 (기본 valid_acc 기준 정렬)
results_sorted = sorted(results, key=lambda x: x["valid_acc"], reverse=True)

for r in results_sorted:
    gap = r["train_acc"] - r["valid_acc"]
    print(
        f"\n[Summary] {r['name']}\n"
        f"  Train Acc: {r['train_acc']:.4f}\n"
        f"  Valid Acc (Base Predict): {r['valid_acc']:.4f}\n"
        f"  Gap(Train-Valid): {gap:.4f}\n"
        f"  CV Mean (outer): {r['cv_mean']:.4f} (+/- {r['cv_std']:.4f})\n"
        f"  Grid Best CV Mean (inner): {r['best_grid_cv_mean']}\n"
        f"  Best Threshold (if tuned): {r['best_thr']}\n"
        f"  Best Threshold Acc: {r['best_thr_acc']}\n"
        f"  Best Params: {r['best_params']}\n"
    )



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best params: {'loss': 'exponential', 'n_estimators': 100}
Best CV score (inner): 0.8060708041052809
Base Valid Accuracy: 0.7378571428571429
Classification Report (Base Predict):
              precision    recall  f1-score   support

           0       0.81      0.77      0.79       886
           1       0.63      0.68      0.66       514

    accuracy                           0.74      1400
   macro avg       0.72      0.73      0.72      1400
weighted avg       0.74      0.74      0.74      1400


[Threshold Tuning] Best Threshold (by Accuracy): 0.47
[Threshold Tuning] Best Accuracy on Valid: 0.7436
Classification Report (Best Threshold):
              precision    recall  f1-score   support

           0       0.83      0.74      0.79       886
           1       0.63      0.74      0.68       514

    accuracy                           0.74      1400
   macro avg       0.73      0.74      0.73      1400
weighted avg      