In [46]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.metrics import mean_squared_error
# Font used for all plot text. Presumably chosen so the Korean column names
# render correctly; NOTE(review): 'Apple SD Gothic Neo' may not be installed
# outside macOS — confirm before running elsewhere.
plt.rcParams['font.family'] = 'Apple SD Gothic Neo'
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
def seed_everything(seed):
    """Seed the global RNGs used by this notebook.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed`` and
            written (as a string) to the ``PYTHONHASHSEED`` environment
            variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(42) # Fix the seed
# Read the data with pd.read_csv().
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [47]:
import numpy as np
from sklearn.metrics import (
    log_loss,
    accuracy_score,
    classification_report,
    mean_squared_error,
    r2_score,
)

def evaluate_model(model, X_valid, y_valid, name="model", verbose=True):
    """Score a fitted model on a hold-out split and optionally print a report.

    R², MSE and RMSE are always computed from ``model.predict(X_valid)``.
    Log loss is computed when the model exposes ``predict_proba``; accuracy
    and the classification report are computed when the classification
    metrics accept the target (they fail for continuous targets).

    Args:
        model: Fitted estimator with ``predict`` and, optionally,
            ``predict_proba``.
        X_valid: Features the model is evaluated on.
        y_valid: Ground-truth targets aligned row-by-row with ``X_valid``.
        name: Label used in the printed report.
        verbose: When True, print the metrics; when False, stay silent.

    Returns:
        The accuracy score, or ``None`` when accuracy could not be computed.
    """
    y_pred = model.predict(X_valid)

    # Log loss: probabilities must come from the same rows as y_valid.
    # (Previously this used X_test / X_valid as arguments, which always
    # failed and was hidden by the blanket except.)
    loss = None
    loss_error = None
    if hasattr(model, "predict_proba"):
        try:
            y_proba = model.predict_proba(X_valid)
            # Pass the model's class order so the probability columns stay
            # aligned even if y_valid happens to miss a class.
            loss = log_loss(
                y_valid, y_proba, labels=getattr(model, "classes_", None)
            )
        except Exception as exc:
            # Best effort: keep evaluating, but remember why it failed.
            loss_error = exc

    # R² (coefficient of determination)
    r2 = r2_score(y_valid, y_pred)

    # Accuracy / classification report
    acc = None
    cls_report = None
    try:
        acc = accuracy_score(y_valid, y_pred)
        cls_report = classification_report(y_valid, y_pred)
    except Exception:
        # Regression models (continuous target) end up here.
        pass

    # MSE / RMSE
    mse = mean_squared_error(y_valid, y_pred)
    rmse = np.sqrt(mse)

    # ------------ report (only when verbose=True) ------------
    if verbose:
        print("=" * 40)
        print(f"[{name}] 모델 성능 평가")
        print("=" * 40)

        if loss is not None:
            print("Log Loss:", loss)
        elif loss_error is not None:
            print("Log Loss: unavailable", repr(loss_error))

        print(f"{name:15s} R score(): {r2:.4f}")

        if acc is not None:
            print("정확도:", acc)
            if cls_report is not None:
                print(cls_report)

        print("-" * 40)
        print(f"MSE (평균 제곱 오차): {mse:.3f}")
        print(f"RMSE (평균 제곱근 오차): {rmse:.3f}")
        print(f"R² Score (결정계수): {r2:.3f}")
        print("=" * 40)

    # Key result: the accuracy (None when it could not be computed).
    return acc

In [48]:
# Rename the `혈압` column to `맥압` in both frames (in place).
for _frame in (train, test):
    _frame.rename(columns={'혈압': '맥압'}, inplace=True)

---

In [67]:
# Cap extreme values in the training features. The thresholds are the same
# ones the notebook applies to `test`; the `간 효소율` cap was previously
# applied to `test` only, which made the two frames inconsistent.
_upper_caps = [
    ('시력', 2.0),
    ('혈청 크레아티닌', 1.5),
    ('요 단백', 4.0),
    ('저밀도지단백', 150),
    ('고밀도지단백', 100),
    ('중성 지방', 200),
    ('공복 혈당', 200),
    ('간 효소율', 10),
]
_lower_caps = [
    ('저밀도지단백', 22),
    ('고밀도지단백', 22),
]

# Values above the cap are replaced by the cap itself.
for _col, _cap in _upper_caps:
    train.loc[train[_col] > _cap, _col] = _cap

# Values below the floor are replaced by the floor itself.
for _col, _floor in _lower_caps:
    train.loc[train[_col] < _floor, _col] = _floor


In [68]:
# Cap extreme values in the test features: (column, upper bound) pairs first,
# then (column, lower bound) pairs.
_upper_caps = [
    ('시력', 2.0),
    ('혈청 크레아티닌', 1.5),
    ('요 단백', 4.0),
    ('저밀도지단백', 150),
    ('고밀도지단백', 100),
    ('중성 지방', 200),
    ('공복 혈당', 200),
    ('간 효소율', 10),
]
_lower_caps = [
    ('저밀도지단백', 22),
    ('고밀도지단백', 22),
]

for _col, _cap in _upper_caps:
    test.loc[test[_col] > _cap, _col] = _cap

for _col, _floor in _lower_caps:
    test.loc[test[_col] < _floor, _col] = _floor


-----

In [70]:
# Engineered features for the training frame: three lipid ratios, a BMI
# recomputed from weight (kg) and height (cm), and a total built as
# `저밀도지단백` + `고밀도지단백` + `중성 지방` / 5.
train['저밀도지단백_고밀도지단백_낮음'] = train['저밀도지단백'].div(train['고밀도지단백'])
train['고밀도지단백_저밀도지단백_높음'] = train['고밀도지단백'].div(train['저밀도지단백'])
train['중성지방_고밀도지단백_2이하'] = train['중성 지방'].div(train['고밀도지단백'])
train['true_BMI'] = train['몸무게(kg)'].div(train['키(cm)'].div(100).pow(2))
train['총 콜레스테롤'] = (
    train['저밀도지단백']
    .add(train['고밀도지단백'])
    .add(train['중성 지방'].div(5))
)



In [52]:
# Remove the provided `BMI` and `콜레스테롤` columns from the training frame
# (presumably superseded by `true_BMI` / `총 콜레스테롤` — confirm).
# NOTE: in-place drop; re-running this cell raises KeyError once they are gone.
train.drop(columns=['BMI','콜레스테롤'], inplace=True)

In [69]:
# Engineered features for the test frame, built with the same formulas as
# the training frame: three lipid ratios, a BMI recomputed from weight (kg)
# and height (cm), and `저밀도지단백` + `고밀도지단백` + `중성 지방` / 5.
test['저밀도지단백_고밀도지단백_낮음'] = test['저밀도지단백'].div(test['고밀도지단백'])
test['고밀도지단백_저밀도지단백_높음'] = test['고밀도지단백'].div(test['저밀도지단백'])
test['중성지방_고밀도지단백_2이하'] = test['중성 지방'].div(test['고밀도지단백'])
test['true_BMI'] = test['몸무게(kg)'].div(test['키(cm)'].div(100).pow(2))
test['총 콜레스테롤'] = (
    test['저밀도지단백']
    .add(test['고밀도지단백'])
    .add(test['중성 지방'].div(5))
)


In [54]:
# Remove the provided `BMI` and `콜레스테롤` columns from the test frame,
# mirroring the drop applied to `train`.
# NOTE: in-place drop; re-running this cell raises KeyError once they are gone.
test.drop(columns=['BMI','콜레스테롤'], inplace=True)

----

In [71]:
# Summary statistics of the training frame, to sanity-check the value caps
# and the engineered columns.
train.describe()

Unnamed: 0,나이,키(cm),몸무게(kg),시력,충치,공복 혈당,맥압,중성 지방,혈청 크레아티닌,고밀도지단백,저밀도지단백,헤모글로빈,요 단백,간 효소율,label,저밀도지단백_고밀도지단백_낮음,고밀도지단백_저밀도지단백_높음,중성지방_고밀도지단백_2이하,true_BMI,총 콜레스테롤
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0
mean,43.973571,164.781429,65.932857,0.996271,0.227429,98.911714,45.532857,117.096714,0.881486,57.257857,111.802714,14.631914,1.082429,1.144696,0.367286,2.076071,0.560865,2.282204,24.144971,192.479914
std,12.063793,9.170213,12.978702,0.306997,0.419202,18.103501,8.820611,52.111481,0.188655,14.138728,28.277407,1.540907,0.379395,0.432735,0.4821,0.740636,0.267895,1.373607,3.501699,30.732552
min,20.0,135.0,30.0,0.1,0.0,57.0,14.0,21.0,0.1,22.0,22.0,4.9,1.0,0.14,0.0,0.22,0.192,0.239583,14.268728,85.2
25%,35.0,160.0,55.0,0.8,0.0,89.0,40.0,74.0,0.8,47.0,92.0,13.6,1.0,0.84,0.0,1.531915,0.390883,1.181818,21.604938,171.0
50%,40.0,165.0,65.0,1.0,0.0,96.0,45.0,107.0,0.9,55.0,113.0,14.8,1.0,1.1,0.0,2.027027,0.493333,1.916667,23.875115,193.0
75%,50.0,170.0,75.0,1.2,0.0,104.0,50.0,161.0,1.0,66.0,136.0,15.7,1.0,1.38,1.0,2.558311,0.652778,3.174603,26.122449,216.4
max,85.0,190.0,130.0,2.0,1.0,200.0,91.0,200.0,1.5,100.0,150.0,20.9,4.0,5.67,1.0,5.208333,4.545455,8.0,42.44898,278.6


In [72]:
# Summary statistics of the test frame, for comparison with the training frame.
test.describe()

Unnamed: 0,나이,키(cm),몸무게(kg),시력,충치,공복 혈당,맥압,중성 지방,혈청 크레아티닌,고밀도지단백,저밀도지단백,헤모글로빈,요 단백,간 효소율,저밀도지단백_고밀도지단백_낮음,고밀도지단백_저밀도지단백_높음,중성지방_고밀도지단백_2이하,true_BMI,총 콜레스테롤
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,44.59,164.366667,65.443333,0.994267,0.210667,98.766333,45.340667,118.013333,0.8867,57.432,111.856333,14.614633,1.087,1.14866,2.076167,0.5644,2.290381,24.10448,192.891
std,12.328121,9.109146,12.500222,0.301674,0.40785,17.70735,9.021072,51.107884,0.188799,14.281995,28.249467,1.576051,0.389206,0.440844,0.745633,0.276559,1.346702,3.413727,30.148936
min,20.0,130.0,40.0,0.1,0.0,48.0,19.0,20.0,0.4,22.0,22.0,6.8,1.0,0.16,0.32,0.146667,0.234043,15.570934,92.2
25%,40.0,155.0,55.0,0.8,0.0,89.0,40.0,76.0,0.8,47.0,91.0,13.6,1.0,0.85,1.515507,0.388889,1.209497,21.484375,171.6
50%,40.0,165.0,65.0,1.0,0.0,96.0,45.0,108.0,0.9,55.0,113.0,14.8,1.0,1.11,2.033343,0.491801,1.94,23.875115,193.4
75%,55.0,170.0,75.0,1.2,0.0,104.0,50.0,160.25,1.0,66.0,136.0,15.7,1.0,1.38,2.571429,0.659845,3.168269,25.951557,217.0
max,85.0,190.0,120.0,2.0,1.0,200.0,96.0,200.0,1.5,100.0,150.0,19.3,4.0,10.0,6.818182,3.125,7.692308,39.183673,272.0


In [56]:
# Column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                7000 non-null   object 
 1   나이                7000 non-null   int64  
 2   키(cm)             7000 non-null   int64  
 3   몸무게(kg)           7000 non-null   int64  
 4   시력                7000 non-null   float64
 5   충치                7000 non-null   int64  
 6   공복 혈당             7000 non-null   int64  
 7   맥압                7000 non-null   int64  
 8   중성 지방             7000 non-null   int64  
 9   혈청 크레아티닌          7000 non-null   float64
 10  고밀도지단백            7000 non-null   int64  
 11  저밀도지단백            7000 non-null   int64  
 12  헤모글로빈             7000 non-null   float64
 13  요 단백              7000 non-null   int64  
 14  간 효소율             7000 non-null   float64
 15  label             7000 non-null   int64  
 16  저밀도지단백_고밀도지단백_낮음  7000 non-null   float64


In [57]:
# Column names, non-null counts and dtypes of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                3000 non-null   object 
 1   나이                3000 non-null   int64  
 2   키(cm)             3000 non-null   int64  
 3   몸무게(kg)           3000 non-null   int64  
 4   시력                3000 non-null   float64
 5   충치                3000 non-null   int64  
 6   공복 혈당             3000 non-null   int64  
 7   맥압                3000 non-null   int64  
 8   중성 지방             3000 non-null   int64  
 9   혈청 크레아티닌          3000 non-null   float64
 10  고밀도지단백            3000 non-null   int64  
 11  저밀도지단백            3000 non-null   int64  
 12  헤모글로빈             3000 non-null   float64
 13  요 단백              3000 non-null   int64  
 14  간 효소율             3000 non-null   float64
 15  저밀도지단백_고밀도지단백_낮음  3000 non-null   float64
 16  고밀도지단백_저밀도지단백_높음  3000 non-null   float64


In [151]:
# Target vector and feature matrices: `label` is the target, and the `ID`
# column is left out of the features in both frames.
y = train['label']
X = train.drop(columns=['label', 'ID'])
X_test = test.drop(columns=['ID'])

# Hold out 20% of the training rows for validation, stratified on the label.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X

Unnamed: 0,나이,키(cm),몸무게(kg),시력,충치,공복 혈당,맥압,중성 지방,혈청 크레아티닌,고밀도지단백,저밀도지단백,헤모글로빈,요 단백,간 효소율,저밀도지단백_고밀도지단백_낮음,고밀도지단백_저밀도지단백_높음,중성지방_고밀도지단백_2이하,true_BMI,총 콜레스테롤
0,35,170,70,1.10,1,98,40,80,1.3,75,120,15.9,1,1.53,1.600000,0.625000,1.066667,24.221453,211.0
1,40,150,55,1.00,0,173,39,104,0.6,46,150,11.8,1,1.45,3.260870,0.306667,2.260870,24.444444,216.8
2,60,170,50,0.75,0,96,40,61,0.8,43,89,15.3,1,1.04,2.069767,0.483146,1.418605,17.301038,144.2
3,40,150,45,0.50,0,92,40,46,0.7,66,110,13.4,1,1.18,1.666667,0.600000,0.696970,20.000000,185.2
4,55,155,65,1.00,0,87,42,95,0.9,62,150,13.8,1,1.32,2.419355,0.413333,1.532258,27.055151,231.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,25,170,65,1.50,0,87,45,141,1.2,44,112,14.9,1,1.50,2.545455,0.392857,3.204545,22.491349,184.2
6996,60,165,65,0.90,0,87,45,82,0.9,64,103,14.3,1,1.47,1.609375,0.621359,1.281250,23.875115,183.4
6997,40,180,100,1.20,0,97,44,87,0.9,54,107,15.6,1,1.00,1.981481,0.504673,1.611111,30.864198,178.4
6998,60,150,55,0.60,0,89,57,161,0.6,49,76,14.4,1,1.00,1.551020,0.644737,3.285714,24.444444,157.2


In [160]:
# Search space for the random forest: 8 * 7 * 3 = 168 combinations
# (class_weight and random_state each have a single value).
param_grid = dict(
    n_estimators=[50, 60, 70, 80, 90, 100, 200, 300],
    max_depth=[8, 10, 12, 20, 30, 40, 50],
    min_samples_leaf=[1, 3, 5],
    class_weight=["balanced"],
    random_state=[42],
)
# Base estimator; its hyper-parameters are supplied by the grid above.
rf = RandomForestClassifier()

In [168]:
# Grid search over `param_grid`, ranked by accuracy, with n_jobs=-1 for
# parallel fitting. `cv` is not set, so the library default applies.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
)

In [169]:
# Run the search on the training split only; the validation split stays
# untouched for the hold-out check. This fits every combination in
# `param_grid` once per CV fold, so it is the slow cell of the notebook.
grid.fit(X_train, y_train)

In [174]:
# Report the winning configuration and its mean cross-validated score.
print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

best_model = grid.best_estimator_

# Predict the hold-out rows once and reuse the result for both metrics
# (previously `predict` was called twice on the same data).
valid_pred = best_model.predict(X_valid)

# NOTE: despite the "Test" wording, this accuracy is measured on the
# validation split (X_valid / y_valid), not on the test set.
test_acc = accuracy_score(y_valid, valid_pred)
print("Test Accuracy:", test_acc)
print(classification_report(y_valid, valid_pred))


Best params: {'class_weight': 'balanced', 'max_depth': 40, 'min_samples_leaf': 3, 'n_estimators': 70, 'random_state': 42}
Best CV score: 0.7303571428571429
Test Accuracy: 0.7471428571428571
              precision    recall  f1-score   support

           0       0.84      0.74      0.79       886
           1       0.63      0.76      0.69       514

    accuracy                           0.75      1400
   macro avg       0.74      0.75      0.74      1400
weighted avg       0.76      0.75      0.75      1400



In [None]:
# Display the test feature matrix to check that its columns match `X`.
X_test


Unnamed: 0,나이,키(cm),몸무게(kg),시력,충치,공복 혈당,맥압,중성 지방,혈청 크레아티닌,고밀도지단백,저밀도지단백,헤모글로빈,요 단백,간 효소율,저밀도지단백_고밀도지단백_낮음,고밀도지단백_저밀도지단백_높음,중성지방_고밀도지단백_2이하,true_BMI,총 콜레스테롤
0,40,165,55,0.90,1,98,47,75,0.5,59,150,13.7,1,1.73,2.542373,0.393333,1.271186,20.202020,224.0
1,65,145,50,0.50,0,99,59,98,0.6,65,115,12.2,1,1.00,1.769231,0.565217,1.507692,23.781213,199.6
2,40,160,75,1.00,0,105,34,200,0.6,50,73,15.1,1,1.33,1.460000,0.684932,4.000000,29.296875,163.0
3,30,180,90,1.35,0,78,45,200,0.9,55,98,15.2,1,0.89,1.781818,0.561224,3.636364,27.777778,193.0
4,50,155,55,0.75,0,116,67,139,1.0,66,136,15.0,1,0.85,2.060606,0.485294,2.106061,22.892820,229.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,40,170,90,0.85,0,94,60,146,1.1,55,137,14.3,1,0.72,2.490909,0.401460,2.654545,31.141869,221.2
2996,25,175,80,1.00,0,95,50,100,0.8,47,100,13.9,1,0.83,2.127660,0.470000,2.127660,26.122449,167.0
2997,45,155,60,1.20,1,88,32,51,0.4,61,114,12.8,1,1.22,1.868852,0.535088,0.836066,24.973985,185.2
2998,35,175,90,1.35,0,94,42,142,0.8,44,91,16.3,1,0.78,2.068182,0.483516,3.227273,29.387755,163.4


In [177]:
# Fill the submission template with the tuned model's predictions.
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv — confirm.
submit = pd.read_csv('sample_submission.csv')
pred = best_model.predict(X_test)
submit['label'] = pred
submit.head(15)


Unnamed: 0,ID,label
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,0
5,TEST_0005,1
6,TEST_0006,1
7,TEST_0007,0
8,TEST_0008,1
9,TEST_0009,0


In [178]:
# Write the submission file without the DataFrame index column.
submit.to_csv('submission.csv', index = False)