In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import koreanize_matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_predict
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV, Ridge, LogisticRegression, LinearRegression
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, FunctionTransformer, StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
from lightgbm import LGBMClassifier

import optuna

# 성능지표
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
# 1. 데이터 불러오기
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')
# 2. 데이터가 어떻게 생겼는지 5줄만 보기
print("--- 데이터 샘플 ---")
display(train.head())

# 3. 데이터의 통계치 확인 (평균, 최소/최대값 등)
print("\n--- 통계적 특성 ---")
display(train.describe())

--- 데이터 샘플 ---


Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence



--- 통계적 특성 ---


Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
count,630000.0,630000.0,630000.0,630000.0,630000.0,630000.0,630000.0,630000.0,630000.0,630000.0,630000.0,630000.0,630000.0,630000.0
mean,314999.5,54.136706,0.714735,3.312752,130.497433,245.011814,0.079987,0.98166,152.816763,0.273725,0.716028,1.455871,0.45104,4.618873
std,181865.479132,8.256301,0.451541,0.851615,14.975802,33.681581,0.271274,0.998783,19.112927,0.44587,0.948472,0.545192,0.798549,1.950007
min,0.0,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,157499.75,48.0,0.0,3.0,120.0,223.0,0.0,0.0,142.0,0.0,0.0,1.0,0.0,3.0
50%,314999.5,54.0,1.0,4.0,130.0,243.0,0.0,0.0,157.0,0.0,0.1,1.0,0.0,3.0
75%,472499.25,60.0,1.0,4.0,140.0,269.0,0.0,2.0,166.0,1.0,1.4,2.0,1.0,7.0
max,629999.0,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [3]:
# Presence(질환 있음)는 1, Absence(정상)는 0으로 변환
train['target'] = train['Heart Disease'].map({'Presence': 1, 'Absence': 0})

# 1. 이제 다시 특성(칼럼)들 골라내기 실행
features = train.columns.drop(['id', 'Heart Disease', 'target'])

# 2. X, y 설정
X = train[features]
y = train['target']

In [17]:
print(train[['Heart Disease', 'target']].head())

  Heart Disease  target
0      Presence       1
1       Absence       0
2       Absence       0
3       Absence       0
4      Presence       1


In [15]:
# 1. 데이터를 학습용(80%)과 검증용(20%)으로 나누기
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 2. 모델 재학습 (학습용 데이터로만)
model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

# 3. 예측하기
y_pred = model.predict(X_val)            # 0 또는 1로 예측 (정확도용)
y_prob = model.predict_proba(X_val)[:, 1] # 확률로 예측 (ROC-AUC용)

# 4. 점수 확인
print(f"정확도(Accuracy): {accuracy_score(y_val, y_pred):.4f}")
print(f"ROC-AUC 점수: {roc_auc_score(y_val, y_prob):.4f}")

[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
정확도(Accuracy): 0.8895
ROC-AUC 점수: 0.9556


In [16]:
from sklearn.model_selection import cross_val_score

# 5번 시험 봐서 평균 점수 내기
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print(f"교차 검증 AUC 평균: {scores.mean():.4f}")

[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012628 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 419
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 419
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [In

In [4]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 1. 목적 함수 정의 (어떤 수치를 최적화할지 결정)
def objective(trial):
    # 테스트해볼 파라미터들 범위 설정
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'n_estimators': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # 모델 생성
    model = LGBMClassifier(**param)
    
    # 교차 검증으로 점수 계산 (시간 절약을 위해 3-Fold 사용)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    score = cross_val_score(model, X, y, cv=cv, scoring='roc_auc').mean()
    
    return score

# 2. 최적화 실행 (Study 생성)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20) # 우선 20회 정도 시도해봅니다.

print(f"최고 점수: {study.best_value}")
print(f"최적 파라미터: {study.best_params}")

# 3. 최적의 파라미터로 최종 모델 학습
best_model = LGBMClassifier(**study.best_params, n_estimators=2000, random_state=42)
best_model.fit(X, y)

# 4. 테스트 데이터 예측 및 제출 파일 생성
X_test = test[features]
final_preds = best_model.predict_proba(X_test)[:, 1]

submission_optuna = pd.DataFrame({
    'id': test['id'],
    'Heart Disease': final_preds
})
submission_optuna.to_csv('submission_optuna.csv', index=False)
print("Optuna 제출 파일 생성 완료!")

[32m[I 2026-02-13 18:05:25,572][0m A new study created in memory with name: no-name-ad80168e-7379-4eb8-954d-bc45c49bb455[0m
[32m[I 2026-02-13 18:06:02,939][0m Trial 0 finished with value: 0.9551080691993238 and parameters: {'learning_rate': 0.010061159675697694, 'num_leaves': 90, 'max_depth': 11, 'min_child_samples': 29, 'subsample': 0.7805045778792495, 'colsample_bytree': 0.6117062343345896}. Best is trial 0 with value: 0.9551080691993238.[0m
[32m[I 2026-02-13 18:06:20,002][0m Trial 1 finished with value: 0.955076511124603 and parameters: {'learning_rate': 0.04487223484346622, 'num_leaves': 36, 'max_depth': 9, 'min_child_samples': 85, 'subsample': 0.9418825287290286, 'colsample_bytree': 0.7717385597527264}. Best is trial 0 with value: 0.9551080691993238.[0m
[32m[I 2026-02-13 18:06:40,887][0m Trial 2 finished with value: 0.9553672282602151 and parameters: {'learning_rate': 0.043382534828870306, 'num_leaves': 20, 'max_depth': 10, 'min_child_samples': 69, 'subsample': 0.821748

최고 점수: 0.9554018833349085
최적 파라미터: {'learning_rate': 0.037058542404525996, 'num_leaves': 21, 'max_depth': 5, 'min_child_samples': 72, 'subsample': 0.9867247502965943, 'colsample_bytree': 0.5070204688738021}
Optuna 제출 파일 생성 완료!


In [18]:
# 2. 학습 때 사용했던 특성(features) 그대로 테스트 데이터에서 추출
X_test = test[features]

# 3. '확률(Probability)'로 예측하기 (대회 요구사항)
# [:, 1]은 'Presence(질환 있음)'일 확률만 가져오는 것입니다.
test_preds = model.predict_proba(X_test)[:, 1]

# 4. 제출 양식(sample_submission.csv)에 맞춰 파일 만들기
submission = pd.DataFrame({
    'id': test['id'],
    'Heart Disease': test_preds
})

# 5. CSV 파일로 저장
submission.to_csv('my_first_submission.csv', index=False)
print("제출 파일 생성 완료!")




제출 파일 생성 완료!
