In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
import lightgbm as lgb


In [2]:
file_path=r"C:\Users\nyang\Downloads\archive\heart.csv"
df = pd.read_csv(file_path)
X = df.drop("target", axis=1)
y = df["target"]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [3]:
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

print(f"Original number of features: {X.shape[1]}, Reduced features by PCA: {X_pca.shape[1]}")

Original number of features: 13, Reduced features by PCA: 11


In [4]:
# LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1,
    'seed': 42
}

In [5]:
# Cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metrics = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': [], 'AUC': []}

for train_idx, test_idx in skf.split(X_pca, y):
    X_train, X_test = X_pca[train_idx], X_pca[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

    model = lgb.train(
        params,
        train_data,
        num_boost_round=100,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(stopping_rounds=10)]
    )

    y_proba = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred = (y_proba >= 0.5).astype(int)

    metrics['Accuracy'].append(accuracy_score(y_test, y_pred))
    metrics['Precision'].append(precision_score(y_test, y_pred))
    metrics['Recall'].append(recall_score(y_test, y_pred))
    metrics['F1 Score'].append(f1_score(y_test, y_pred))
    metrics['AUC'].append(roc_auc_score(y_test, y_proba))

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[56]	valid_0's auc: 1
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[43]	valid_0's auc: 1
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[63]	valid_0's auc: 1
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[61]	valid_0's auc: 1
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[59]	valid_0's auc: 1


In [13]:
print("Cross-Validated Performance Metrics:")
for metric_name, values in metrics.items():
    print(f"{metric_name}: {np.mean(values):.4f} ± {np.std(values):.4f}")

Cross-Validated Performance Metrics:
Accuracy: 0.9902 ± 0.0062
Precision: 0.9963 ± 0.0074
Recall: 0.9848 ± 0.0143
F1 Score: 0.9904 ± 0.0061
AUC: 1.0000 ± 0.0000
