In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from imblearn.pipeline import Pipeline # Use imblearn's pipeline for resampling-safe workflows
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Load the Dataset ---

In [2]:
# Load the cleaned health dataset from its CSV file; the bare `df` on the
# last line makes the frame this cell's displayed output.
csv_path = "./Health_Data/cleaned_health.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3
0,52,1,125,212,0,1,168,0,1.0,2,2,3,0,1,0,0,0
1,53,1,140,203,1,0,155,1,3.1,0,0,3,0,1,0,0,0
2,70,1,145,174,0,1,125,1,2.6,0,0,3,0,1,0,0,0
3,61,1,148,203,0,1,161,0,0.0,2,1,3,0,1,0,0,0
4,62,0,138,294,1,1,106,0,1.9,1,3,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,68,0,120,211,0,0,115,0,1.5,1,0,2,1,0,0,1,0
298,44,0,108,141,0,1,175,0,0.6,1,0,2,1,0,0,1,0
299,52,1,128,255,0,1,161,1,0.0,2,1,3,0,1,0,0,0
300,59,1,160,273,0,0,125,0,0.0,2,0,2,0,0,0,0,1


In [3]:
# Name of the label column (0 or 1, indicating disease presence).
TARGET_COL = 'target'

# Every predictor column, listed in the order used by the source table.
ORIGINAL_FEATURES = [
    'age',
    'sex',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'cp_0',
    'cp_1',
    'cp_2',
    'cp_3',
]

# The same predictors split by how preprocessing treats them:
# continuous measurements that get scaled ...
NUMERICAL_FEATURES = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]
# ... and coded / indicator columns that are passed through unchanged.
CATEGORICAL_FEATURES = [
    'sex',
    'cp_0', 'cp_1', 'cp_2', 'cp_3',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]

# --- 2. Data Preprocessing (Ensuring Cleanliness and Correct Types) ---

In [4]:
print("\n--- Data Preprocessing for Cross-Validation ---")

# Normalise textual missing-value placeholders ('?', 'N/A') to NaN before any
# type coercion. The result is assigned back instead of using inplace=True.
df = df.replace(['?', 'N/A'], np.nan)

# Convert all relevant columns to numeric; anything unparseable becomes NaN
# so that it is picked up by the imputation step below.
for col in NUMERICAL_FEATURES + CATEGORICAL_FEATURES + [TARGET_COL]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Impute missing values after type conversion.
# The filled column is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# acts on a temporary Series, emits a FutureWarning on recent pandas and has no
# effect on `df` once Copy-on-Write is enabled, which would silently skip imputation.
# NOTE(review): medians/modes are computed over all rows before any
# train/validation split, so fold statistics leak into the imputed values.
# Consider moving imputation into the model pipeline.
for col in NUMERICAL_FEATURES:
    if col in df.columns and df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())
# NOTE(review): the target is mode-imputed as well, which fabricates labels for
# rows with a missing target; dropping those rows may be preferable - confirm.
for col in CATEGORICAL_FEATURES + [TARGET_COL]:
    if col in df.columns and df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Separate features (X) and target (y)
X = df[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
y = df[TARGET_COL].astype(int)  # Ensure target is an integer type

# Fail loudly on a non-binary target. Raising (rather than calling exit())
# stops "Run All" at this cell with a readable error and keeps the kernel alive.
if y.nunique() != 2:
    raise ValueError(
        f"The target column '{TARGET_COL}' is not binary. "
        f"It has {y.nunique()} unique values: {y.unique()}. "
        "Please ensure your 'target' column is binary (e.g., 0 and 1) for classification."
    )

print(f"\nOriginal Target distribution:\n{y.value_counts(normalize=True).round(2)}")


--- Data Preprocessing for Cross-Validation ---

Original Target distribution:
1    0.54
0    0.46
Name: target, dtype: float64


# --- 3. Create Preprocessing and Model Pipeline ---

In [5]:
# Column-wise preprocessing shared by every cross-validation run below:
# numerical columns are standardised, categorical columns are left as they are.
numeric_step = ('num', StandardScaler(), NUMERICAL_FEATURES)
categorical_step = ('cat', 'passthrough', CATEGORICAL_FEATURES)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

# The single estimator evaluated in this notebook: logistic regression with
# balanced class weights and a fixed seed.
classifier = LogisticRegression(
    random_state=42,
    solver='liblinear',
    class_weight='balanced',
)

# Chain preprocessing and classifier so both are re-fitted inside each fold.
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
)

# --- 4. Cross-Validation with cross_val_score ---

In [6]:
print("\n--- Method 1: Using cross_val_score for quick performance estimate ---")

# Stratified 5-fold splitter: each fold keeps roughly the overall class ratio,
# and shuffling with a fixed seed makes the fold assignment repeatable.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold, each from a pipeline fitted on the other folds.
scores = cross_val_score(
    model_pipeline,
    X,
    y,
    scoring='accuracy',
    cv=cv_strategy,
)

# Summarise the per-fold results.
mean_accuracy = scores.mean()
accuracy_spread = scores.std()

print(f"Scores for each fold: {scores}")
print(f"Mean Accuracy: {mean_accuracy:.4f}")
print(f"Standard Deviation: {accuracy_spread:.4f}")
print("A small standard deviation indicates the model's performance is stable across different data splits.")


--- Method 1: Using cross_val_score for quick performance estimate ---
Scores for each fold: [0.75409836 0.83606557 0.78333333 0.9        0.8       ]
Mean Accuracy: 0.8147
Standard Deviation: 0.0502
A small standard deviation indicates the model's performance is stable across different data splits.


# --- 5. Cross-Validation with cross_val_predict for a full report ---

In [7]:
print("\n--- Method 2: Using cross_val_predict for a full evaluation report ---")

# Out-of-fold class predictions: every row is predicted by the model that was
# fitted on the folds not containing it.
y_pred_cv = cross_val_predict(model_pipeline, X, y, cv=cv_strategy)

# Pool those predictions into one report and one confusion matrix.
report_text = classification_report(y, y_pred_cv)
pooled_confusion = confusion_matrix(y, y_pred_cv)

print("\nComprehensive Classification Report from Cross-Validation:")
print(report_text)
print("Comprehensive Confusion Matrix from Cross-Validation:")
print(pooled_confusion)

# ROC AUC needs scores rather than hard labels, so collect out-of-fold
# probabilities and keep the second column (index 1; presumably class 1).
proba_by_class = cross_val_predict(
    model_pipeline, X, y, cv=cv_strategy, method='predict_proba'
)
y_proba_cv = proba_by_class[:, 1]
pooled_auc = roc_auc_score(y, y_proba_cv)
print(f"\nComprehensive ROC AUC Score: {pooled_auc:.4f}")


--- Method 2: Using cross_val_predict for a full evaluation report ---

Comprehensive Classification Report from Cross-Validation:
              precision    recall  f1-score   support

           0       0.81      0.78      0.79       138
           1       0.82      0.84      0.83       164

    accuracy                           0.81       302
   macro avg       0.81      0.81      0.81       302
weighted avg       0.81      0.81      0.81       302

Comprehensive Confusion Matrix from Cross-Validation:
[[108  30]
 [ 26 138]]

Comprehensive ROC AUC Score: 0.8921


In [8]:
# Closing status line for the notebook.
closing_message = "\nCross-validation complete. The results provide a more reliable estimate of the model's performance."
print(closing_message)


Cross-validation complete. The results provide a more reliable estimate of the model's performance.
