In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier # Another diverse base model
from sklearn.tree import DecisionTreeClassifier # Another base model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Load the Dataset ---

In [2]:
# Read the data that is inside of the CSV
df = pd.read_csv("./Health_Data/cleaned_health.csv")
df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3
0,52,1,125,212,0,1,168,0,1.0,2,2,3,0,1,0,0,0
1,53,1,140,203,1,0,155,1,3.1,0,0,3,0,1,0,0,0
2,70,1,145,174,0,1,125,1,2.6,0,0,3,0,1,0,0,0
3,61,1,148,203,0,1,161,0,0.0,2,1,3,0,1,0,0,0
4,62,0,138,294,1,1,106,0,1.9,1,3,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,68,0,120,211,0,0,115,0,1.5,1,0,2,1,0,0,1,0
298,44,0,108,141,0,1,175,0,0.6,1,0,2,1,0,0,1,0
299,52,1,128,255,0,1,161,1,0.0,2,1,3,0,1,0,0,0
300,59,1,160,273,0,0,125,0,0.0,2,0,2,0,0,0,0,1


In [3]:
# Define original features and target columns
ORIGINAL_FEATURES = [
    'age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'cp_0', 'cp_1', 'cp_2', 'cp_3'
]
TARGET_COL = 'target' # The column indicating disease presence (0 or 1)

# Define numerical columns for imputation and potential engineering
NUMERICAL_COLS_FOR_PROCESSING = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# --- 2. Data Preprocessing (Ensuring Cleanliness and Correct Types) ---

In [4]:
print("\n--- Data Preprocessing for Ensembling ---")

# Handle '?' or other non-numeric values if they exist, converting to NaN first
df.replace('?', np.nan, inplace=True)
df.replace('N/A', np.nan, inplace=True)

# Convert all relevant columns to numeric, coercing errors
all_relevant_cols_initial = ORIGINAL_FEATURES + [TARGET_COL]
for col in all_relevant_cols_initial:
    if col in df.columns:
        if col in NUMERICAL_COLS_FOR_PROCESSING:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else: # Categorical/binary features including target
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64') # Use nullable integer

# Impute missing values after type conversion
print("Missing values before imputation:")
print(df[all_relevant_cols_initial].isnull().sum()[df[all_relevant_cols_initial].isnull().sum() > 0])

for col in all_relevant_cols_initial:
    if col in df.columns and df[col].isnull().any():
        if col in NUMERICAL_COLS_FOR_PROCESSING:
            median_val = df[col].median()
            df[col].fillna(median_val, inplace=True)
            print(f"Filled missing values in '{col}' with its median ({median_val}).")
        else: # Categorical/binary features including target
            mode_val = df[col].mode()[0]
            df[col].fillna(mode_val, inplace=True)
            print(f"Filled missing values in '{col}' with its mode ({mode_val}).")

print("\nMissing values after imputation:")
print(df[all_relevant_cols_initial].isnull().sum())


--- Data Preprocessing for Ensembling ---
Missing values before imputation:
Series([], dtype: int64)

Missing values after imputation:
age         0
sex         0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
cp_0        0
cp_1        0
cp_2        0
cp_3        0
target      0
dtype: int64


# --- 3. Advanced Feature Engineering (Re-applying from previous step) ---

In [5]:
print("\n--- Re-applying Advanced Feature Engineering ---")

# Create a copy of the DataFrame to add new features
df_engineered = df.copy()


--- Re-applying Advanced Feature Engineering ---


# 3.1. Polynomial Features

In [6]:
poly = PolynomialFeatures(degree=2, include_bias=False)
cols_for_poly = ['age', 'trestbps', 'chol']
actual_cols_for_poly = [col for col in cols_for_poly if col in df_engineered.columns]

if actual_cols_for_poly:
    poly_features = poly.fit_transform(df_engineered[actual_cols_for_poly])
    poly_feature_names = poly.get_feature_names_out(actual_cols_for_poly)
    poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_engineered.index)

    # Drop original columns from poly_df to avoid duplication
    cols_to_drop_from_poly_df = [col for col in actual_cols_for_poly if col in poly_df.columns]
    poly_df = poly_df.drop(columns=cols_to_drop_from_poly_df, errors='ignore')

    df_engineered = pd.concat([df_engineered, poly_df], axis=1)
    print(f"Added polynomial features for: {actual_cols_for_poly}")
else:
    print("Skipping polynomial features: None of the specified columns found.")

Added polynomial features for: ['age', 'trestbps', 'chol']


# 3.2. Interaction Features (Manual)

In [7]:
if 'age' in df_engineered.columns and 'chol' in df_engineered.columns:
    df_engineered['age_x_chol'] = (df_engineered['age'] * df_engineered['chol']).squeeze()
    print("Added interaction feature: 'age_x_chol'")
if 'thalach' in df_engineered.columns and 'exang' in df_engineered.columns:
    df_engineered['thalach_x_exang'] = (df_engineered['thalach'] * df_engineered['exang']).squeeze()
    print("Added interaction feature: 'thalach_x_exang'")

Added interaction feature: 'age_x_chol'
Added interaction feature: 'thalach_x_exang'


# 3.3. Binning / Discretization

In [8]:
if 'age' in df_engineered.columns:
    age_bins = [0, 40, 50, 60, 70, df_engineered['age'].max() + 1]
    age_labels = ['<40', '40-49', '50-59', '60-69', '70+']
    df_engineered['age_group'] = pd.cut(df_engineered['age'], bins=age_bins, labels=age_labels, right=False)
    # Convert 'age_group' to numerical for modeling if needed, e.g., using LabelEncoder or one-hot encoding
    # For now, we'll keep it as categorical and handle it in feature selection for X
    print("Added binned feature: 'age_group'")

Added binned feature: 'age_group'


# 3.4. Ratio Features

In [9]:
if 'chol' in df_engineered.columns and 'trestbps' in df_engineered.columns:
    df_engineered['chol_to_trestbps_ratio'] = np.where(
        df_engineered['trestbps'] != 0,
        df_engineered['chol'] / df_engineered['trestbps'],
        0
    ).squeeze()
    print("Added ratio feature: 'chol_to_trestbps_ratio'")

Added ratio feature: 'chol_to_trestbps_ratio'


# 3.5. Combining One-Hot Encoded 'cp' into a single 'cp_type' categorical feature

In [10]:
cp_cols = ['cp_0', 'cp_1', 'cp_2', 'cp_3']
actual_cp_cols = [col for col in cp_cols if col in df_engineered.columns]
if len(actual_cp_cols) == 4:
    df_engineered['cp_type'] = df_engineered[actual_cp_cols].idxmax(axis=1)
    df_engineered['cp_type'] = df_engineered['cp_type'].str.replace('cp_', '').astype(int)
    print("Combined one-hot encoded 'cp' into 'cp_type' categorical feature.")
else:
    print("Skipping 'cp_type' combination: Not all one-hot encoded 'cp' columns found.")

# Update FEATURES list to include new engineered features for modeling
# Ensure these new features are handled correctly (e.g., age_group will need encoding)
ENGINEERED_FEATURES = [
    col for col in df_engineered.columns
    if col not in df.columns and col != TARGET_COL # Exclude target and original columns
]
# For 'age_group', we need to convert it to numerical. Let's one-hot encode it for simplicity.
if 'age_group' in df_engineered.columns:
    df_engineered = pd.get_dummies(df_engineered, columns=['age_group'], prefix='age_group')
    # Update ENGINEERED_FEATURES to include the new one-hot encoded age_group columns
    ENGINEERED_FEATURES = [col for col in df_engineered.columns if col.startswith('age_group_')] + \
                          [f for f in ENGINEERED_FEATURES if not f.startswith('age_group')]

# Final list of features for X
FINAL_FEATURES = [col for col in ORIGINAL_FEATURES if col in df_engineered.columns] + ENGINEERED_FEATURES
# Remove original cp_0, cp_1, cp_2, cp_3 if cp_type is used
if 'cp_type' in FINAL_FEATURES:
    FINAL_FEATURES = [f for f in FINAL_FEATURES if f not in cp_cols]

# Separate features (X) and target (y)
X = df_engineered[FINAL_FEATURES]
y = df_engineered[TARGET_COL]

# Check if target variable has only two unique values (binary classification)
if y.nunique() != 2:
    print(f"Error: The target column '{TARGET_COL}' is not binary. It has {y.nunique()} unique values: {y.unique()}")
    print("Please ensure your 'target' column is binary (e.g., 0 and 1) for classification.")
    exit()

print(f"\nFinal Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts(normalize=True).round(2)}")
print("\nFinal features used for modeling:")
print(X.columns.tolist())

Combined one-hot encoded 'cp' into 'cp_type' categorical feature.

Final Features (X) shape: (302, 27)
Target (y) shape: (302,)
Target distribution:
1    0.54
0    0.46
Name: target, dtype: Float64

Final features used for modeling:
['age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'age_group_<40', 'age_group_40-49', 'age_group_50-59', 'age_group_60-69', 'age_group_70+', 'age^2', 'age trestbps', 'age chol', 'trestbps^2', 'trestbps chol', 'chol^2', 'age_x_chol', 'thalach_x_exang', 'chol_to_trestbps_ratio', 'cp_type']


# --- 4. Split Data into Training and Testing Sets ---

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
print(f"\nTraining set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")


Training set shape: (226, 27), (226,)
Testing set shape: (76, 27), (76,)


# --- 5. Feature Scaling (for numerical features) ---

In [12]:
scaler = StandardScaler()
# Identify numerical features from the FINAL_FEATURES list that need scaling
# These are the original numerical columns plus any new numerical interaction/ratio features
numerical_features_to_scale = [col for col in NUMERICAL_COLS_FOR_PROCESSING if col in X_train.columns] + \
                              ['age_x_chol', 'thalach_x_exang', 'chol_to_trestbps_ratio']
numerical_features_to_scale = [f for f in numerical_features_to_scale if f in X_train.columns]

if numerical_features_to_scale:
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[numerical_features_to_scale] = scaler.fit_transform(X_train[numerical_features_to_scale])
    X_test_scaled[numerical_features_to_scale] = scaler.transform(X_test[numerical_features_to_scale])
    print(f"Scaled numerical features: {numerical_features_to_scale}")
else:
    print("No numerical features found for scaling. Using original X_train/X_test.")
    X_train_scaled = X_train
    X_test_scaled = X_test

print("\nFirst 5 rows of scaled training features:")
print(X_train_scaled.head())

Scaled numerical features: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'age_x_chol', 'thalach_x_exang', 'chol_to_trestbps_ratio']

First 5 rows of scaled training features:
         age  sex  trestbps      chol  fbs  restecg   thalach  exang  \
18  0.410557    1  0.515901 -0.676764    1        0  0.611286      0   
90 -0.246722    1 -0.804933 -1.153240    0        0  1.717907      0   
75 -1.123095    1 -0.684857 -0.505233    0        1  0.832610      0   
8  -0.904002    1 -0.684857  0.047479    0        0 -0.318276      0   
28  0.081918    0  2.917417  1.534084    0        2 -1.513427      1   

     oldpeak  slope  ...   age^2  age trestbps  age chol  trestbps^2  \
18 -0.851400      2  ...  3364.0        8120.0   12238.0     19600.0   
90 -0.851400      1  ...  2704.0        6136.0    9672.0     13924.0   
75 -0.851400      2  ...  1936.0        5280.0    9680.0     14400.0   
8  -0.149150      2  ...  2116.0        5520.0   11454.0     14400.0   
28  2.133161      1  ...  30

# --- 6. Model Ensembling ---

# --- 6.1. Define Base Estimators ---

In [13]:
print("\n--- Defining Base Estimators for Ensembling ---")
# Use diverse models for better ensemble performance
clf1 = LogisticRegression(random_state=42, solver='liblinear', C=0.1) # Example best C from tuning
clf2 = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10, class_weight='balanced') # Example best params
clf3 = KNeighborsClassifier(n_neighbors=5) # K-Nearest Neighbors
clf4 = DecisionTreeClassifier(random_state=42, max_depth=5) # Decision Tree

estimators = [
    ('lr', clf1),
    ('rf', clf2),
    ('knn', clf3),
    ('dt', clf4)
]


--- Defining Base Estimators for Ensembling ---


# --- 6.2. Voting Classifier ---

In [14]:
print("\n--- Training and Evaluating Voting Classifier ---")
# 'soft' voting uses predicted probabilities, which is generally better if models are well-calibrated
voting_clf = VotingClassifier(estimators=estimators, voting='soft', weights=[0.25, 0.4, 0.15, 0.2]) # Assign weights (sum to 1)
voting_clf.fit(X_train_scaled, y_train)
y_pred_vote = voting_clf.predict(X_test_scaled)
y_proba_vote = voting_clf.predict_proba(X_test_scaled)[:, 1]

print("\nVoting Classifier Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_vote):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_vote):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_vote):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_vote):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba_vote):.4f}")
print("\nConfusion Matrix (Voting Classifier):")
print(confusion_matrix(y_test, y_pred_vote))
print("\nClassification Report (Voting Classifier):")
print(classification_report(y_test, y_pred_vote))


--- Training and Evaluating Voting Classifier ---

Voting Classifier Performance:
Accuracy: 0.8289
Precision: 0.8333
Recall: 0.8537
F1-Score: 0.8434
ROC AUC Score: 0.8732

Confusion Matrix (Voting Classifier):
[[28  7]
 [ 6 35]]

Classification Report (Voting Classifier):
              precision    recall  f1-score   support

         0.0       0.82      0.80      0.81        35
         1.0       0.83      0.85      0.84        41

    accuracy                           0.83        76
   macro avg       0.83      0.83      0.83        76
weighted avg       0.83      0.83      0.83        76



# --- 6.3. Stacking Classifier ---

In [15]:
print("\n--- Training and Evaluating Stacking Classifier ---")
# Base estimators will make predictions, and a final estimator will learn from those predictions
# The final estimator can be any classifier, often a simple one like Logistic Regression
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=42, solver='liblinear'),
    cv=5, # Cross-validation for training the final estimator
    n_jobs=-1 # Use all available cores
)

stacking_clf.fit(X_train_scaled, y_train)
y_pred_stack = stacking_clf.predict(X_test_scaled)
y_proba_stack = stacking_clf.predict_proba(X_test_scaled)[:, 1]

print("\nStacking Classifier Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_stack):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_stack):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_stack):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_stack):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba_stack):.4f}")
print("\nConfusion Matrix (Stacking Classifier):")
print(confusion_matrix(y_test, y_pred_stack))
print("\nClassification Report (Stacking Classifier):")
print(classification_report(y_test, y_pred_stack))


--- Training and Evaluating Stacking Classifier ---

Stacking Classifier Performance:
Accuracy: 0.8158
Precision: 0.8140
Recall: 0.8537
F1-Score: 0.8333
ROC AUC Score: 0.8418

Confusion Matrix (Stacking Classifier):
[[27  8]
 [ 6 35]]

Classification Report (Stacking Classifier):
              precision    recall  f1-score   support

         0.0       0.82      0.77      0.79        35
         1.0       0.81      0.85      0.83        41

    accuracy                           0.82        76
   macro avg       0.82      0.81      0.81        76
weighted avg       0.82      0.82      0.82        76



In [16]:
print("\nModel ensembling and stacking complete. Performance metrics for ensemble models are displayed.")


Model ensembling and stacking complete. Performance metrics for ensemble models are displayed.
