In [6]:
# 1. Imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE, SelectFromModel, mutual_info_classif
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from scikeras.wrappers import KerasClassifier
import shap
import warnings
warnings.filterwarnings('ignore')




def create_model(meta=None):
    """Build and compile a small feed-forward binary classifier.

    The network has one hidden Dense layer (12 units, ReLU) followed by a
    single sigmoid output unit, compiled with Adam and binary cross-entropy.

    Parameters
    ----------
    meta : dict, optional
        Metadata supplied by SciKeras to model-building functions that
        declare a ``meta`` parameter. When it contains ``"n_features_in_"``
        the input width follows the data actually passed to ``fit`` (for
        example a feature-selected subset). When omitted, the width falls
        back to the global ``X_train``, which must then already exist.

    Returns
    -------
    keras.Sequential
        The compiled model.
    """
    if meta is not None and "n_features_in_" in meta:
        # Prefer the width of the data being fitted over notebook globals,
        # so the builder does not depend on cell execution order.
        n_features = meta["n_features_in_"]
    else:
        # Backward-compatible path for direct calls: create_model()
        n_features = X_train.shape[1]

    model = keras.Sequential([
        layers.Dense(12, input_shape=(n_features,), activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Smoke test: construct the scikit-learn style wrapper around the builder.
# Only the wrapper object is created here; no fit call is made in this cell.
clf = KerasClassifier(
    model=create_model,
    epochs=10,
    batch_size=32,
    verbose=0,
)
print("SciKeras is successfully linked to your project!")






# Load data
df = pd.read_csv('heart_disease_eda_advanced.csv')

# Drop temporary columns used in EDA (outlier flags, projection coordinates,
# cluster labels). errors='ignore' skips any that are absent from the file.
drop_cols = ['iso_outlier', 'dbscan_outlier', 'ae_outlier', 'outlier_any', 'outlier_count',
             'PC1', 'PC2', 'PCA1', 'PCA2', 'PCA3', 'tSNE1', 'tSNE2', 'UMAP1', 'UMAP2',
             'kmeans_cluster', 'gmm_cluster', 'spectral_cluster']
df = df.drop(columns=drop_cols, errors='ignore')

# Separate features and target
X = df.drop('target', axis=1)
y = df['target'].astype(int)

# Encode categorical variables (except target).
# read_csv yields text columns as `object`, never `category`, so selecting only
# 'category' would silently skip every string column. Select both dtypes.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
# Convert to integer codes for modeling. pd.factorize assigns codes in order of
# first appearance (missing values become -1), matching the encoding used in
# the feature-selection step.
for col in categorical_cols:
    X[col] = pd.factorize(X[col])[0]

# Split (stratified on the target so both sets keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


SciKeras is successfully linked to your project!


In [9]:
# 2. Feature Selection 



# 1. Identify all non-numeric columns (strings and categories)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 2. Convert them to integer codes
for col in categorical_cols:
    # pd.factorize handles both strings and categories; missing values map to -1
    X[col] = pd.factorize(X[col])[0]

# 3. Replace any remaining NaNs with 0 so the estimators below accept the data
X = X.fillna(0)

# Re-run the split so the train/test frames reflect the encoded, NaN-free X
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 4. Mutual information between each feature and the target, estimated on the
# training split only. Computed once: the call is seeded, so repeating it with
# the same arguments would only redo the same work.
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
# Label the scores with the columns of the frame that was actually scored
mi_series = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=False)
print("Top 10 features by mutual information:\n", mi_series.head(10))

# Select top 10 features for modeling to reduce dimensionality
top_features = mi_series.head(10).index.tolist()
X_train_fs = X_train[top_features]
X_test_fs = X_test[top_features]

Top 10 features by mutual information:
 cp                  0.156436
thal                0.128619
ca                  0.126923
chol oldpeak        0.117666
oldpeak             0.108542
slope               0.098897
thalach oldpeak     0.098522
age oldpeak         0.096136
exang               0.087338
trestbps thalach    0.086173
dtype: float64


In [10]:
 # 3. Traditional Machine Learning Models


# Ratio of negative to positive training samples, used as XGBoost's positive-class weight.
neg_to_pos_ratio = len(y_train[y_train==0])/len(y_train[y_train==1])

# Candidate classifiers, keyed by the label shown in the results table.
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000, class_weight='balanced'
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=200, class_weight='balanced', random_state=42
    ),
    'SVM (RBF)': SVC(
        kernel='rbf', probability=True, class_weight='balanced', random_state=42
    ),
    'XGBoost': XGBClassifier(
        eval_metric='logloss', scale_pos_weight=neg_to_pos_ratio, random_state=42
    ),
    'LightGBM': LGBMClassifier(
        verbose=-1, class_weight='balanced', random_state=42
    ),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

# Fit every candidate on the selected features and score it on the held-out split.
results = []
for name, model in models.items():
    steps = [('model', model)]
    # Logistic regression and the SVM get a standardization step in front of them.
    if name in ('Logistic Regression', 'SVM (RBF)'):
        steps.insert(0, ('scaler', StandardScaler()))
    pipeline = Pipeline(steps)
    pipeline.fit(X_train_fs, y_train)

    # Hard labels for accuracy/F1; positive-class probability for ROC-AUC.
    y_pred = pipeline.predict(X_test_fs)
    y_proba = pipeline.predict_proba(X_test_fs)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)
    results.append({
        'Model': name,
        'Accuracy': acc,
        'ROC-AUC': auc,
        'F1': f1,
    })

# One row per model, best ROC-AUC first.
results_df = pd.DataFrame(results).sort_values(by='ROC-AUC', ascending=False)
print(results_df)


                 Model  Accuracy   ROC-AUC        F1
5             CatBoost  0.836066  0.896104  0.864865
4             LightGBM  0.836066  0.892857  0.857143
6    Gradient Boosting  0.852459  0.888528  0.873239
1        Random Forest  0.786885  0.882576  0.821918
3              XGBoost  0.836066  0.882035  0.861111
2            SVM (RBF)  0.803279  0.871212  0.833333
0  Logistic Regression  0.754098  0.844156  0.788732
