In [6]:
# 1. Imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE, SelectFromModel, mutual_info_classif
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from scikeras.wrappers import KerasClassifier
import shap
import warnings
warnings.filterwarnings('ignore')




def create_model(meta=None):
    """Build a minimal binary classifier used to smoke-test the SciKeras wrapper.

    Parameters
    ----------
    meta : dict, optional
        Metadata dictionary that SciKeras hands to model-building functions
        declaring a ``meta`` parameter. When it contains ``n_features_in_``,
        that value sets the input width, so the function no longer needs the
        notebook-level ``X_train`` to exist. Without it (e.g. a direct call)
        the width falls back to ``X_train.shape[1]``.

    Returns
    -------
    keras.Sequential
        Compiled model: Dense(12, relu) -> Dense(1, sigmoid), using the Adam
        optimizer and binary cross-entropy loss.
    """
    if meta is not None and 'n_features_in_' in meta:
        n_features = meta['n_features_in_']
    else:
        # Direct calls still depend on the global training frame being defined.
        n_features = X_train.shape[1]

    model = keras.Sequential([
        # Explicit Input layer (same convention as create_keras_model) rather
        # than passing `input_shape=` to the first Dense layer.
        layers.Input(shape=(n_features,)),
        layers.Dense(12, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Smoke test: wrap the Keras builder in SciKeras' scikit-learn style estimator
clf = KerasClassifier(
    model=create_model,
    epochs=10,
    batch_size=32,
    verbose=0,
)
print("SciKeras is successfully linked to your project!")






# Load data
df = pd.read_csv('heart_disease_eda_advanced.csv')

# Drop temporary columns used in EDA (outlier flags, projections, cluster labels).
# errors='ignore' skips any name that is not present in the file.
drop_cols = ['iso_outlier', 'dbscan_outlier', 'ae_outlier', 'outlier_any', 'outlier_count',
             'PC1', 'PC2', 'PCA1', 'PCA2', 'PCA3', 'tSNE1', 'tSNE2', 'UMAP1', 'UMAP2',
             'kmeans_cluster', 'gmm_cluster', 'spectral_cluster']
df = df.drop(columns=drop_cols, errors='ignore')

# Separate features and target
X = df.drop(columns='target')
y = df['target'].astype(int)

# Encode categorical variables (except target).
# A frame read from CSV holds text columns as `object`, not `category`, so both
# dtypes are selected here; selecting only 'category' would leave string
# columns unencoded.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
for col in categorical_cols:
    if isinstance(X[col].dtype, pd.CategoricalDtype):
        # True categoricals keep their category-order codes.
        X[col] = X[col].cat.codes
    else:
        # String columns get integer codes in order of first appearance
        # (missing values become -1).
        X[col] = pd.factorize(X[col])[0]

# Stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


SciKeras is successfully linked to your project!


In [9]:
# 2. Feature Selection 



# 1. Identify all non-numeric columns (strings and categories)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 2. Convert them to integer codes
for col in categorical_cols:
    # pd.factorize handles both strings and categories; missing values are
    # coded as -1, so the fillna below does not touch them.
    X[col] = pd.factorize(X[col])[0]

# 3. Replace any remaining NaNs with 0
X = X.fillna(0)

# Re-create the split so the train/test frames contain the encoded values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 4. Feature selection: mutual information between each feature and the target,
#    estimated on the training split only and computed once.
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_series = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=False)
print("Top 10 features by mutual information:\n", mi_series.head(10))

# Select top 10 features for modeling to reduce dimensionality
top_features = mi_series.head(10).index.tolist()
X_train_fs = X_train[top_features]
X_test_fs = X_test[top_features]

Top 10 features by mutual information:
 cp                  0.156436
thal                0.128619
ca                  0.126923
chol oldpeak        0.117666
oldpeak             0.108542
slope               0.098897
thalach oldpeak     0.098522
age oldpeak         0.096136
exang               0.087338
trestbps thalach    0.086173
dtype: float64


In [10]:
 # 3. Traditional Machine Learning Models


# Candidate classifiers. Imbalance is handled through class weights where the
# estimator is configured for it; XGBoost receives the negative/positive ratio
# of the training labels instead.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(
        n_estimators=200, class_weight='balanced', random_state=42
    ),
    'SVM (RBF)': SVC(
        kernel='rbf', probability=True, class_weight='balanced', random_state=42
    ),
    'XGBoost': XGBClassifier(
        eval_metric='logloss',
        scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
        random_state=42,
    ),
    'LightGBM': LGBMClassifier(verbose=-1, class_weight='balanced', random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

results = []
for name, model in models.items():
    # Only the scale-sensitive models (LR and SVM) get a StandardScaler step
    steps = []
    if name in ['Logistic Regression', 'SVM (RBF)']:
        steps.append(('scaler', StandardScaler()))
    steps.append(('model', model))
    pipeline = Pipeline(steps)

    # Fit on the selected training features, score on the held-out split
    pipeline.fit(X_train_fs, y_train)
    y_pred = pipeline.predict(X_test_fs)
    y_proba = pipeline.predict_proba(X_test_fs)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    f1 = f1_score(y_test, y_pred)
    results.append(dict(zip(('Model', 'Accuracy', 'ROC-AUC', 'F1'), (name, acc, auc, f1))))

# Leaderboard ordered by ROC-AUC, best first
results_df = pd.DataFrame(results).sort_values(by='ROC-AUC', ascending=False)
print(results_df)


                 Model  Accuracy   ROC-AUC        F1
5             CatBoost  0.836066  0.896104  0.864865
4             LightGBM  0.836066  0.892857  0.857143
6    Gradient Boosting  0.852459  0.888528  0.873239
1        Random Forest  0.786885  0.882576  0.821918
3              XGBoost  0.836066  0.882035  0.861111
2            SVM (RBF)  0.803279  0.871212  0.833333
0  Logistic Regression  0.754098  0.844156  0.788732


In [14]:
# 4. Deep Learning with TensorFlow/Keras 

# Build a function to create Keras model with tunable hyperparameters
def create_keras_model(optimizer='adam', dropout_rate=0.2, neurons=64, n_layers=2, l2_reg=0.001,
                       input_dim=None, meta=None):
    """Build and compile a fully connected binary classifier.

    Each hidden block is Dense(relu, L2-regularised) -> Dropout ->
    BatchNormalization, followed by a single sigmoid output unit.

    Parameters
    ----------
    optimizer : str or keras optimizer
        Passed unchanged to ``model.compile``.
    dropout_rate : float
        Rate given to every Dropout layer.
    neurons : int
        Units in every hidden Dense layer.
    n_layers : int
        Number of hidden blocks.
    l2_reg : float
        L2 kernel-regularisation factor for the hidden Dense layers.
    input_dim : int, optional
        Number of input features. When omitted it is taken from ``meta`` if
        available, otherwise from the notebook-level ``X_train_fs``.
    meta : dict, optional
        Metadata dictionary that SciKeras supplies to model-building functions
        declaring a ``meta`` parameter; ``meta['n_features_in_']`` is used as
        the input width when ``input_dim`` is not given.

    Returns
    -------
    keras.Sequential
        Model compiled with binary cross-entropy loss and the AUC metric.
    """
    if input_dim is None:
        if meta is not None and 'n_features_in_' in meta:
            input_dim = meta['n_features_in_']
        else:
            # Original behaviour: size the network from the selected training features.
            input_dim = X_train_fs.shape[1]

    model = keras.Sequential()
    model.add(layers.Input(shape=(input_dim,)))
    for _ in range(n_layers):
        model.add(layers.Dense(neurons, activation='relu',
                               kernel_regularizer=regularizers.l2(l2_reg)))
        model.add(layers.Dropout(dropout_rate))
        model.add(layers.BatchNormalization())
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['AUC'])
    return model

# Wrap model for scikit-learn. `model=` is the current SciKeras keyword
# (`build_fn=` is deprecated) and matches the earlier KerasClassifier call.
keras_clf = KerasClassifier(model=create_keras_model, verbose=0)

# Simple training (without tuning first)
# Scale data for NN: statistics come from the training split only
scaler_nn = StandardScaler()
X_train_scaled = scaler_nn.fit_transform(X_train_fs)
X_test_scaled = scaler_nn.transform(X_test_fs)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs
keras.utils.set_random_seed(42)

# Train a basic model
model_nn = create_keras_model()
history = model_nn.fit(X_train_scaled, y_train, epochs=50, batch_size=16, validation_split=0.2, verbose=0)

# Evaluate: run inference once and derive the hard labels from the same probabilities
y_proba_nn = model_nn.predict(X_test_scaled).flatten()
y_pred_nn = (y_proba_nn > 0.5).astype(int)
print("Neural Network - Accuracy:", accuracy_score(y_test, y_pred_nn))
print("Neural Network - ROC-AUC:", roc_auc_score(y_test, y_proba_nn))


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Neural Network - Accuracy: 0.7704918032786885
Neural Network - ROC-AUC: 0.8528138528138528


In [16]:
# 5. Advanced Deep Learning: Wide & Deep Network (FIXED VERSION)

# Wide & Deep: combines linear model with deep neural network
def create_wide_deep_model(wide_features, deep_features, deep_units=(64, 32), dropout=0.2):
    """Build and compile a two-input Wide & Deep binary classifier.

    The wide branch is a single bias-free linear unit; the deep branch is a
    stack of Dense(relu) -> Dropout -> BatchNormalization blocks ending in a
    linear unit. The two logits are added and passed through a sigmoid.

    Parameters
    ----------
    wide_features : sequence
        Names of the wide-branch features; only its length is used (input width).
    deep_features : sequence
        Names of the deep-branch features; only its length is used (input width).
    deep_units : sequence of int
        Units for each hidden block of the deep branch; must not be empty.
    dropout : float
        Rate given to every Dropout layer in the deep branch.

    Returns
    -------
    keras.Model
        Model with inputs named 'wide' and 'deep', compiled with Adam,
        binary cross-entropy loss and the AUC metric.

    Raises
    ------
    ValueError
        If ``deep_units`` is empty.
    """
    # Copy into a list: accepts tuples/lists alike and avoids a mutable default.
    deep_units = list(deep_units)
    if not deep_units:
        raise ValueError("deep_units must contain at least one layer size")

    # Wide part (linear)
    wide_input = layers.Input(shape=(len(wide_features),), name='wide')
    wide_out = layers.Dense(1, use_bias=False)(wide_input)  # linear combination

    # Deep part: every entry of deep_units adds one hidden block
    deep_input = layers.Input(shape=(len(deep_features),), name='deep')
    deep = deep_input
    for units in deep_units:
        deep = layers.Dense(units, activation='relu')(deep)
        deep = layers.Dropout(dropout)(deep)
        deep = layers.BatchNormalization()(deep)
    deep_out = layers.Dense(1, activation='linear')(deep)

    # Combine the two logits, then squash to a probability
    combined = layers.Add()([wide_out, deep_out])
    output = layers.Activation('sigmoid')(combined)

    model = keras.Model(inputs=[wide_input, deep_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model

# --- Feature groups ---
wide_feats = ['age', 'trestbps', 'chol']
deep_feats = top_features  # the columns held in X_train_fs

# Wide features are taken from the full split (X_train / X_test) because they
# are not guaranteed to be among the top-10 selected features.
X_train_wide = X_train[wide_feats].values
X_test_wide = X_test[wide_feats].values

# Deep features are the already selected features
X_train_deep = X_train_fs.values
X_test_deep = X_test_fs.values

# --- Scaling ---
# Each input gets its own scaler, fitted on the training split only
scaler_wide = StandardScaler().fit(X_train_wide)
scaler_deep = StandardScaler().fit(X_train_deep)

X_train_wide_scaled = scaler_wide.transform(X_train_wide)
X_test_wide_scaled = scaler_wide.transform(X_test_wide)

X_train_deep_scaled = scaler_deep.transform(X_train_deep)
X_test_deep_scaled = scaler_deep.transform(X_test_deep)

# --- Model Building & Training ---
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs
keras.utils.set_random_seed(42)

wide_deep_model = create_wide_deep_model(wide_feats, deep_feats)

history_wd = wide_deep_model.fit(
    [X_train_wide_scaled, X_train_deep_scaled],
    y_train,
    epochs=50,
    batch_size=16,
    validation_split=0.2,
    verbose=0
)

# --- Prediction & Evaluation ---
# predict returns a 2D array; flatten it to 1D to match y_test
y_proba_wd = wide_deep_model.predict([X_test_wide_scaled, X_test_deep_scaled]).flatten()
y_pred_wd = (y_proba_wd > 0.5).astype(int)

print("Wide & Deep - Accuracy:", accuracy_score(y_test, y_pred_wd))
print("Wide & Deep - ROC-AUC:", roc_auc_score(y_test, y_proba_wd))

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Wide & Deep - Accuracy: 0.7377049180327869
Wide & Deep - ROC-AUC: 0.8679653679653679


In [18]:
# 6. Handling Imbalance with Class Weights in TensorFlow 

# Compute class weights
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
# Pair each weight with its own label instead of assuming the labels are
# exactly 0 and 1 in that order; plain int/float keep the dict Keras-friendly.
class_weight_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs
keras.utils.set_random_seed(42)

# Train same NN with class weights
model_nn_weighted = create_keras_model()
history_weighted = model_nn_weighted.fit(X_train_scaled, y_train, epochs=50, batch_size=16,
                                         validation_split=0.2, class_weight=class_weight_dict, verbose=0)

# Run inference once and derive the hard labels from the same probabilities
y_proba_nnw = model_nn_weighted.predict(X_test_scaled).flatten()
y_pred_nnw = (y_proba_nnw > 0.5).astype(int)
print("NN with Class Weights - Accuracy:", accuracy_score(y_test, y_pred_nnw))
print("NN with Class Weights - ROC-AUC:", roc_auc_score(y_test, y_proba_nnw))


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
NN with Class Weights - Accuracy: 0.7377049180327869
NN with Class Weights - ROC-AUC: 0.8538961038961039


In [None]:
# 7. Autoencoder for Feature Extraction (Unsupervised Pre-training) 

# Train a sparse autoencoder to learn a compressed representation
input_dim = X_train_scaled.shape[1]
encoding_dim = 5

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs
keras.utils.set_random_seed(42)

input_layer = layers.Input(shape=(input_dim,))
# The L1 activity penalty (factor 1e-4) pushes the bottleneck activations towards zero
encoded = layers.Dense(encoding_dim, activation='relu', activity_regularizer=regularizers.l1(1e-4))(input_layer)
decoded = layers.Dense(input_dim, activation='linear')(encoded)
autoencoder = keras.Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Reconstruction task: the scaled training features are both input and target
autoencoder.fit(X_train_scaled, X_train_scaled, epochs=100, batch_size=16, shuffle=True, validation_split=0.2, verbose=0)

# Encoder model to extract features (shares the trained layers of the autoencoder)
encoder = keras.Model(input_layer, encoded)
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Train a classifier on encoded features
clf_encoded = LogisticRegression(max_iter=1000)
clf_encoded.fit(X_train_encoded, y_train)
y_pred_enc = clf_encoded.predict(X_test_encoded)
y_proba_enc = clf_encoded.predict_proba(X_test_encoded)[:,1]
print("AE + Logistic Regression - Accuracy:", accuracy_score(y_test, y_pred_enc))
print("AE + Logistic Regression - ROC-AUC:", roc_auc_score(y_test, y_proba_enc))
