# In [2]:
# 1. Imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings

# Scikit-learn modeling and validation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# --- FIX 1: Split calibration_curve from metrics ---
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, classification_report, roc_curve, precision_recall_curve,
                             brier_score_loss)
from sklearn.calibration import calibration_curve # Moved to its correct module

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers, backend as K

# --- FIX 2: Modern Keras Tuner naming ---
import keras_tuner as kt
from keras_tuner.tuners import RandomSearch

# Model Interpretability
import shap
import lime
import lime.lime_tabular

# Suppress all Python warnings issued after this point: no category, module or
# message filter is given, so the 'ignore' action applies to every warning.
warnings.filterwarnings('ignore')

# 2. Data Loading & Preprocessing

# Read the dataset written by the EDA stage.
df = pd.read_csv('heart_disease_eda_advanced.csv')

# The 'target' column is the label (cast to int); every other column is a feature.
y = df['target'].astype(int)
X = df.drop(columns='target')

# Integer-encode every non-numeric feature column (both 'object' and
# 'category' dtypes are picked up). factorize assigns codes in order of first
# appearance, so it works on plain strings such as '60+' without requiring the
# column to be converted to a categorical dtype first.
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)
for col in categorical_cols:
    X[col] = X[col].factorize()[0]

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 3. Feature Selection & Scaling

# Reuse saved artifacts from previous steps.
# Only the two loads sit inside the `try`, so a FileNotFoundError caught here
# can only mean that one of the artifact files is missing.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you created
# yourself or otherwise trust.
try:
    top_features = joblib.load('top_features.pkl')
    scaler_nn = joblib.load('scaler_nn.pkl')
except FileNotFoundError:
    print("Error: .pkl files not found. Ensure you saved 'top_features.pkl' and 'scaler_nn.pkl' in the previous notebook.")
    # Re-raise instead of continuing: without the artifacts none of the
    # variables below would be defined, and later cells would fail with an
    # unrelated-looking NameError.
    raise
else:
    # Fail early, with the offending names, if the saved feature list does not
    # match the columns of the current data (pandas would raise a KeyError
    # here anyway, just with a less helpful message).
    # Presumably top_features is a sequence of column names -- verify against
    # the notebook that saved it.
    missing_features = [name for name in top_features if name not in X_train.columns]
    if missing_features:
        raise KeyError(
            f"Columns listed in 'top_features.pkl' are missing from the data: {missing_features}"
        )

    # Filter to selected features
    X_train_fs = X_train[top_features]
    X_test_fs = X_test[top_features]

    # Apply the already-loaded scaler to both splits (transform only, no refit).
    # NOTE(review): the integer codes produced by the categorical encoding
    # above must match the encoding in place when scaler_nn was saved -- confirm.
    X_train_scaled = scaler_nn.transform(X_train_fs)
    X_test_scaled = scaler_nn.transform(X_test_fs)

    print("Preprocessed data ready: Used", len(top_features), "top features.")

# Output: Preprocessed data ready: Used 10 top features.
