In [1]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# 1. Load dataset
df = pd.read_csv('Diabetes_Final_Data_V2.csv')  # Replace with your actual file path

# 2. Clean column names
df.columns = df.columns.str.strip()

# 3. Define features
categorical_features = [
    "gender", "family_diabetes", "hypertensive",
    "family_hypertension", "cardiovascular_disease",
    "stroke", "age_group"
]
target_col = 'diabetic'

# 4. Handle age_group if necessary
if 'age_group' not in df.columns or df['age_group'].isnull().any():
    df['age_group'] = pd.cut(
        df['age'],
        bins=[0, 30, 45, 60, 100],
        labels=['Young', 'Mid', 'Senior', 'Elderly']
    ).astype(str)

# Convert target variable to numerical (0 and 1)
df[target_col] = df[target_col].map({'Yes': 1, 'No': 0})

# 5. Split features and target
X = df.drop(columns=target_col)
y = df[target_col]

# 6. Preprocessing
numerical_features = [col for col in X.columns if col not in categorical_features]
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer([
    ('num', scaler, numerical_features),
    ('cat', encoder, categorical_features)
])

# Fit preprocessor
preprocessor.fit(X)
X_processed = preprocessor.transform(X)

# Save preprocessor and feature order
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(X.columns.tolist(), 'feature_order.pkl')

# 7. Apply SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_processed, y)

# 8. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Create model directory
os.makedirs('models', exist_ok=True)

# 9. Logistic Regression
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
joblib.dump(lr, 'models/lr.pkl')

# 10. Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
joblib.dump(rf, 'models/rf.pkl')

# 11. XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
joblib.dump(xgb, 'models/xgb.pkl')

# 12. LightGBM
lgb = LGBMClassifier(random_state=42)
lgb.fit(X_train, y_train)
joblib.dump(lgb, 'models/lgb.pkl')

# 13. Neural Network
nn = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
nn.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
nn.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=0)
nn.save('models/nn.h5')

print("✅ All models trained and saved successfully using SMOTE.")

Parameters: { "use_label_encoder" } are not used.

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[LightGBM] [Info] Number of positive: 4087, number of negative: 4061
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4164
[LightGBM] [Info] Number of data points in the train set: 8148, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501595 -> initscore=0.006382
[LightGBM] [Info] Start training from score 0.006382




✅ All models trained and saved successfully using SMOTE.


# --- Apply SHAP and LIME ---

In [4]:
# --- Apply SHAP and LIME ---

# SHAP Explanations
print("\n--- Generating SHAP Explanations ---")

# For tree-based models (Random Forest, XGBoost, LightGBM)
# Use TreeExplainer for faster computation and exact SHAP values
explainer_rf = shap.TreeExplainer(rf)
shap_values_rf = explainer_rf.shap_values(X_test)
print("SHAP values for Random Forest generated.")

explainer_xgb = shap.TreeExplainer(xgb)
shap_values_xgb = explainer_xgb.shap_values(X_test)
print("SHAP values for XGBoost generated.")

explainer_lgb = shap.TreeExplainer(lgb)
shap_values_lgb = explainer_lgb.shap_values(X_test)
print("SHAP values for LightGBM generated.")

# For model-agnostic models (Logistic Regression, Neural Network)
# Use KernelExplainer for these models. It's slower but works for any model.
# A background dataset (e.g., a subset of X_train) is needed for KernelExplainer.
# Using a small sample of X_train for efficiency.
background = X_train[np.random.choice(X_train.shape[0], 100, replace=False)]

explainer_lr = shap.KernelExplainer(lr.predict_proba, background)
shap_values_lr = explainer_lr.shap_values(X_test)
print("SHAP values for Logistic Regression generated.")

# For Neural Network, you need to wrap predict_proba.
# Keras models usually have predict, and for binary classification, it outputs probabilities directly.
explainer_nn = shap.KernelExplainer(nn.predict, background)
shap_values_nn = explainer_nn.shap_values(X_test)
print("SHAP values for Neural Network generated.")

# Example of plotting SHAP summary plot for Random Forest (you can do this for others too)
# shap.summary_plot(shap_values_rf[1], X_test, feature_names=all_feature_names, show=False) # For binary classification, shap_values is a list for each class. Take class 1.
# plt.title("SHAP Summary Plot for Random Forest")
# plt.savefig('shap_summary_rf.png')
# plt.show()

# LIME Explanations
print("\n--- Generating LIME Explanations ---")

# For LIME, you need to create an explainer object and then explain individual instances.
# It's particularly useful for local interpretability.
# The `feature_names` and `class_names` are important for understandable explanations.

class_names = ['No Diabetes', 'Diabetes'] # Assuming 0: No, 1: Yes

# Logistic Regression LIME Explainer
explainer_lime_lr = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    feature_names=all_feature_names,
    class_names=class_names,
    mode='classification'
)
print("LIME Explainer for Logistic Regression created.")

# Random Forest LIME Explainer
explainer_lime_rf = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    feature_names=all_feature_names,
    class_names=class_names,
    mode='classification'
)
print("LIME Explainer for Random Forest created.")

# XGBoost LIME Explainer
explainer_lime_xgb = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    feature_names=all_feature_names,
    class_names=class_names,
    mode='classification'
)
print("LIME Explainer for XGBoost created.")

# LightGBM LIME Explainer
explainer_lime_lgb = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    feature_names=all_feature_names,
    class_names=class_names,
    mode='classification'
)
print("LIME Explainer for LightGBM created.")

# Neural Network LIME Explainer
# For Keras, you need to pass `nn.predict` (or `nn.predict_proba` if available)
explainer_lime_nn = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train,
    feature_names=all_feature_names,
    class_names=class_names,
    mode='classification'
)
print("LIME Explainer for Neural Network created.")

# Example of explaining a single instance with LIME (e.g., first instance in X_test)
# You can loop through several instances or select specific ones.
# For Logistic Regression:
# idx_to_explain = 0
# exp_lr = explainer_lime_lr.explain_instance(
#     data_row=X_test[idx_to_explain],
#     predict_fn=lr.predict_proba,
#     num_features=10 # Number of features to show in the explanation
# )
# print(f"\nLIME explanation for Logistic Regression (instance {idx_to_explain}):")
# print(exp_lr.as_list())
# # You can visualize this with exp_lr.show_in_notebook() if using a Jupyter environment.
# # Or save as HTML: exp_lr.save_to_file('lime_explanation_lr.html')

# For Neural Network:
# exp_nn = explainer_lime_nn.explain_instance(
#     data_row=X_test[idx_to_explain],
#     predict_fn=nn.predict, # For NN, predict outputs probabilities directly
#     num_features=10
# )
# print(f"\nLIME explanation for Neural Network (instance {idx_to_explain}):")
# print(exp_nn.as_list())

print("\nSHAP and LIME integration complete. You can now use the generated shap_values and LIME explainers for analysis and visualization.")
print("Remember to uncomment the plotting/explanation parts to see the outputs.")



--- Generating SHAP Explanations ---
SHAP values for Random Forest generated.
SHAP values for XGBoost generated.
SHAP values for LightGBM generated.




  0%|          | 0/2038 [00:00<?, ?it/s]

SHAP values for Logistic Regression generated.
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


  0%|          | 0/2038 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m6550/6550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 284us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m6550/6550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 305us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m6550/6550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 290us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m6550/6550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 285us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m6550/6550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 288us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m6550/6550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 285us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m6550/6550[0m [32

KeyboardInterrupt: 

In [7]:
import shap
import lime
import lime.lime_tabular
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# For tree-based models (Random Forest, XGBoost, LightGBM)
# Use TreeExplainer for faster computation and exact SHAP values
explainer_rf = shap.TreeExplainer(rf)
shap_values_rf = explainer_rf.shap_values(X_test)
print("SHAP values for Random Forest generated.")

explainer_xgb = shap.TreeExplainer(xgb)
shap_values_xgb = explainer_xgb.shap_values(X_test)
print("SHAP values for XGBoost generated.")

explainer_lgb = shap.TreeExplainer(lgb)
shap_values_lgb = explainer_lgb.shap_values(X_test)
print("SHAP values for LightGBM generated.")

SHAP values for Random Forest generated.
SHAP values for XGBoost generated.
SHAP values for LightGBM generated.




In [8]:
# Example of plotting SHAP summary plot for Random Forest (you can do this for others too)

shap.summary_plot(shap_values_rf[1], X_test, feature_names=all_feature_names, show=False) # For binary classification, shap_values is a list for each class. Take class 1.
plt.title("SHAP Summary Plot for Random Forest")
plt.savefig('shap_summary_rf.png')
plt.show()


NameError: name 'all_feature_names' is not defined

In [9]:
import pandas as pd
import numpy as np
import joblib
import os
import matplotlib.pyplot as plt
import shap
from lime import lime_tabular
from sklearn.base import BaseEstimator, ClassifierMixin

# ... [Keep all previous code until model training is complete] ...

print("✅ All models trained and saved successfully using SMOTE.")

# 14. Prepare original test set (without SMOTE)
# Split original data (without resampling)
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocess original test set
X_test_orig_processed = preprocessor.transform(X_test_orig)

# Get feature names after preprocessing
num_features = numerical_features
cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_feature_names = np.concatenate([num_features, cat_features])

# 15. Create directory for explanations
os.makedirs('explanations/shap', exist_ok=True)
os.makedirs('explanations/lime', exist_ok=True)

# 16. SHAP Explanations
def generate_shap_explanations(model, model_name, X_test, feature_names):
    """Generate SHAP summary plots for a model"""
    plt.figure(figsize=(10, 6))
    
    # Select appropriate explainer
    if isinstance(model, (RandomForestClassifier, XGBClassifier, LGBMClassifier)):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
    elif isinstance(model, LogisticRegression):
        explainer = shap.LinearExplainer(model, X_test)
        shap_values = explainer.shap_values(X_test)
    else:
        explainer = shap.KernelExplainer(model.predict, shap.kmeans(X_test, 5))
        shap_values = explainer.shap_values(X_test)
    
    # Generate plots
    if isinstance(shap_values, list):  # For classifiers with multiple outputs
        shap_values = shap_values[1]  # Use positive class values
        
    shap.summary_plot(shap_values, X_test, feature_names=feature_names, show=False)
    plt.title(f'SHAP Summary - {model_name}')
    plt.tight_layout()
    plt.savefig(f'explanations/shap/shap_summary_{model_name}.png', dpi=150)
    plt.close()

# 17. LIME Explanations
class KerasWrapper(BaseEstimator, ClassifierMixin):
    """Wrapper for Keras models to use with LIME"""
    def __init__(self, model):
        self.model = model
        
    def predict_proba(self, X):
        return np.hstack([1 - self.model.predict(X), self.model.predict(X)])

def generate_lime_explanations(model, model_name, X_train, X_test, feature_names, class_names):
    """Generate LIME explanations for sample instances"""
    # Wrap Keras model
    if model_name == 'nn':
        model = KerasWrapper(model)
    
    # Create LIME explainer
    explainer = lime_tabular.LimeTabularExplainer(
        training_data=X_train,
        feature_names=feature_names,
        class_names=class_names,
        mode='classification',
        discretize_continuous=False
    )
    
    # Generate explanations for 5 random instances
    np.random.seed(42)
    sample_indices = np.random.choice(len(X_test), 5, replace=False)
    
    for i, idx in enumerate(sample_indices):
        exp = explainer.explain_instance(
            data_row=X_test[idx],
            predict_fn=model.predict_proba,
            num_features=10
        )
        
        # Save as HTML
        exp.save_to_file(f'explanations/lime/lime_{model_name}_sample_{i+1}.html')

# 18. Generate explanations for all models
models = {
    'lr': lr,
    'rf': rf,
    'xgb': xgb,
    'lgb': lgb,
    'nn': nn
}

class_names = ['non-diabetic', 'diabetic']

for name, model in models.items():
    print(f"Generating explanations for {name}...")
    generate_shap_explanations(model, name, X_test_orig_processed, all_feature_names)
    
    # For LIME, use original non-processed training data
    generate_lime_explanations(
        model=model,
        model_name=name,
        X_train=X_train_orig.values,
        X_test=X_test_orig.values,
        feature_names=X.columns.tolist(),
        class_names=class_names
    )

print("✅ SHAP and LIME explanations generated successfully.")

✅ All models trained and saved successfully using SMOTE.
Generating explanations for lr...


ValueError: could not convert string to float: 'Female'