In [None]:
# ====================================================
# -----------Section 1: Importing Libraries-----------
# ====================================================
# 1.1. Data Manipulation, Statistics, and Feature Engineering
# ====================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import scipy.stats as stats
from scipy.stats import randint
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
import math

# ====================================================
# 1.2. Data Visualization
# ====================================================
import matplotlib.pyplot as plt
import seaborn as sns
import sys

import ptitprince as pt



# ============================
# 1.3. Data Splitting, Model Building, and Hyperparameter Tuning
# ============================
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB                   # Naive Bayes classifier
from sklearn.linear_model import LogisticRegression          # Logistic regression
from sklearn.ensemble import RandomForestClassifier # Ensemble methods
from sklearn.svm import SVC                                   # Support vector classifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score         # Cross-validation techniques
from scipy.stats import uniform


# ============================
# 1.4. Model Evaluation and Interpretation
# ============================
import shap                          # SHAP (Shapley values) for model interpretability
import lime                          # LIME (Local Interpretable Model-agnostic Explanations) for model interpretability
import lime.lime_tabular            # LIME (Local Interpretable Model-agnostic Explanations) for model interpretability
from lime.lime_tabular import LimeTabularExplainer

In [None]:
# ========================================================================
# ----------Section 2: Data Loading, Exploration, and Cleaning------------
# ========================================================================
# ----------------------------
# 2.1 Load the dataset
# ----------------------------
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists.
pima_df = pd.read_csv(
    "F:/University/大四上/MANG3099 Final Project/Dataset/diabetes.csv"
)

# ----------------------------
# 2.2 Initial dataset inspection
# ----------------------------
# 2.2.1 DataFrame check: show the row index and the column labels
print("Dataset row Indices:", pima_df.index, sep="\n")
print("Dataset Column Names:", pima_df.columns, sep="\n")

In [None]:
# 2.2.2 Data overview: preview the first five rows of the raw frame
pima_df.head(5)

In [None]:
# Print a header followed by the per-column dtype / non-null summary.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() (doing so appends a stray "None" line).
print("\nDataset Info:")
pima_df.info()

In [None]:
# 2.2.3 Descriptive statistics: one row per column, rounded to two decimals
pima_df.describe().transpose().round(decimals=2)

In [None]:
# ========================================================================
# ----------Section 3: Data Cleaning------------
# ========================================================================
# ----------------------------
# 3.1 Missing Value Imputation
# (In these columns a value of zero is not physiologically meaningful, so it
#  is treated as a missing value.)
# ----------------------------
# 3.1.1 Missing value check
print("\nMissing Values:")
# Replace zeros with NaN in the columns where zero encodes "not measured"
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
pima_df[zero_as_missing_cols] = pima_df[zero_as_missing_cols].replace(0, np.nan)
print( pima_df.isnull().sum())

# 3.1.2 Missing value handling
# Drop rows that lack Glucose, BloodPressure or BMI
pima_df.dropna(subset=['Glucose','BloodPressure','BMI'], inplace=True)

# Impute SkinThickness and Insulin with their column medians.
# The result is assigned back to the column instead of calling
# `pima_df[col].fillna(..., inplace=True)`: that form operates on a temporary
# Series, emits a chained-assignment warning in pandas 2.x and silently does
# nothing once Copy-on-Write is enabled, leaving the NaNs in place.
for median_imputed_col in ['SkinThickness', 'Insulin']:
    pima_df[median_imputed_col] = pima_df[median_imputed_col].fillna(
        pima_df[median_imputed_col].median()
    )

print("\n Missing Values after Cleaning \n\n", pima_df.isnull().sum())


In [None]:
# ----------------------------
# 3.2 Duplicate Value Check
# ----------------------------
# 3.2.1 Count rows that are exact duplicates of an earlier row
duplicate_row_count = pima_df.duplicated().sum()
print("\nDuplicate Values:", duplicate_row_count, sep="\n")

In [None]:
# ----------------------------
# 3.3 Outlier Detection using IQR
# ----------------------------

#3.3.1 Function to detect outliers in every feature using IQR
def find_outliers_IQR(pima_df):
    """Flag outliers with the 1.5 * IQR rule.

    Parameters
    ----------
    pima_df : pandas.Series or pandas.DataFrame
        Numeric data to inspect. The notebook passes a single column (Series);
        a DataFrame of numeric columns is also accepted, in which case the
        fences are computed per column.

    Returns
    -------
    tuple
        ``(alloutliers, lowoutliers, highoutliers)`` - boolean masks with the
        same shape and index as the input. ``lowoutliers`` marks values below
        ``Q1 - 1.5 * IQR``, ``highoutliers`` marks values above
        ``Q3 + 1.5 * IQR`` and ``alloutliers`` is their union. NaN values are
        never flagged because comparisons with NaN evaluate to False.
    """
    Q1 = pima_df.quantile(0.25)
    Q3 = pima_df.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR

    # Vectorised comparisons: for a Series the fences are scalars; for a
    # DataFrame they are Series indexed by column name and pandas aligns them
    # column-wise. (The previous element-wise apply() was slow and raised on
    # DataFrame input because of mis-aligned Series comparisons.)
    lowoutliers = pima_df < lower_fence
    highoutliers = pima_df > upper_fence
    alloutliers = lowoutliers | highoutliers
    return alloutliers, lowoutliers, highoutliers


In [None]:
# 3.3.2 Detect outliers in every feature using IQR
# IQR outlier summary for the 'Pregnancies' column of pima_df
print("\n Outliers in 'Pregnancies'")
alloutliers, lowoutliers, highoutliers = find_outliers_IQR(pima_df['Pregnancies'])
# Values of the column that were flagged by the combined mask
outlier_values = pima_df.loc[alloutliers, 'Pregnancies']
print(f"number of outliers: {alloutliers.sum()}")
print(f"max outlier value: {outlier_values.max()}")
print(f"min outlier value: {outlier_values.min()}")
print(f"number of low outliers: {lowoutliers.sum()}")
print(f"number of high outliers: {highoutliers.sum()}")
# Display the boolean mask as the cell output
alloutliers

In [None]:
# IQR outlier summary for the 'Glucose' column of pima_df
print("\n Outliers in 'Glucose'")
alloutliers, lowoutliers, highoutliers = find_outliers_IQR(pima_df['Glucose'])
# Values of the column that were flagged by the combined mask
outlier_values = pima_df.loc[alloutliers, 'Glucose']
print(f"number of outliers: {alloutliers.sum()}")
print(f"max outlier value: {outlier_values.max()}")
print(f"min outlier value: {outlier_values.min()}")
print(f"number of low outliers: {lowoutliers.sum()}")
print(f"number of high outliers: {highoutliers.sum()}")
# Display the boolean mask as the cell output
alloutliers

In [None]:
# IQR outlier summary for the 'SkinThickness' column of pima_df
print("\n Outliers in 'SkinThickness'")
alloutliers, lowoutliers, highoutliers = find_outliers_IQR(pima_df['SkinThickness'])
# Values of the column that were flagged by the combined mask
outlier_values = pima_df.loc[alloutliers, 'SkinThickness']
print(f"number of outliers: {alloutliers.sum()}")
print(f"max outlier value: {outlier_values.max()}")
print(f"min outlier value: {outlier_values.min()}")
print(f"number of low outliers: {lowoutliers.sum()}")
print(f"number of high outliers: {highoutliers.sum()}")
# Display the boolean mask as the cell output
alloutliers

In [None]:
# IQR outlier summary for the 'Insulin' column of pima_df
print("\n Outliers in 'Insulin'")
alloutliers, lowoutliers, highoutliers = find_outliers_IQR(pima_df['Insulin'])
# Values of the column that were flagged by the combined mask
outlier_values = pima_df.loc[alloutliers, 'Insulin']
print(f"number of outliers: {alloutliers.sum()}")
print(f"max outlier value: {outlier_values.max()}")
print(f"min outlier value: {outlier_values.min()}")
print(f"number of low outliers: {lowoutliers.sum()}")
print(f"number of high outliers: {highoutliers.sum()}")
# Display the boolean mask as the cell output
alloutliers

In [None]:
# IQR outlier summary for the 'DiabetesPedigreeFunction' column of pima_df
print("\n Outliers in 'DiabetesPedigreeFunction'")
alloutliers, lowoutliers, highoutliers = find_outliers_IQR(pima_df['DiabetesPedigreeFunction'])
# Values of the column that were flagged by the combined mask
outlier_values = pima_df.loc[alloutliers, 'DiabetesPedigreeFunction']
print(f"number of outliers: {alloutliers.sum()}")
print(f"max outlier value: {outlier_values.max()}")
print(f"min outlier value: {outlier_values.min()}")
print(f"number of low outliers: {lowoutliers.sum()}")
print(f"number of high outliers: {highoutliers.sum()}")
# Display the boolean mask as the cell output
alloutliers

In [None]:
# IQR outlier summary for the 'BloodPressure' column of pima_df
print("\n Outliers in 'Blood Pressure'")
alloutliers, lowoutliers, highoutliers = find_outliers_IQR(pima_df['BloodPressure'])
# Values of the column that were flagged by the combined mask
outlier_values = pima_df.loc[alloutliers, 'BloodPressure']
print(f"number of outliers: {alloutliers.sum()}")
print(f"max outlier value: {outlier_values.max()}")
print(f"min outlier value: {outlier_values.min()}")
print(f"number of low outliers: {lowoutliers.sum()}")
print(f"number of high outliers: {highoutliers.sum()}")
# Display the boolean mask as the cell output
alloutliers


In [None]:
# IQR outlier summary for the 'BMI' column of pima_df
print("\n Outliers in 'BMI'")
alloutliers, lowoutliers, highoutliers = find_outliers_IQR(pima_df['BMI'])
# Values of the column that were flagged by the combined mask
outlier_values = pima_df.loc[alloutliers, 'BMI']
print(f"number of outliers: {alloutliers.sum()}")
print(f"max outlier value: {outlier_values.max()}")
print(f"min outlier value: {outlier_values.min()}")
print(f"number of low outliers: {lowoutliers.sum()}")
print(f"number of high outliers: {highoutliers.sum()}")
# Display the boolean mask as the cell output
alloutliers


In [None]:
# IQR outlier summary for the 'Age' column of pima_df
print("\n Outliers in 'Age'")
alloutliers, lowoutliers, highoutliers = find_outliers_IQR(pima_df['Age'])
# Values of the column that were flagged by the combined mask
outlier_values = pima_df.loc[alloutliers, 'Age']
print(f"number of outliers: {alloutliers.sum()}")
print(f"max outlier value: {outlier_values.max()}")
print(f"min outlier value: {outlier_values.min()}")
print(f"number of low outliers: {lowoutliers.sum()}")
print(f"number of high outliers: {highoutliers.sum()}")
# Display the boolean mask as the cell output
alloutliers

In [None]:
# Re-check the summary statistics after data cleaning (rounded, one row per column)
pima_df.describe().transpose().round(decimals=2)


In [None]:

# ========================================================================
# ----------Section 4: Exploratory Data Analysis------------
# ========================================================================
# ----------------------------
# 4.1 Target Variable Analysis
# ----------------------------
# 4.1.1 Brief overview of target variable
print("\nDistribution of target variable:")
print(pima_df['Outcome'].value_counts())

# 4.1.2 Visualizing the Target Variable (Bar Chart with Percentages)
# value_counts() orders classes by frequency, not by label. Sorting by class
# label (0, 1) guarantees that the bar positions, the percentage annotations
# and the 'Non-Diabetic' / 'Diabetic' tick labels all refer to the same class,
# regardless of which class happens to be the majority.
outcome_counts = pima_df['Outcome'].value_counts().sort_index()
outcome_percentage = (outcome_counts / outcome_counts.sum()) * 100

# Create figure for bar chart
plt.figure(figsize=(8, 5))

# Bar chart displaying percentage of positive vs. negative diabetes patients
sns.barplot(
    x=outcome_percentage.index,
    y=outcome_percentage.values,
    palette=['#4682B4', '#FF6347']  # Blue for Negative, Red for Positive
)

# Add percentage labels on top of bars (position i corresponds to class i)
for index, value in enumerate(outcome_percentage):
    plt.text(index, value + 1, f'{value:.1f}%', ha='center', fontsize=12)

# Customize chart aesthetics
plt.title('Percentage of Diabetic vs. Non-Diabetic Cases', fontsize=14)
plt.xlabel('Outcome', fontsize=12)
plt.ylabel('Percentage (%)', fontsize=12)
plt.xticks([0, 1], ['Non-Diabetic', 'Diabetic'])  # Rename x-axis labels
plt.ylim(0, 100)  # Ensure y-axis represents percentage (0-100)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.show()

In [None]:
# ----------------------------
# 4.2 Age Feature Analysis
# ----------------------------
# 4.2.1 Age Distribution Analysis
# Bucket 'Age' into named groups (the author cites the WHO age classification).
# With right=False every interval is left-closed: [0, 44), [44, 60), ... [90, 100).
labels = ['Young Age', 'Middle Age', 'Elderly Age', 'Senile Age', 'Long-Livers']  # one label per interval
bins = [0, 44, 60, 75, 90, 100]  # interval edges
pima_df['age_group'] = pd.cut(x=pima_df['Age'], bins=bins, labels=labels, right=False)

# Verify the distribution of age groups
print("\nAge Group Distribution:", pima_df['age_group'].value_counts(), sep="\n")

In [None]:
# 4.2.2 Age Distribution Across Different Age Groups
import numpy
# Compatibility shim: re-create numpy.asscalar (absent from recent NumPy
# releases) for libraries that still call it.
def patch_asscalar(a):
    """Return the Python scalar held by a size-1 NumPy array / scalar."""
    return a.item()

setattr(numpy, "asscalar", patch_asscalar)

# Define colors for diabetic and non-diabetic groups
diabetic_color = "#FF4500"  # Bright orange-red
non_diabetic_color = "#1E90FF"  # Bright deep blue

# Total number of patients in each age group (category order)
age_group_totals = pima_df['age_group'].value_counts().sort_index()

# Patient counts per (age group, outcome). observed=False is spelled out so
# that empty age groups keep their row (and y-position) independent of the
# pandas default, which is changing for categorical group keys.
age_group_counts = (
    pima_df.groupby(['age_group', 'Outcome'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Share of each outcome within its age group, in percent
age_group_percentages = age_group_counts.div(age_group_totals, axis=0) * 100

# Draw on a single explicit figure. Previously plt.figure() was called and then
# DataFrame.plot(figsize=...) opened a second figure, leaving a blank one behind.
fig, ax = plt.subplots(figsize=(14, 8))
age_group_percentages.plot(kind='barh', stacked=True, color=[non_diabetic_color, diabetic_color], ax=ax)

# Add Percentage Labels on Each Bar
for i, (age_group, row) in enumerate(age_group_percentages.iterrows()):
    non_diabetic_percentage = row.get(0, 0)  # Get percentage safely
    diabetic_percentage = row.get(1, 0)  # Get percentage safely

    # NaN percentages (empty age groups) fail the "> 0" test and get no label
    if non_diabetic_percentage > 0:
        ax.text(non_diabetic_percentage / 2, i, f"{non_diabetic_percentage:.1f}%", va='center', ha='center', fontsize=12, color='white', fontweight='bold')
    if diabetic_percentage > 0:
        ax.text(100 - diabetic_percentage / 2, i, f"{diabetic_percentage:.1f}%", va='center', ha='center', fontsize=12, color='white', fontweight='bold')

# Get rightmost x-limit for alignment
x_max = ax.get_xlim()[1]
text_x = x_max + 5  # Move labels further right, outside the bars

# Add Count Labels to Each Age Group
for i, (age_group, row) in enumerate(age_group_counts.iterrows()):
    non_diabetic_count = row.get(0, 0)  # Get count safely
    diabetic_count = row.get(1, 0)  # Get count safely

    ax.text(text_x, i + 0.15, f"Non-Diabetic: {non_diabetic_count}", fontsize=13,
            fontweight="bold", color=non_diabetic_color, ha="left")

    ax.text(text_x, i - 0.15, f"Diabetic: {diabetic_count}", fontsize=13,
            fontweight="bold", color=diabetic_color, ha="left")

# Improve plot readability
plt.title("Age Distribution Across Different Age Groups", fontsize=16, fontweight="bold")
plt.xlabel("Percentage", fontsize=14)  # x-axis shows within-group percentages
plt.ylabel("Age Group", fontsize=14)
plt.xticks(rotation=45, fontsize=13)
plt.yticks(fontsize=13)
plt.grid(axis="y", linestyle="--", alpha=0.6)
# Legend entries follow the column order of the plotted frame (Outcome 0, then 1)
plt.legend(["Negative", "Positive"], title="Diabetic Status", title_fontsize="13", fontsize="12", loc="upper right")

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# 4.2.3 Drop the helper 'age_group' column so it does not leak into the
# correlation analysis or the model features.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
pima_df.drop(columns=['age_group'], inplace=True, errors='ignore')

In [None]:
# ----------------------------
# 4.3 Numerical Feature Distributions grouped by Outcome
# ----------------------------
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch  # proxy artists for the explicit legend

fig, axes = plt.subplots(4, 2, figsize=(15, 15))

# Extract colors from the 'Set2' palette. seaborn assigns the palette colors to
# the sorted hue levels, so Outcome 0 -> palette[0] and Outcome 1 -> palette[1].
palette = sns.color_palette('Set2')
negative_color = palette[0]
positive_color = palette[1]

# Legend entries bound explicitly to their colors. Passing only `labels=` to
# ax.legend() attaches the text to whichever artists happen to be drawn first,
# which depends on seaborn/matplotlib internals and on kde=True.
outcome_legend_handles = [
    Patch(facecolor=positive_color, label='Positive'),
    Patch(facecolor=negative_color, label='Negative'),
]

# (column, panel title, x-axis label) for each panel, in row-major grid order
feature_panels = [
    ('Glucose', 'Glucose Level grouped by Outcome', 'Glucose'),
    ('BloodPressure', 'Blood Pressure grouped by Outcome', 'Blood Pressure'),
    ('BMI', 'BMI grouped by Outcome', 'BMI'),
    ('Insulin', 'Insulin Level grouped by Outcome', 'Insulin'),
    ('SkinThickness', 'Skin Thickness grouped by Outcome', 'Skin Thickness'),
    ('Pregnancies', 'Pregnancies grouped by Outcome', 'Pregnancies'),
    ('DiabetesPedigreeFunction', 'Diabetes Pedigree Function(DPF) grouped by Outcome',
     'Diabetes Pedigree Function(DPF)'),
    ('Age', 'Age grouped by Outcome', 'Age'),
]

# ----------------------------
# Plot one histogram (with KDE overlay) per feature, split by outcome
# ----------------------------
for panel_ax, (feature_column, panel_title, x_label) in zip(axes.flat, feature_panels):
    sns.histplot(
        ax=panel_ax, x=feature_column, hue='Outcome', data=pima_df, kde=True, bins=20, palette='Set2'
    )
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Count', fontsize=12)
    panel_ax.grid(axis='y', linestyle='--', alpha=0.7)
    # Replace seaborn's automatic 0/1 legend with readable, color-bound labels
    panel_ax.legend(handles=outcome_legend_handles, title='Outcome', loc='upper right')

# ----------------------------
# Adjust the layout
# ----------------------------
plt.tight_layout()
plt.show()

In [None]:
# ----------------------------
# 4.4 Correlation Analysis
# ----------------------------
# 4.4.1 Heatmap of the pairwise correlations between all columns
plt.figure(figsize=(10, 8))
sns.heatmap(data=pima_df.corr(), fmt=".2f", cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap of Features', fontsize=16)
plt.show()



In [None]:
# 4.4.2 Correlation of every feature with the target ('Outcome'), strongest first
correlations = (
    pima_df.corr()
    .loc[:, 'Outcome']
    .drop(labels='Outcome')
    .sort_values(ascending=False)
)
print("Top Correlated Features with Class:")
print(correlations)

# Horizontal bar chart: one bar per feature
plt.figure(figsize=(12, 8))
sns.barplot(x=correlations.values, y=correlations.index, dodge=False, palette='Set2')

# Annotate each bar with its correlation coefficient (two decimals)
for bar_position, coefficient in enumerate(correlations.tolist()):
    plt.text(coefficient, bar_position, f"{coefficient:.2f}", color='black', va='center', fontsize=12)

# Titles, axis labels and grid
plt.title('Feature Correlations with Outcome', fontsize=16, fontweight="bold")
plt.xlabel("Correlation Coefficient", fontsize=12)
plt.ylabel("Feature", fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Render
plt.tight_layout()
plt.show()

In [None]:
# ========================================================================
# ----------Section 5: Feature Engineering------------
# ========================================================================
# ----------------------------
# 5.1 Data Splitting
# ----------------------------
# 5.1.1 Separate the target column (y) from the remaining feature columns (X)
y = pima_df.loc[:, 'Outcome']
X = pima_df.drop('Outcome', axis=1)

# Report both shapes to verify the split
print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)

In [None]:
# 5.1.2 Stratified 70/30 train/test split with a fixed seed
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of every split
for part_description, part in (
    ("Training Features", X_train),
    ("Testing Features", X_test),
    ("Training Target", y_train),
    ("Testing Target", y_test),
):
    print(f"{part_description} shape:", part.shape)

# Class balance in each split (relative frequencies)
print("Training class distribution:\n", y_train.value_counts(normalize=True))
print("Testing class distribution:\n", y_test.value_counts(normalize=True))


In [None]:
# ----------------------------
# 5.2 Data SMOTE
# ----------------------------
# 5.2.1 Apply SMOTE to balance the target variable (training set only)
# SMOTE synthesises minority-class samples at random; fixing random_state makes
# the resampled training set - and every model fitted on it - reproducible
# across kernel restarts. 42 matches the seed used for the train/test split.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Display the shapes of the training set after SMOTE
print("Training Features shape after SMOTE:", X_train.shape)
print("Training Target shape after SMOTE:", y_train.shape)

# The test set is deliberately left untouched, so its distribution is unchanged
print("Training class distribution:\n", y_train.value_counts(normalize=True))
print("Testing class distribution:\n", y_test.value_counts(normalize=True))


In [None]:
# ----------------------------
# 5.3 Data Scaling
# ----------------------------
# Standardise the features. The scaler learns its statistics from the training
# set only and re-uses them for the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the shape of every array involved
for array_name, array in (
    ("X_train_scaled", X_train_scaled),
    ("X_test_scaled", X_test_scaled),
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{array_name} shape:", array.shape)

In [None]:
# ========================================================================
# ----------Section 6: Model Building and Hyperparameter Tuning-----------
# ========================================================================
# ----------------------------
# 6.1 Naive Bayes
# ----------------------------
# 6.1.1 Model Building and Hyperparameter Tuning using GridSearchCV

# Base estimator
nb = GaussianNB()

# Candidate values: 100 log-spaced var_smoothing settings from 1 down to 0.01
nb_param_grid = {"var_smoothing": np.logspace(0, -2, num=100)}

# Exhaustive grid search scored by ROC-AUC with 10-fold cross-validation
nb_search = GridSearchCV(
    estimator=nb,
    param_grid=nb_param_grid,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,  # use all available processors
    verbose=1,  # display progress
)

# Run the search on the scaled training data
nb_search.fit(X_train_scaled, y_train)

# Best setting found
nb_best_params = nb_search.best_params_
print("Best Parameters for Naive Bayes:", nb_best_params)

In [None]:
# 6.1.2 Model Evaluation

# Initialize the Naive Bayes classifier with the best parameters and refit it
# on the full (resampled, scaled) training set
nb_best_model = GaussianNB(var_smoothing=nb_best_params['var_smoothing'])
nb_best_model.fit(X_train_scaled, y_train)

# Hard class predictions (for accuracy / precision / recall / F1) and
# positive-class probabilities (for ROC-AUC)
y_pred_nb = nb_best_model.predict(X_test_scaled)
y_prob_nb = nb_best_model.predict_proba(X_test_scaled)[:, 1]


# Calculate evaluation metrics
nb_acc = accuracy_score(y_test, y_pred_nb)
nb_conf_matrix = confusion_matrix(y_test, y_pred_nb)
# ROC-AUC is a ranking metric and needs continuous scores. Feeding it the 0/1
# predictions collapses the curve to a single threshold and is not comparable
# with the probability-based ROC-AUC reported for Random Forest / XGBoost.
nb_roc_auc = roc_auc_score(y_test, y_prob_nb)
nb_f1 = f1_score(y_test, y_pred_nb)
nb_precision = precision_score(y_test, y_pred_nb)
nb_recall = recall_score(y_test, y_pred_nb)

# Print evaluation results
print(f"Naive Bayes Accuracy: {nb_acc:.2f}")
print("Naive Bayes Confusion Matrix:")
print(nb_conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, y_pred_nb))
print(f"ROC-AUC Score: {nb_roc_auc:.2f}")
print(f"F1-Score: {nb_f1:.2f}")
print(f"Precision: {nb_precision:.2f}")
print(f"Recall: {nb_recall:.2f}")

# Visualize the confusion matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=(6, 4))
sns.heatmap(nb_conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["No Diabetes", "Diabetes"], yticklabels=["No Diabetes", "Diabetes"])
plt.title("Naive Bayes Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
# ----------------------------
# 6.2 Logistic Regression
# ----------------------------
# 6.2.1 Model Building and Hyperparameter Tuning using GridSearchCV
# Initialize the Logistic Regression classifier
lr = LogisticRegression()

# Define the hyperparameter grid for tuning
lr_params = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Regularization strengths
    "penalty": ["l2", "l1"],  # Regularization penalties
    "solver": ["liblinear", "saga"],  # Solvers compatible with both l1 and l2 penalties
}

# Initialize GridSearchCV
lr_grid = GridSearchCV(
    estimator=lr,
    param_grid=lr_params,
    scoring="roc_auc",  # Optimize for ROC-AUC
    cv=10,  # 10-fold cross-validation
    verbose=1,  # Display progress during fitting
    n_jobs=-1  # Use all available processors
)

# Fit the model with hyperparameter tuning
lr_grid.fit(X_train_scaled, y_train)

# Retrieve the best hyperparameters
lr_best_params = lr_grid.best_params_
print("Best Parameters for Logistic Regression:", lr_best_params)

In [None]:
# 6.2.2 Model Evaluation

# Initialize the Logistic Regression classifier with the best parameters
lr_best_model = LogisticRegression(**lr_best_params)

# Fit the model on the training data
lr_best_model.fit(X_train_scaled, y_train)

# Hard class predictions (for accuracy / precision / recall / F1) and
# positive-class probabilities (for ROC-AUC)
y_pred_lr = lr_best_model.predict(X_test_scaled)
y_prob_lr = lr_best_model.predict_proba(X_test_scaled)[:, 1]

# Calculate evaluation metrics
lr_acc = accuracy_score(y_test, y_pred_lr)
lr_conf_matrix = confusion_matrix(y_test, y_pred_lr)
# ROC-AUC is a ranking metric and needs continuous scores. Feeding it the 0/1
# predictions collapses the curve to a single threshold and is not comparable
# with the probability-based ROC-AUC reported for Random Forest / XGBoost.
lr_roc_auc = roc_auc_score(y_test, y_prob_lr)
lr_f1 = f1_score(y_test, y_pred_lr)
lr_precision = precision_score(y_test, y_pred_lr)
lr_recall = recall_score(y_test, y_pred_lr)

# Print evaluation results
print(f"Logistic Regression Accuracy: {lr_acc:.2f}")
print("Logistic Regression Confusion Matrix:")
print(lr_conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print(f"ROC-AUC Score: {lr_roc_auc:.2f}")
print(f"F1-Score: {lr_f1:.2f}")
print(f"Precision: {lr_precision:.2f}")
print(f"Recall: {lr_recall:.2f}")

# Visualize the confusion matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=(6, 4))
sns.heatmap(lr_conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["No Diabetes", "Diabetes"], yticklabels=["No Diabetes", "Diabetes"])
plt.title("Logistic Regression Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# ----------------------------
# 6.3 Random Forest
# ----------------------------

# 6.3.1 Model Building and Hyperparameter Tuning


# Initialize the Random Forest classifier.
# random_state is now actually set (the original comment promised it, the code
# did not), so the bootstrap samples / feature subsets - and therefore the
# selected hyperparameters - are reproducible. 42 matches the split seed.
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid
rf_params = {
    "n_estimators": [100, 200, 300, 400, 500],  # Number of trees in the forest
    "criterion": ["gini", "entropy"],  # Split criterion
    "max_depth": [10, 20, 30, 40, 50, None],  # Maximum depth of the tree
    "min_samples_split": [2, 5, 10],  # Minimum samples to split an internal node
    "min_samples_leaf": [1, 2, 4],  # Minimum samples required to be at a leaf node
    "max_features": ["sqrt", "log2", None],  # Number of features to consider at each split
}

# Initialize GridSearchCV for hyperparameter tuning.
# NOTE: the grid holds 5 * 2 * 6 * 3 * 3 * 3 = 1620 combinations, i.e. 16200
# fits with 10-fold CV - expect a long run time.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring="roc_auc",  # Optimize for ROC-AUC
    cv=10,  # 10-fold cross-validation
    verbose=1,  # Display progress
    n_jobs=-1  # Use all available processors
)

# Fit the GridSearchCV object to find the best hyperparameters
# (tree models are scale-invariant, so the unscaled features are used)
rf_grid.fit(X_train, y_train)

# Retrieve the best hyperparameters
rf_best_params = rf_grid.best_params_
print("Best Parameters for Random Forest:", rf_best_params)

In [None]:

# 6.3.2 Model Evaluation

# Initialize the Random Forest classifier with the best hyperparameters.
# A fixed random_state makes the refit forest - and every metric reported
# below - reproducible across runs (42 matches the train/test split seed).
rf_best_model = RandomForestClassifier(**rf_best_params, random_state=42)

# Fit the final model on the training data
rf_best_model.fit(X_train, y_train)


# Predict the outcomes for the test set
rf_pred = rf_best_model.predict(X_test)
rf_pred_prob = rf_best_model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

# Calculate evaluation metrics (ROC-AUC uses probabilities, the rest use labels)
rf_acc = accuracy_score(y_test, rf_pred)
rf_conf_matrix = confusion_matrix(y_test, rf_pred)
rf_roc_auc = roc_auc_score(y_test, rf_pred_prob)
rf_f1 = f1_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)

# Print evaluation results
print(f"Random Forest Accuracy: {rf_acc:.3f}")
print("Random Forest Confusion Matrix:")
print(rf_conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, rf_pred))
print(f"Random Forest ROC-AUC Score: {rf_roc_auc:.3f}")
print(f"F1-Score: {rf_f1:.3f}")
print(f"Precision: {rf_precision:.3f}")
print(f"Recall: {rf_recall:.3f}")

# Visualize the confusion matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=(6, 4))
sns.heatmap(rf_conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["No Diabetes", "Diabetes"], yticklabels=["No Diabetes", "Diabetes"])
plt.title("Random Forest Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# ----------------------------
# 6.4 Support Vector Machine (SVM)
# ----------------------------
# NOTE: every statement in this cell is commented out, so no SVM model is
# tuned when the notebook runs. The code is kept for reference only.

# 6.4.1 Model Building and Hyperparameter Tuning

# Initialize the Support Vector Machine classifier
# NOTE(review): despite the trailing comment, no random_state is passed here.
#svm = SVC()  # Set random_state for reproducibility

# Define the hyperparameter grid
# NOTE(review): "degree" is described as the polynomial-kernel degree, but the
# grid only lists the "linear" and "rbf" kernels - confirm whether "poly" was
# meant to be included before re-enabling.
#svm_params = {
#   "C": [1, 10],  # Regularization parameter
#   "kernel": ["linear", "rbf"],  # Kernel type
#    "degree": [2, 3, 4],  # Degree for the polynomial kernel
#   "gamma": ["scale", "auto"],  # Kernel coefficient
#}

# Initialize GridSearchCV for hyperparameter tuning
#svm_grid = GridSearchCV(
#   estimator=svm,
#    param_grid=svm_params,
#  scoring="roc_auc",  # Optimize for ROC-AUC
#  cv=10,  # 10-fold cross-validation
#  verbose=1,  # Display progress
#   n_jobs=-1  # Use all available processors
#)

# Fit the GridSearchCV object to find the best hyperparameters
#svm_grid.fit(X_train_scaled, y_train)

# Retrieve the best hyperparameters
#svm_best_params = svm_grid.best_params_
#print("Best Parameters for Support Vector Machine:", svm_best_params)

In [None]:
# 6.4.2 Model Evaluation
# NOTE: every statement in this cell is commented out; it depends on
# `svm_best_params`, which is only produced by the (also disabled) tuning code.

# Initialize the SVM classifier with the best hyperparameters
#svm_best_model = SVC(**svm_best_params)

# Fit the final model on the training data
#svm_best_model.fit(X_train_scaled, y_train)

# Predict the outcomes for the test set
#svm_pred = svm_best_model.predict(X_test_scaled)

# Calculate evaluation metrics
# NOTE(review): roc_auc_score below is given hard class predictions; if this
# cell is re-enabled, a continuous score (e.g. decision_function output) would
# be needed for a meaningful ROC-AUC.
#svm_acc = accuracy_score(y_test, svm_pred)
#svm_conf_matrix = confusion_matrix(y_test, svm_pred)
#svm_roc_auc = roc_auc_score(y_test, svm_pred)
#svm_f1 = f1_score(y_test, svm_pred)
#svm_precision = precision_score(y_test, svm_pred)
#svm_recall = recall_score(y_test, svm_pred)

# Print evaluation results
#print(f"SVM Accuracy: {svm_acc:.3f}")
#print("SVM Confusion Matrix:")
#print(svm_conf_matrix)
#print("\nClassification Report:\n", classification_report(y_test, svm_pred))
#print(f"SVM ROC-AUC Score: {svm_roc_auc:.3f}")
#print(f"F1-Score: {svm_f1:.3f}")
#print(f"Precision: {svm_precision:.3f}")
#print(f"Recall: {svm_recall:.3f}")

# Visualize the confusion matrix
#plt.figure(figsize=(6, 4))
#sns.heatmap(svm_conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["No Diabetes", "Diabetes"], yticklabels=["No Diabetes", "Diabetes"])
#plt.title("SVM Confusion Matrix")
#plt.xlabel("Predicted")
#plt.ylabel("Actual")
#plt.show()


In [None]:
# ----------------------------
# 6.5 Extreme Gradient Boosting (XGBoost)
# ----------------------------

# 6.5.1 Model Building and Hyperparameter Tuning

# Default-configured XGBoost classifier used as the template estimator for the search
gb = XGBClassifier()

# Search space: 3 * 3 * 4 * 3 * 4 * 4 * 4 = 6,912 combinations, each evaluated
# on 10 folds below (69,120 fits in total), so this cell is expensive to run.
gb_params = dict(
    n_estimators=[50, 100, 150],              # number of trees
    learning_rate=[0.01, 0.1, 0.5],           # step size shrinkage
    max_depth=[2, 3, 4, 5],                   # maximum depth of trees
    subsample=[0.5, 0.8, 1.0],                # fraction of samples used for training
    colsample_bytree=[0.1, 0.2, 0.3, 0.5],    # fraction of features per tree
    colsample_bylevel=[0.1, 0.2, 0.3, 0.5],   # fraction of features per level
    min_child_weight=[1, 3, 5, 7],            # minimum sum of weights of child nodes
)

# Exhaustive search scored by ROC-AUC with 10-fold cross-validation on all
# available processors; fitting happens immediately on the training split.
gb_grid = GridSearchCV(
    estimator=gb,
    param_grid=gb_params,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Keep both the winning hyperparameters and the estimator fitted with them
gb_model = gb_grid.best_estimator_
gb_best_params = gb_grid.best_params_
print("Best Parameters for Gradient Boosting (XGBoost):", gb_best_params)

In [None]:

# 6.5.2 Model Evaluation

# Build a fresh XGBoost classifier from the tuned hyperparameters and train it
gb_best_model = XGBClassifier(**gb_best_params).fit(X_train, y_train)

# Test-set outputs: positive-class probabilities (column 1) and hard labels
gb_pred_prob = gb_best_model.predict_proba(X_test)[:, 1]
gb_pred = gb_best_model.predict(X_test)

# Threshold-based metrics use the hard labels; ROC-AUC uses the probabilities
gb_conf_matrix = confusion_matrix(y_test, gb_pred)
gb_acc = accuracy_score(y_test, gb_pred)
gb_precision = precision_score(y_test, gb_pred)
gb_recall = recall_score(y_test, gb_pred)
gb_f1 = f1_score(y_test, gb_pred)
gb_roc_auc = roc_auc_score(y_test, gb_pred_prob)

# Confusion-matrix cells (rows = actual class, columns = predicted class)
true_neg, false_pos = gb_conf_matrix[0, 0], gb_conf_matrix[0, 1]
false_neg, true_pos = gb_conf_matrix[1, 0], gb_conf_matrix[1, 1]

# Sensitivity = TP / (TP + FN); specificity = TN / (TN + FP)
sensitivity = true_pos / (true_pos + false_neg)
specificity = true_neg / (true_neg + false_pos)

# Report the results
print(f"Gradient Boosting Accuracy: {gb_acc:.2f}")
print("Gradient Boosting Confusion Matrix:")
print(gb_conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, gb_pred))
print(f"ROC-AUC Score: {gb_roc_auc:.2f}")
print(f"F1-Score: {gb_f1:.2f}")
print(f"Precision: {gb_precision:.2f}")
print(f"Recall: {gb_recall:.2f}")
print(f"Sensitivity: {sensitivity:.2f}")
print(f"Specificity: {specificity:.2f}")

# Confusion-matrix heatmap drawn on an explicitly created axes
gb_cm_fig, gb_cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    gb_conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["No Diabetes", "Diabetes"],
    yticklabels=["No Diabetes", "Diabetes"],
    ax=gb_cm_ax,
)
gb_cm_ax.set_title("Gradient Boosting Confusion Matrix")
gb_cm_ax.set_xlabel("Predicted")
gb_cm_ax.set_ylabel("Actual")
plt.show()


In [None]:
# ----------------------------
# 6.6 Stacking Classifier
# ----------------------------

# Seed for the stochastic parts of the stack. Without it, the shuffled CV
# folds (shuffle=True below) and the Random Forest meta-learner differ on
# every run, so the stacking metrics would not be reproducible.
STACKING_SEED = 42

# Level-0 learners, rebuilt from the hyperparameters tuned earlier.
# NOTE(review): these are only seeded if their *_best_params dictionaries
# already contain a random_state — confirm in the tuning cells.
base_classifiers = [
    LogisticRegression(**lr_best_params),
    RandomForestClassifier(**rf_best_params),
    XGBClassifier(**gb_best_params)
]

# Level-1 (meta) learner trained on the base learners' out-of-fold predictions
meta_classifier = RandomForestClassifier(random_state=STACKING_SEED)

# Initialize the StackingCV classifier
stacking = StackingCVClassifier(
    classifiers=base_classifiers,
    meta_classifier=meta_classifier,
    cv=10,  # 10-fold cross-validation
    stratify=True,  # Use stratified folds
    shuffle=True,  # Shuffle the data
    random_state=STACKING_SEED,  # Make the shuffled fold assignment repeatable
    n_jobs=-1,  # Use all available processors
)

# Fit the StackingCV classifier on the training split
stacking.fit(X_train, y_train)




In [None]:
# Predict the target variable: hard labels for the threshold-based metrics and
# positive-class probabilities (column 1) for the ranking-based ROC-AUC
y_stacking_pred = stacking.predict(X_test)
y_stacking_pred_prob = stacking.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics
stacking_acc = accuracy_score(y_test, y_stacking_pred)
stacking_conf_matrix = confusion_matrix(y_test, y_stacking_pred)
# ROC-AUC must be computed from continuous scores. Passing the hard 0/1 labels
# collapses the ROC curve to a single threshold and makes this value
# incomparable with the XGBoost ROC-AUC, which is computed from predict_proba.
stacking_roc_auc = roc_auc_score(y_test, y_stacking_pred_prob)
stacking_f1 = f1_score(y_test, y_stacking_pred)
stacking_precision = precision_score(y_test, y_stacking_pred)
stacking_recall = recall_score(y_test, y_stacking_pred)

# Print evaluation results
print(f"Stacking Classifier Accuracy: {stacking_acc:.2f}")
print("Stacking Classifier Confusion Matrix:")
print(stacking_conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, y_stacking_pred))
print(f"ROC-AUC Score: {stacking_roc_auc:.2f}")
print(f"F1-Score: {stacking_f1:.2f}")
print(f"Precision: {stacking_precision:.2f}")
print(f"Recall: {stacking_recall:.2f}")

# Visualize the confusion matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=(6, 4))
sns.heatmap(stacking_conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["No Diabetes", "Diabetes"], yticklabels=["No Diabetes", "Diabetes"])
plt.title("Stacking Classifier Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
# ========================================================================
# ----------Section 7: Model Performance Comparison------------
# ========================================================================

# 7.1 Model Performance Metrics

# Unrounded scores per metric, always in the order NB, LR, RF, XGB, Stacking
metric_scores = {
    "Accuracy": (nb_acc, lr_acc, rf_acc, gb_acc, stacking_acc),
    "ROC-AUC": (nb_roc_auc, lr_roc_auc, rf_roc_auc, gb_roc_auc, stacking_roc_auc),
    "F1-Score": (nb_f1, lr_f1, rf_f1, gb_f1, stacking_f1),
    "Precision": (nb_precision, lr_precision, rf_precision, gb_precision, stacking_precision),
    "Recall": (nb_recall, lr_recall, rf_recall, gb_recall, stacking_recall),
}

# Table contents: the model labels followed by each metric rounded to 3 decimals
data = {"Model": ["NB", "LR", "RF", "XGB", "Stacking"]}
for metric_name, scores in metric_scores.items():
    data[metric_name] = [round(score, 3) for score in scores]

# One row per model, one column per metric
model_comparison = pd.DataFrame(data)

print("\nModel Comparison:")
print(model_comparison)


# 7.2 Visualizing Model Performance


# 7.2.1 Create a Bar Plot for Model Comparison

# Reshape to long format (Model, Metric, Value) so seaborn can group bars by metric
model_comparison_long = model_comparison.melt(
    id_vars=["Model"],
    var_name="Metric",
    value_name="Value",
)

# Custom colors, one per metric
colors = ["#e2edc9", "#a4d3b7", "#65c4b9", "#26a7c8", "#607bbc"]

# Grouped bar chart: one cluster per model, one bar per metric
comparison_fig, comparison_ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    data=model_comparison_long,
    x="Model",
    y="Value",
    hue="Metric",
    palette=colors,
    ax=comparison_ax,
)

# Titles and axis labels
comparison_ax.set_title("Model Performance Comparison", fontsize=16)
comparison_ax.set_xlabel("Model", fontsize=16)
comparison_ax.set_ylabel("Performance Metric", fontsize=16)

# Anchor the legend just outside the top-right corner of the axes
comparison_ax.legend(fontsize=12, bbox_to_anchor=(1, 1), loc='upper left')

# Show the plot
plt.tight_layout()
plt.show()



In [None]:
# ========================================================================
# ----------Section 8: Feature Interpretability------------
# ========================================================================


# ----------------------------
# 8.1 Importance ranking of features in the model built-in
# ----------------------------
# 8.1.1 Random Forest Feature Importance

# Built-in importances exposed by the fitted Random Forest
rf_feature_importance = rf_best_model.feature_importances_

# Pair every feature name with its importance and rank from most to least important
rf_fi_df = (
    pd.DataFrame({
        "Feature": X.columns,
        "Importance": rf_feature_importance
    })
    .sort_values(by="Importance", ascending=False)
    .reset_index(drop=True)
)

# Show the ranked table
print("\nRandom Forest Feature Importance:")
print(rf_fi_df)


# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
rf_fi_ax = sns.barplot(x="Importance", y="Feature", data=rf_fi_df, palette="viridis")

# Annotate each bar with its importance as a percentage; the bar's vertical
# position equals its rank because the index was reset after sorting
for rank, importance in enumerate(rf_fi_df["Importance"]):
    rf_fi_ax.text(importance, rank, f'{importance:.2%}', color='black', va="center", fontsize=8)

rf_fi_ax.set_title("Random Forest Feature Importance", fontsize=16)
rf_fi_ax.set_xlabel("Importance", fontsize=12)
plt.show()


In [None]:
# 8.1.2 Gradient Boosting Feature Importance

# Built-in importances exposed by the fitted XGBoost model
gb_feature_importance = gb_best_model.feature_importances_

# Pair every feature name with its importance and rank from most to least important
gb_fi_df = (
    pd.DataFrame({
        "Feature": X.columns,
        "Importance": gb_feature_importance
    })
    .sort_values(by="Importance", ascending=False)
    .reset_index(drop=True)
)

# Show the ranked table
print("\nGradient Boosting Feature Importance:")
print(gb_fi_df)


# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
gb_fi_ax = sns.barplot(x="Importance", y="Feature", data=gb_fi_df, palette="viridis")

# Annotate each bar with its importance as a percentage; the bar's vertical
# position equals its rank because the index was reset after sorting
for rank, importance in enumerate(gb_fi_df["Importance"]):
    gb_fi_ax.text(importance, rank, f'{importance:.2%}', color='black', va="center", fontsize=6)

gb_fi_ax.set_title("Gradient Boosting Feature Importance", fontsize=16)
gb_fi_ax.set_xlabel("Importance", fontsize=12)
plt.show()

In [None]:
# ----------------------------
# 8.2 Importance ranking of features by SHAP values
# ----------------------------
# ----------------------------
# 8.2.1 SHAP Analysis for Random Forest
# ----------------------------

# Initialize the SHAP explainer for the best Random Forest model
rf_explainer = shap.Explainer(rf_best_model)

# Calculate SHAP values for every row of the test set
rf_shap_values = rf_explainer.shap_values(X_test)

# Pick the SHAP values of the positive class (index 1).
# The return layout depends on the installed SHAP release: older releases
# return a list with one (n_samples, n_features) array per class, newer ones a
# single (n_samples, n_features, n_classes) array (the change is reported for
# 0.45 — confirm against the pinned version). Indexing a list with [:, :, 1]
# raises a TypeError, so both layouts are handled explicitly.
if isinstance(rf_shap_values, list):
    rf_shap_positive = rf_shap_values[1]
else:
    rf_shap_positive = rf_shap_values[:, :, 1]

# Visualize the SHAP values of the positive class
shap.summary_plot(rf_shap_positive, X_test, feature_names=X_train.columns)

In [None]:
# 8.2.2 SHAP Analysis for Gradient Boosting

# Tree-specific SHAP explainer wrapped around the tuned XGBoost model
explainer_gb = shap.TreeExplainer(gb_best_model)

# SHAP values for the whole test set (rather than a single row) so the
# summary plot has enough points to show a distribution per feature
shap_values_gb = explainer_gb.shap_values(X_test)

# Summary plot for the Gradient Boosting model
shap.summary_plot(
    shap_values_gb,
    X_test,
    feature_names=X.columns,
)


In [None]:
# ----------------------------
# 8.3 Local Interpretability using LIME
# ----------------------------
# ----------------------------
# 8.3.1 True Positive Case in PIMA
# ----------------------------

# Test rows that are actually positive AND predicted positive by all three
# models (Random Forest, XGBoost, stacking)
agreed_true_positive_mask = (
    (y_test == 1)
    & (rf_pred == 1)
    & (gb_pred == 1)
    & (y_stacking_pred == 1)
)

# Index label of the first such row
true_positive = y_test.loc[agreed_true_positive_mask].index[0]

# Single-row DataFrame with the features of that case (the list keeps it 2-D)
PIMA_X_true_positive = X_test.loc[[true_positive]]

print("\nTrue Positive Case:")
print(PIMA_X_true_positive)

In [None]:
# LIME explanation of the true positive case — Random Forest
#
# NOTE(review): only `import lime` is visible in the imports cell shown for
# this notebook; confirm that `LimeTabularExplainer` is explicitly imported
# (it lives in `lime.lime_tabular`), otherwise this cell raises a NameError.

# Tabular explainer built from the training feature matrix
explainer = LimeTabularExplainer(
    X_train.values,
    mode="classification",
    feature_names=X.columns,
    class_names=["Negative", "Positive"],
)

# Explain the Random Forest's probability output for the selected row
exp = explainer.explain_instance(
    PIMA_X_true_positive.values[0],
    rf_best_model.predict_proba,
    num_features=16,
    top_labels=1,
)

# Render the explanation, including the feature-value table
exp.show_in_notebook(show_table=True)

In [None]:
# LIME explanation of the true positive case — XGBoost

# Tabular explainer built from the training feature matrix
explainer = LimeTabularExplainer(
    X_train.values,
    mode="classification",
    feature_names=X.columns,
    class_names=["Negative", "Positive"],
)

# Explain the XGBoost model's probability output for the selected row
exp = explainer.explain_instance(
    PIMA_X_true_positive.values[0],
    gb_best_model.predict_proba,
    num_features=16,
    top_labels=1,
)

# Render the explanation, including the feature-value table
exp.show_in_notebook(show_table=True)

In [None]:
# LIME explanation of the true positive case — stacking classifier

# Tabular explainer built from the training feature matrix
explainer = LimeTabularExplainer(
    X_train.values,
    mode="classification",
    feature_names=X.columns,
    class_names=["Negative", "Positive"],
)

# Explain the stacking classifier's probability output for the selected row
exp = explainer.explain_instance(
    PIMA_X_true_positive.values[0],
    stacking.predict_proba,
    num_features=16,
    top_labels=1,
)

# Render the explanation, including the feature-value table
exp.show_in_notebook(show_table=True)

In [None]:
# ----------------------------
# 8.3.2 True Negative Case in PIMA
# ----------------------------

# Test rows that are actually negative AND predicted negative by all three
# models (Random Forest, XGBoost, stacking)
agreed_true_negative_mask = (
    (y_test == 0)
    & (rf_pred == 0)
    & (gb_pred == 0)
    & (y_stacking_pred == 0)
)

# Index label of the first such row
true_negative = y_test.loc[agreed_true_negative_mask].index[0]

# Single-row DataFrame with the features of that case (the list keeps it 2-D)
PIMA_X_true_negative = X_test.loc[[true_negative]]
print("\nTrue Negative Case:")
print(PIMA_X_true_negative)

In [None]:

# LIME explanation of the true negative case — Random Forest

# Tabular explainer built from the training feature matrix
explainer = LimeTabularExplainer(
    X_train.values,
    mode="classification",
    feature_names=X.columns,
    class_names=["Negative", "Positive"],
)

# Explain the Random Forest's probability output for the selected row
exp = explainer.explain_instance(
    PIMA_X_true_negative.values[0],
    rf_best_model.predict_proba,
    num_features=16,
    top_labels=1,
)

# Render the explanation, including the feature-value table
exp.show_in_notebook(show_table=True)

In [None]:
# LIME explanation of the true negative case — XGBoost

# Tabular explainer built from the training feature matrix
explainer = LimeTabularExplainer(
    X_train.values,
    mode="classification",
    feature_names=X.columns,
    class_names=["Negative", "Positive"],
)

# Explain the XGBoost model's probability output for the selected row
exp = explainer.explain_instance(
    PIMA_X_true_negative.values[0],
    gb_best_model.predict_proba,
    num_features=16,
    top_labels=1,
)

# Render the explanation, including the feature-value table
exp.show_in_notebook(show_table=True)


In [None]:
# LIME explanation of the true negative case — stacking classifier

# Tabular explainer built from the training feature matrix
explainer = LimeTabularExplainer(
    X_train.values,
    mode="classification",
    feature_names=X.columns,
    class_names=["Negative", "Positive"],
)

# Explain the stacking classifier's probability output for the selected row
exp = explainer.explain_instance(
    PIMA_X_true_negative.values[0],
    stacking.predict_proba,
    num_features=16,
    top_labels=1,
)

# Render the explanation, including the feature-value table
exp.show_in_notebook(show_table=True)