#Setting Up the Models

##Install Libraries and Import Packages

In [None]:
!pip install -r requirements_cl.txt

In [None]:
from ucimlrepo import fetch_ucirepo

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

import shap
import graphviz
from sklearn.tree import export_graphviz
from lime.lime_tabular import LimeTabularExplainer

#Fetch dataset id 45 from the UCI ML repository; the result is kept in heart_disease
heart_disease = fetch_ucirepo(id=45)

#Work on a copy of the original table so later edits to df leave the fetched object untouched
df = heart_disease.data.original.copy()

##Data Analysis

In [None]:
#Summary of df: column names, dtypes and non-null counts
df.info()

In [None]:
#Descriptive statistics for the columns of df
df.describe()

In [None]:
#Preview the first rows of df
df.head()

In [None]:
#Pairwise correlations between the columns of df, drawn as an annotated heatmap
correlations = df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
#One boxplot per column of df, titled with the column name.
for col in df.columns:
    #Missing values are only imputed later in the notebook, and matplotlib's
    #boxplot does not skip NaN: a column containing NaN would render an empty
    #box. Plot the observed values only.
    observed = df[col].dropna()
    plt.boxplot(observed)
    plt.title(col)
    plt.show()

In [None]:
#Number of rows that duplicate an earlier row of df
df.duplicated().sum()

In [None]:
#Count of missing (NaN) values in each column of df
df.isna().sum()

##Handling NaN Values with SimpleImputer

In [None]:
#Replace every NaN in df with the mean of its column.
#NOTE(review): the imputer is fitted on the whole frame (target column 'num'
#included) before the train/test split - confirm this is acceptable.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = imputer.fit(df).transform(df)

#transform() hands back a NumPy array, so rebuild the DataFrame with the original column names
df = pd.DataFrame(imputed_values, columns=heart_disease.data.original.columns)

##Combining Classes

In [None]:
#Target is the 'num' column; every other column is a feature
y = df['num']
X = df.drop(columns='num')

#column labels, reused by the plots and explainers
feature_names = X.columns

#binary target: 0 stays 0 (no disease), any value above 0 becomes 1 (disease)
y_binary = y.mask(y > 0, 1)

##Splitting and Scaling the Data

In [None]:
#split: 75% train / 25% test, stratified on the binary target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.25,
    stratify=y_binary,
    random_state=0,
)

#scale: statistics are learned from the training split only and then applied to both splits
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

##Creating the Optimized Models and Then Fitting them

In [None]:
#Only the tuned (non-default) hyperparameters are passed. The previous calls
#were full get_params() dumps: they forwarded internal sentinels such as
#multi_class='deprecated' and version-specific keywords (monotonic_cst, the
#XGBoost None placeholders), which tie the notebook to a narrow range of
#library versions. Every argument removed here was equal to the library default.

#Decision Tree Model
dt = DecisionTreeClassifier(criterion='entropy', splitter='random', max_depth=4,
                            min_samples_leaf=16, min_samples_split=4, random_state=100)

#Logistic Regression Model
lr = LogisticRegression(C=0.010993634452683504, class_weight='balanced',
                        solver='liblinear', random_state=100)

#Random Forest Model
rf = RandomForestClassifier(n_estimators=53, max_depth=15, max_features='sqrt',
                            min_samples_leaf=15, min_samples_split=16, random_state=100)

#XGBoost Model
xgb = XGBClassifier(objective='binary:logistic', n_estimators=347, max_depth=7,
                    learning_rate=0.021618964126433812, gamma=0.6497146261841261,
                    colsample_bytree=0.7504717399139913, subsample=0.7040523480851058,
                    reg_alpha=0.0010349590106072711, reg_lambda=0.02637083647277659,
                    scale_pos_weight=8.443990241900503, random_state=100)

In [None]:
#Train every model on the scaled training split (same order as before);
#the XGBoost fit stays as the final expression of the cell
for model in (dt, lr, rf):
    model.fit(X_train, y_train)
xgb.fit(X_train, y_train)

##Evaluating Each Model's Performance

In [None]:
#Decision tree: predict on the test split, then report accuracy (in percent) and the per-class metrics
dt_pred = dt.predict(X_test)
dt_accuracy_pct = accuracy_score(y_test, dt_pred) * 100
dt_report = classification_report(y_test, dt_pred)
print(f"\nFor the Decision Tree Model:\nAccuracy: {dt_accuracy_pct!s}%")
print(f"Classification Report:\n{dt_report!s}")

In [None]:
#Logistic regression: predict on the test split, then report accuracy (in percent) and the per-class metrics
lr_pred = lr.predict(X_test)
lr_accuracy_pct = accuracy_score(y_test, lr_pred) * 100
lr_report = classification_report(y_test, lr_pred)
print(f"\nFor the Logistic Regression Model:\nAccuracy: {lr_accuracy_pct!s}%")
print(f"Classification Report:\n{lr_report!s}")

In [None]:
#Random forest: predict on the test split, then report accuracy (in percent) and the per-class metrics
rf_pred = rf.predict(X_test)
rf_accuracy_pct = accuracy_score(y_test, rf_pred) * 100
rf_report = classification_report(y_test, rf_pred)
print(f"\nFor the Random Forest Model:\nAccuracy: {rf_accuracy_pct!s}%")
print(f"Classification Report:\n{rf_report!s}")

In [None]:
#XGBoost: predict on the test split, then report accuracy (in percent) and the per-class metrics
xgb_pred = xgb.predict(X_test)
xgb_accuracy_pct = accuracy_score(y_test, xgb_pred) * 100
xgb_report = classification_report(y_test, xgb_pred)
print(f"\nFor the XGBoost Model:\nAccuracy: {xgb_accuracy_pct!s}%")
print(f"Classification Report:\n{xgb_report!s}")

#Global Explainability

##Decision Tree's Tree Structure

In [None]:
#Render the fitted decision tree as a Graphviz graph.
#class_names must follow the ascending class order used by the estimator.
#Series.unique() returns labels in order of first appearance, which can swap
#the node labels; dt.classes_ is the estimator's own sorted class list.
dot_data = export_graphviz(dt, out_file=None,
                           feature_names=list(feature_names),
                           class_names=[str(label) for label in dt.classes_],
                           filled=True)

graph = graphviz.Source(dot_data, format="png")
#last expression of the cell: the notebook displays the rendered tree
graph

##Logistic Regression Model's Coefficients

In [None]:
#One coefficient per feature (first row of lr.coef_), labelled with the feature names
coefficients = pd.Series(data=lr.coef_[0], index=feature_names)
ranked_coefficients = coefficients.sort_values(ascending=False)
print("Logistic Regression Coefficients:\n")
print(ranked_coefficients)

In [None]:
#Horizontal bar chart of the coefficients, sorted ascending
ax = coefficients.sort_values().plot.barh(figsize=(8, 6))
ax.set_title("Logistic Regression Coefficients")
ax.set_xlabel("Coefficient Value")
plt.tight_layout()
ax.grid()
plt.show()

##Setting Up SHAP explainers

In [None]:
#X_test is a plain array after scaling; wrap it in a DataFrame so the SHAP plots carry feature names
X_test_df = pd.DataFrame(data=X_test, columns=feature_names)

#dt: tree explainer
dt_explainer = shap.TreeExplainer(dt)
dt_shap_values = dt_explainer.shap_values(X_test_df)

#lr: linear explainer, with the test set as the background (masker) data
lr_background = shap.maskers.Independent(X_test_df)
lr_explainer = shap.LinearExplainer(lr, masker=lr_background)
lr_shap_values = lr_explainer.shap_values(X_test_df)

#rf: tree explainer
rf_explainer = shap.TreeExplainer(rf)
rf_shap_values = rf_explainer.shap_values(X_test_df)

#xgb: generic Explainer entry point
xgb_explainer = shap.Explainer(xgb)
xgb_shap_values = xgb_explainer.shap_values(X_test_df)

##SHAP Summary Plots For Each Model

In [None]:
#Inspect the layout of the SHAP array before slicing it for the summary plot
dt_shap_values.shape

In [None]:
#dt: summary plot over the test set
#NOTE(review): index 1 on the last axis is presumably the positive ("disease") class - confirm against dt_shap_values.shape
shap.summary_plot(dt_shap_values[:, :, 1], X_test_df)

In [None]:
#Inspect the layout of the SHAP array before plotting it
lr_shap_values.shape

In [None]:
#lr: summary plot over the test set (the SHAP array is passed unsliced)
shap.summary_plot(lr_shap_values, X_test_df)

In [None]:
#Inspect the layout of the SHAP array before slicing it for the summary plot
rf_shap_values.shape

In [None]:
#rf: summary plot over the test set
#NOTE(review): index 1 on the last axis is presumably the positive ("disease") class - confirm against rf_shap_values.shape
shap.summary_plot(rf_shap_values[:, :, 1], X_test_df)

In [None]:
#Inspect the layout of the SHAP array before plotting it
xgb_shap_values.shape

In [None]:
#xgb: summary plot over the test set (the SHAP array is passed unsliced)
shap.summary_plot(xgb_shap_values, X_test_df)

#Local Explainability

##Selecting Instance

In [None]:
#Position (row number) in X_test / X_test_df / y_test of the single instance explained in the cells below
index = 0

##Decision Path

In [None]:
#Trace the selected instance through the fitted decision tree and print every split it passes.
X_instance = X_test[index].reshape(1, -1)

node_indicator = dt.decision_path(X_instance)
leaf_id = dt.apply(X_instance)

tree = dt.tree_
print(f"\nDecision path for instance {index}:")
for node_id in node_indicator.indices:
    #a node whose left and right child ids are equal has no split to report
    if tree.children_left[node_id] == tree.children_right[node_id]:
        continue
    feature_idx = tree.feature[node_id]
    feature_value = X_instance[0, feature_idx]
    threshold = tree.threshold[node_id]
    threshold_sign = "<=" if feature_value <= threshold else ">"
    print(f"  {feature_names[feature_idx]} = {feature_value:.2f} "
          f"{threshold_sign} {threshold:.2f}")

pred_class = dt.predict(X_instance)[0]
#y_test may be a pandas Series (positional lookup via iloc) or a plain array
if isinstance(y_test, pd.Series):
    true_class = y_test.iloc[index]
else:
    true_class = y_test[index]

print(f"\nPredicted class: {pred_class}")
print(f"Actual class:    {true_class}")

##Logistic Regression's Contributions For Single Instance

In [None]:
#Per-feature contribution for the selected instance: scaled feature value times its coefficient
instance_row = X_test_df.iloc[index]
contributions = instance_row * lr.coef_[0]
print(contributions.sort_values(ascending=False))

##SHAP Force Plots

In [None]:
#dt: force plot for the selected instance, using index 1 of the expected value and of the SHAP array's last axis
shap.initjs()
dt_base_value = dt_explainer.expected_value[1]
dt_instance_shap = dt_shap_values[index, :, 1]
shap.force_plot(dt_base_value, dt_instance_shap, X_test_df.iloc[index])

In [None]:
#lr: force plot for the selected instance
shap.initjs()
lr_base_value = lr_explainer.expected_value
lr_instance_shap = lr_shap_values[index, :]
shap.force_plot(lr_base_value, lr_instance_shap, X_test_df.iloc[index])

In [None]:
#rf: force plot for the selected instance, using index 1 of the expected value and of the SHAP array's last axis
shap.initjs()
rf_base_value = rf_explainer.expected_value[1]
rf_instance_shap = rf_shap_values[index, :, 1]
shap.force_plot(rf_base_value, rf_instance_shap, X_test_df.iloc[index])

In [None]:
#xgb: force plot for the selected instance
shap.initjs()
xgb_base_value = xgb_explainer.expected_value
xgb_instance_shap = xgb_shap_values[index, :]
shap.force_plot(xgb_base_value, xgb_instance_shap, X_test_df.iloc[index])

##Setting up LIME Explainer

In [None]:
#LIME explainer built on the scaled training data.
#random_state is fixed (same seed as the models) because LIME explains by
#random sampling: without a seed the explanations change on every run.
#feature_names is passed as a plain list rather than a pandas Index.
explainer = LimeTabularExplainer(
    X_train,
    feature_names=list(feature_names),
    class_names=['No Disease', 'Disease'],
    mode='classification',
    random_state=100,
)

##LIME Explanations for Each Model

In [None]:
#dt: LIME explanation of the selected instance, covering every feature
dt_lime = explainer.explain_instance(
    data_row=X_test[index],
    predict_fn=dt.predict_proba,
    num_features=len(feature_names),
)
dt_lime.as_list()

In [None]:
#lr: LIME explanation of the selected instance, covering every feature
lr_lime = explainer.explain_instance(
    data_row=X_test[index],
    predict_fn=lr.predict_proba,
    num_features=len(feature_names),
)
lr_lime.as_list()

In [None]:
#rf: LIME explanation of the selected instance, covering every feature
rf_lime = explainer.explain_instance(
    data_row=X_test[index],
    predict_fn=rf.predict_proba,
    num_features=len(feature_names),
)
rf_lime.as_list()

In [None]:
#xgb: LIME explanation of the selected instance, covering every feature
xgb_lime = explainer.explain_instance(
    data_row=X_test[index],
    predict_fn=xgb.predict_proba,
    num_features=len(feature_names),
)
xgb_lime.as_list()