
# Setting Up the Models



## Install Libraries and Import Packages

In [None]:
!pip install -r requirements_rg.txt

In [None]:
from ucimlrepo import fetch_ucirepo

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from interpret.glassbox import ExplainableBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import  mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

from interpret import show
import shap
import graphviz
from sklearn.tree import export_graphviz
from lime.lime_tabular import LimeTabularExplainer

# Fetch dataset 189 from the UCI ML repository (needs network access).
# NOTE(review): going by the variable name this is the Parkinsons Telemonitoring
# dataset - confirm the id if the repository is ever re-indexed.
parkinsons_telemonitoring = fetch_ucirepo(id=189)

# Work on a copy of the original table so the fetched object is never mutated
# by the preprocessing done further down.
df = parkinsons_telemonitoring.data.original.copy()

## Dataset Analysis

In [None]:
# Column names, dtypes and non-null counts of the frame as loaded.
df.info()

In [None]:
# Summary statistics per column, taken before any outlier handling.
df.describe()

In [None]:
# Peek at the first rows to sanity-check the column layout.
df.head()

In [None]:
# Pairwise correlations between the numeric columns; numeric_only keeps the cell
# working even if a non-numeric column is ever present in the frame.
corr_matrix = df.corr(numeric_only=True)

# Grow the canvas with the number of columns so the per-cell annotations stay
# legible (never smaller than the original 10 x 6 figure).
n_cols = corr_matrix.shape[0]
fig, ax = plt.subplots(figsize=(max(10, 0.7 * n_cols), max(6, 0.6 * n_cols)))

# vmin/vmax pin the diverging colormap to the full [-1, 1] correlation range so
# that its midpoint is exactly zero: colour then encodes sign, not only magnitude.
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 7},
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [None]:
# Missing-value count per column.
df.isna().sum()

## Handling Outliers with IQR Method

In [None]:
# Cap outliers with the 1.5 * IQR rule: any value outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] is replaced by the nearest fence.
#
# The subject identifier and the two UPDRS scores are left untouched: an ID is
# not a measurement, and capping the target would mean the models are later
# scored against altered ground truth.
#
# NOTE(review): the fences are computed on the full frame before the train/test
# split, so test rows influence the preprocessing - consider deriving the
# fences from the training rows only.
IQR_SKIP_COLUMNS = ('subject#', 'motor_UPDRS', 'total_UPDRS')
IQR_FENCE_FACTOR = 1.5

for col in df.columns:
    if col in IQR_SKIP_COLUMNS:
        continue

    q1, q3 = df[col].quantile([0.25, 0.75])
    fence_width = IQR_FENCE_FACTOR * (q3 - q1)

    # One clip call replaces the two np.where passes; NaNs are left as they are.
    df[col] = df[col].clip(lower=q1 - fence_width, upper=q3 + fence_width)

## Split Data

In [None]:
# Target is the total UPDRS score; the feature matrix is every other column
# except the second UPDRS score and the subject identifier.
y = df['total_UPDRS']
X = df.drop(columns=['total_UPDRS', 'motor_UPDRS', 'subject#'])


def _alnum_only(label):
    """Return `label` as a string with every non-alphanumeric character replaced by '_'."""
    return "".join(ch if ch.isalnum() else "_" for ch in str(label))


# Rename the columns for LightGBM (alphanumerics and underscores only).
X.columns = [_alnum_only(label) for label in X.columns]

# NOTE(review): this is a plain row-wise random split. If a subject contributes
# several rows, the same subject can land in both partitions - confirm whether a
# subject-grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Creating the Optimized Models and Then Fitting Them

In [None]:
# The four regressors compared in this notebook. Every hyper-parameter is
# spelled out, one per line, so a changed value shows up as a one-line diff.
# All four share random_state=100.

# Single decision tree.
dt = DecisionTreeRegressor(
    criterion='absolute_error',
    splitter='best',
    max_depth=18,
    max_features=None,
    max_leaf_nodes=None,
    min_samples_split=11,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
    monotonic_cst=None,
    random_state=100,
)

# Explainable Boosting Machine.
ebm = ExplainableBoostingRegressor(
    feature_names=None,
    feature_types=None,
    exclude=None,
    objective='rmse',
    learning_rate=0.006286164131029316,
    max_rounds=50000,
    max_leaves=3,
    max_bins=32,
    max_interaction_bins=250,
    interactions=10,
    min_samples_leaf=14,
    min_hessian=0.0,
    min_cat_samples=10,
    cat_smooth=10.0,
    max_delta_step=0.0,
    gain_scale=5.0,
    greedy_ratio=10.0,
    cyclic_progress=False,
    smoothing_rounds=500,
    interaction_smoothing_rounds=100,
    reg_alpha=0.0,
    reg_lambda=0.0,
    missing='separate',
    monotone_constraints=None,
    outer_bags=14,
    inner_bags=0,
    validation_size=0.15,
    early_stopping_rounds=100,
    early_stopping_tolerance=1e-05,
    callback=None,
    n_jobs=2,
    random_state=100,
)

# CatBoost.
cat = CatBoostRegressor(
    loss_function='RMSE',
    iterations=952,
    learning_rate=0.05794977501448691,
    depth=8,
    l2_leaf_reg=1.1630011660824908,
    bootstrap_type='MVS',
    random_state=100,
)

# LightGBM.
lgbm = LGBMRegressor(
    boosting_type='dart',
    objective=None,
    n_estimators=911,
    learning_rate=0.24546035056829404,
    max_depth=7,
    num_leaves=2006,
    min_child_samples=7,
    min_child_weight=0.001,
    min_split_gain=0.0,
    colsample_bytree=0.9359031224606773,
    subsample=0.8604189603227428,
    subsample_freq=0,
    subsample_for_bin=200000,
    reg_alpha=1.7057427635070053,
    reg_lambda=0.5682408091391381,
    class_weight=None,
    importance_type='split',
    n_jobs=None,
    random_state=100,
)

In [None]:
# Fit every model on the same training split, in the order they were defined.
for estimator in (dt, ebm, cat):
    estimator.fit(X_train, y_train)

# Kept as the cell's final expression so the fitted LightGBM estimator is still
# echoed as the cell output.
lgbm.fit(X_train, y_train)

## Evaluating Each Model's Performance

In [None]:
# Decision tree: score the held-out predictions with MSE, R2 and MAE.
dt_pred = dt.predict(X_test)
print("\nFor the Decision Tree Model:")
for prefix, metric in (
    ("\nMSE is : ", mean_squared_error),
    ("\nR2 is : ", r2_score),
    ("\nMAE is : ", mean_absolute_error),
):
    print(prefix + str(metric(y_test, dt_pred)))

In [None]:
# EBM: score the held-out predictions with MSE, R2 and MAE.
ebm_pred = ebm.predict(X_test)
print("\nFor the Explainable Boosting Machine Model:")
for prefix, metric in (
    ("\nMSE is : ", mean_squared_error),
    ("\nR2 is : ", r2_score),
    ("\nMAE is : ", mean_absolute_error),
):
    print(prefix + str(metric(y_test, ebm_pred)))

In [None]:
# CatBoost: score the held-out predictions with MSE, R2 and MAE.
cat_pred = cat.predict(X_test)
print("\nFor the CatBoost Model:")
for prefix, metric in (
    ("\nMSE is : ", mean_squared_error),
    ("\nR2 is : ", r2_score),
    ("\nMAE is : ", mean_absolute_error),
):
    print(prefix + str(metric(y_test, cat_pred)))

In [None]:
# LightGBM: score the held-out predictions with MSE, R2 and MAE.
lgbm_pred = lgbm.predict(X_test)
print("\nFor the LightGBM Model:")
for prefix, metric in (
    ("\nMSE is : ", mean_squared_error),
    ("\nR2 is : ", r2_score),
    ("\nMAE is : ", mean_absolute_error),
):
    print(prefix + str(metric(y_test, lgbm_pred)))

# Global Explainability

## Decision Tree's Tree Structure

In [None]:
# Serialise the fitted tree to Graphviz DOT text; out_file=None makes
# export_graphviz return the source as a string rather than write a file.
# The feature names are handed over as a plain list.
dot_data = export_graphviz(
    dt,
    out_file=None,
    feature_names=list(X.columns),
    filled=True,
)

# Wrap the DOT text so the notebook renders it; as the final expression,
# `graph` is displayed as the cell output.
graph = graphviz.Source(dot_data, format="png")
graph

## EBM Global Explainability

In [None]:
# Build the EBM's global explanation and hand it to interpret's `show`
# for rendering in the notebook.
ebm_global = ebm.explain_global()
show(ebm_global)

## Setting Up SHAP Explainers

In [None]:
# Test features as a DataFrame for the SHAP plots. X_test already comes from
# splitting the DataFrame X, so this mainly pins the columns to X.columns.
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# SHAP for the decision tree: tree-specific explainer, one row of attributions
# per test row.
dt_explainer = shap.TreeExplainer(dt)
dt_shap_values = dt_explainer.shap_values(X_test_df)

#wrapper function for EBM predict to handle feature names
def ebm_predict_wrapper(X):
    """Predict with the fitted EBM from array-like input.

    Re-labels `X` with the training column names before predicting, so the EBM
    receives named features even when the caller passes a bare array.

    Parameters
    ----------
    X : array-like, one column per training feature, in training order.

    Returns
    -------
    The output of `ebm.predict` for those rows.
    """
    X_df = pd.DataFrame(X, columns=X_train.columns)
    return ebm.predict(X_df)


#SHAP for ebm
# Kernel SHAP is model-agnostic and evaluates the model against the background
# rows for every perturbed sample of every explained row, so its cost grows
# with the background size. Passing all of X_train makes this cell impractically
# slow (shap itself warns about large backgrounds), so a fixed-seed random
# sample of the training rows is used as the background instead.
EBM_SHAP_BACKGROUND_ROWS = 100
ebm_background = shap.sample(X_train, EBM_SHAP_BACKGROUND_ROWS, random_state=100)

ebm_explainer = shap.KernelExplainer(ebm_predict_wrapper, ebm_background)
ebm_shap_values = ebm_explainer.shap_values(X_test_df)

# SHAP for CatBoost: tree-specific explainer, attributions for every test row.
cat_explainer = shap.TreeExplainer(cat)
cat_shap_values = cat_explainer.shap_values(X_test_df)

# SHAP for LightGBM: TreeExplainer is requested explicitly, as for the other
# tree models, rather than relying on shap.Explainer's automatic choice. The
# later cells use the tree explainer's `shap_values` / `expected_value` API.
lgbm_explainer = shap.TreeExplainer(lgbm)
lgbm_shap_values = lgbm_explainer.shap_values(X_test_df)

## SHAP Summary Plots For Each Model

In [None]:
# Decision tree: SHAP summary plot of the attributions over the test rows.
shap.summary_plot(dt_shap_values, X_test_df)

In [None]:
# EBM: SHAP summary plot of the attributions over the test rows.
shap.summary_plot(ebm_shap_values, X_test_df)

In [None]:
# CatBoost: SHAP summary plot of the attributions over the test rows.
shap.summary_plot(cat_shap_values, X_test_df)

In [None]:
# LightGBM: SHAP summary plot of the attributions over the test rows.
shap.summary_plot(lgbm_shap_values, X_test_df)

# Local Explainability

## Selecting Instance

In [None]:
# Positional (iloc) index of the test row that the local-explanation cells
# below look up; change it to inspect a different instance.
index = 0

## Decision Path

In [None]:
# One-row frame for the selected instance (double brackets keep it 2-D).
X_instance = X_test.iloc[[index]]

node_indicator = dt.decision_path(X_instance)
leaf_id = dt.apply(X_instance)

# Grab the fitted tree's per-node arrays once instead of on every iteration.
fitted_tree = dt.tree_
left_child = fitted_tree.children_left
right_child = fitted_tree.children_right
split_feature = fitted_tree.feature
split_threshold = fitted_tree.threshold

print(f"\nDecision path for instance {index}:")
for node_id in node_indicator.indices:
    # Nodes whose two child pointers are equal carry no split test to report.
    if left_child[node_id] == right_child[node_id]:
        continue

    feature_pos = split_feature[node_id]
    feature = X_test.columns[feature_pos]
    threshold = split_threshold[node_id]
    observed = X_instance.iloc[0, feature_pos]

    threshold_sign = "<=" if observed <= threshold else ">"
    print(f"  {feature} = {observed:.2f} "
          f"{threshold_sign} {threshold:.2f}")

# Compare the tree's prediction for this row with the recorded target.
pred_value = dt.predict(X_instance)[0]
if isinstance(y_test, pd.Series):
    true_value = y_test.iloc[index]
else:
    true_value = y_test[index]

print(f"\nPredicted value: {pred_value}")
print(f"Actual value: {true_value}")

## Local EBM Explainability

In [None]:
# Explain only the instance selected by `index`, like the other local-explanation
# cells, instead of building an explanation for every row of the test set.
# Double brackets keep the single row / single target 2-D and 1-D respectively.
ebm_local = ebm.explain_local(X_test.iloc[[index]], y_test.iloc[[index]])
show(ebm_local)

## SHAP Force Plots

In [None]:
# Decision tree: force plot for the selected test row.
shap.initjs()  # load the JavaScript the interactive plot needs
shap.force_plot(
    base_value=dt_explainer.expected_value,
    shap_values=dt_shap_values[index, :],
    features=X_test_df.iloc[index],
)

In [None]:
# EBM: force plot for the selected test row.
shap.initjs()  # load the JavaScript the interactive plot needs
shap.force_plot(
    base_value=ebm_explainer.expected_value,
    shap_values=ebm_shap_values[index, :],
    features=X_test_df.iloc[index],
)

In [None]:
# CatBoost: force plot for the selected test row.
shap.initjs()  # load the JavaScript the interactive plot needs
shap.force_plot(
    base_value=cat_explainer.expected_value,
    shap_values=cat_shap_values[index, :],
    features=X_test_df.iloc[index],
)

In [None]:
# LightGBM: force plot for the selected test row.
shap.initjs()  # load the JavaScript the interactive plot needs
shap.force_plot(
    base_value=lgbm_explainer.expected_value,
    shap_values=lgbm_shap_values[index, :],
    features=X_test_df.iloc[index],
)

## Setting Up the LIME Explainer

In [None]:
# One LIME explainer shared by all four models, built from the training features.
# LIME explains by sampling random perturbations, so without a fixed seed the
# reported weights change on every run; random_state matches the seed used for
# the models above.
explainer = LimeTabularExplainer(
    X_train.values,
    feature_names=X.columns.tolist(),
    mode='regression',
    random_state=100,
)

## LIME Explanations for Each Model

In [None]:
# Decision tree: LIME explanation of the selected test row.
def _dt_lime_predict(samples):
    """Predict with the decision tree after re-attaching the feature names to `samples`."""
    return dt.predict(pd.DataFrame(samples, columns=X_test.columns))


dt_lime = explainer.explain_instance(
    X_test.iloc[index].values,
    _dt_lime_predict,
    num_features=len(X.columns),  # report a weight for every feature
)
dt_lime.as_list()

In [None]:
# EBM: LIME explanation of the selected test row.
def _ebm_lime_predict(samples):
    """Predict with the EBM after re-attaching the feature names to `samples`."""
    return ebm.predict(pd.DataFrame(samples, columns=X_test.columns))


ebm_lime = explainer.explain_instance(
    X_test.iloc[index].values,
    _ebm_lime_predict,
    num_features=len(X.columns),  # report a weight for every feature
)
ebm_lime.as_list()

In [None]:
# CatBoost: LIME explanation of the selected test row.
def _cat_lime_predict(samples):
    """Predict with CatBoost after re-attaching the feature names to `samples`."""
    return cat.predict(pd.DataFrame(samples, columns=X_test.columns))


cat_lime = explainer.explain_instance(
    X_test.iloc[index].values,
    _cat_lime_predict,
    num_features=len(X.columns),  # report a weight for every feature
)
cat_lime.as_list()

In [None]:
# LightGBM: LIME explanation of the selected test row.
def _lgbm_lime_predict(samples):
    """Predict with LightGBM after re-attaching the feature names to `samples`."""
    return lgbm.predict(pd.DataFrame(samples, columns=X_test.columns))


lgbm_lime = explainer.explain_instance(
    X_test.iloc[index].values,
    _lgbm_lime_predict,
    num_features=len(X.columns),  # report a weight for every feature
)
lgbm_lime.as_list()