In [None]:
import os
import joblib
import shap
import pandas as pd
import numpy as np


# Every input path below is resolved relative to BASE_DIR, which is the
# PARENT of the current working directory (not the working directory itself).
# NOTE(review): this presumably expects the notebook to be run from a folder
# one level below the project root -- confirm against the repo layout.
BASE_DIR = os.path.dirname(os.getcwd())

# Serialized model artefact.
model_path = os.path.join(BASE_DIR, "models", "lr.pkl")

# The train and test CSVs live side by side in data/processed.
_processed_dir = os.path.join(BASE_DIR, "data", "processed")
data_train_path = os.path.join(_processed_dir, "sepsis_train.csv")
data_test_path = os.path.join(_processed_dir, "sepsis_test.csv")

# Uncomment to inspect the resolved locations:
# print("Model path:", model_path)
# print("Train data path:", data_train_path)
# print("Test data path:", data_test_path)

# Read the persisted estimator back from disk.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only point
# model_path at files from a trusted source.
raw_model = joblib.load(model_path)

# If the loaded object carries a `best_estimator_` attribute (e.g. a fitted
# RandomizedSearchCV), use that; otherwise the loaded object is used as-is.
model_best = getattr(raw_model, "best_estimator_", raw_model)

# Pull out the two pipeline steps needed later, looked up by their step names.
_pipeline_steps = model_best.named_steps
preprocessor = _pipeline_steps["columntransformer"]
clf = _pipeline_steps["logisticregression"]

# Load the processed train/test CSVs and separate the features from the label.
# The label column name is kept in one place so the four uses cannot drift.
TARGET_COLUMN = "hospital_outcome"

sepsis_train = pd.read_csv(data_train_path)
sepsis_test = pd.read_csv(data_test_path)

X_train = sepsis_train.drop(columns=[TARGET_COLUMN])
y_train = sepsis_train[TARGET_COLUMN]

X_test = sepsis_test.drop(columns=[TARGET_COLUMN])
y_test = sepsis_test[TARGET_COLUMN]

# Run both feature frames through the preprocessor taken from the loaded
# pipeline. Only `transform` is called here; nothing is refitted.
X_train_t = preprocessor.transform(X_train)
X_test_t = preprocessor.transform(X_test)

# Convert to dense arrays when the transformer returned something exposing
# `.toarray()` (e.g. a scipy sparse matrix). Each matrix is checked on its
# own: previously the test matrix was converted based solely on the type of
# the train matrix, which would fail or skip conversion if the two differed.
if hasattr(X_train_t, "toarray"):
    X_train_t = X_train_t.toarray()
if hasattr(X_test_t, "toarray"):
    X_test_t = X_test_t.toarray()

# Build a SHAP explainer for the linear classifier, passing the transformed
# training matrix as its data argument, then compute SHAP values for the
# transformed test matrix.
explainer = shap.LinearExplainer(clf, X_train_t)
shap_values = explainer.shap_values(X_test_t)

# Uncomment to inspect the result's dimensions:
# print("SHAP values shape:", np.array(shap_values).shape)

# Importance score: absolute SHAP value averaged along axis 0
# (presumably the test-sample axis, leaving one score per transformed
# feature -- confirm against the printed shape above).
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)

# Names of the columns produced by the preprocessor, in output order.
raw_feature_names = preprocessor.get_feature_names_out()

# Transformer prefixes to remove from the generated names, applied in this order.
_TRANSFORMER_PREFIXES = ("onehotencoder__", "standardscaler__")

# Cleaned names that get a friendlier display label.
_DISPLAY_RENAMES = {"sex_1": "sex"}


def _clean_feature_name(raw_name):
    """Return *raw_name* without transformer prefixes, renamed if mapped."""
    stripped = raw_name
    for prefix in _TRANSFORMER_PREFIXES:
        # str.replace removes every occurrence, not only a leading one.
        stripped = stripped.replace(prefix, "")
    return _DISPLAY_RENAMES.get(stripped, stripped)


clean_feature_names = [_clean_feature_name(raw) for raw in raw_feature_names]

# Pair each cleaned feature name with its mean absolute SHAP value
# (both sequences are in the preprocessor's output column order).
feature_importance = list(zip(clean_feature_names, mean_abs_shap))

# Rank the pairs from largest to smallest score.
sorted_feature_importance = sorted(
    feature_importance,
    key=lambda pair: pair[1],
    reverse=True,
)

# Report: a header line, then one "name: score" line per feature.
print("Mean absolute SHAP values are shown below & higher values indicate higher feature importance.\n")

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.5f}")


age_in_years: 0.82624
sex: 0.08918
episode_number: 0.00859
