In [None]:
import os
import joblib
import shap
import pandas as pd
import numpy as np


# Project root = PARENT of the current working directory (not the working
# directory itself).
# NOTE(review): this assumes the notebook is run from a sub-folder of the
# project (e.g. notebooks/) -- confirm against the repository layout.
BASE_DIR = os.path.dirname(os.getcwd())

# Locations of the fitted model and of the processed train/test splits.
model_path = os.path.join(BASE_DIR, "models", "lr.pkl")
data_train_path = os.path.join(BASE_DIR, "data", "processed", "sepsis_train.csv")
data_test_path = os.path.join(BASE_DIR, "data", "processed", "sepsis_test.csv")

# Load the persisted model. joblib.load unpickles the file, so only point it
# at model files you trust.
raw_model = joblib.load(model_path)

# A fitted search object (e.g. RandomizedSearchCV) exposes the refitted
# pipeline as `best_estimator_`; a plain pipeline is used as-is.
model_best = getattr(raw_model, "best_estimator_", raw_model)

# Pull the two pipeline steps apart: preprocessing and the linear classifier.
preprocessor = model_best.named_steps["columntransformer"]
clf = model_best.named_steps["logisticregression"]

# Read both splits and separate the predictors from the target column.
sepsis_train = pd.read_csv(data_train_path)
sepsis_test = pd.read_csv(data_test_path)

y_train = sepsis_train["hospital_outcome"]
X_train = sepsis_train.drop(columns=["hospital_outcome"])

y_test = sepsis_test["hospital_outcome"]
X_test = sepsis_test.drop(columns=["hospital_outcome"])

# Apply the already-fitted preprocessing (transform only, no re-fitting).
X_train_t = preprocessor.transform(X_train)
X_test_t = preprocessor.transform(X_test)

# Densify when the transformer returned a sparse matrix.
if hasattr(X_train_t, "toarray"):
    X_train_t, X_test_t = X_train_t.toarray(), X_test_t.toarray()

# Explain the classifier with the transformed training data as background,
# then compute SHAP values for every transformed test row.
explainer = shap.LinearExplainer(clf, X_train_t)
shap_values = explainer.shap_values(X_test_t)

# Global importance per feature: mean of |SHAP| taken along axis 0.
mean_abs_shap = np.abs(shap_values).mean(axis=0)

# Names of the transformed columns as reported by the preprocessor.
raw_feature_names = preprocessor.get_feature_names_out()

# Strip the transformer prefixes and present the encoded column "sex_1"
# simply as "sex".
clean_feature_names = []
for name in raw_feature_names:
    clean = name
    for prefix in ("onehotencoder__", "standardscaler__"):
        clean = clean.replace(prefix, "")
    clean_feature_names.append("sex" if clean == "sex_1" else clean)

# Pair every feature name with its importance ...
feature_importance = list(zip(clean_feature_names, mean_abs_shap))

# ... and rank the pairs from most to least important.
sorted_feature_importance = list(feature_importance)
sorted_feature_importance.sort(key=lambda pair: pair[1], reverse=True)

# Report the ranking.
print("Mean absolute SHAP values are shown below & higher values indicate higher feature importance.\n")

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.5f}")


age_in_years: 0.82624
sex: 0.08918
episode_number: 0.00859


SHAP Explanation and Feature Importance

- SHAP (SHapley Additive exPlanations) is a method used to explain how individual features contribute to a machine learning model’s predictions. It assigns a numerical value to each feature that represents how much that feature pushes the prediction up or down. The sign of a SHAP value gives the direction of the push, while a larger absolute SHAP value indicates a higher level of importance, meaning the feature has a stronger influence on the model's decision. SHAP is particularly useful for understanding both global feature importance and how a single observation is being predicted. In this analysis, SHAP values are used to determine which features are most important in the logistic regression model predicting sepsis hospital survival.  

- To interpret the logistic regression model, SHAP values were computed to measure the contribution of each feature to the model’s output. The model includes three predictors: age_in_years, sex, and episode_number. The values reported above are mean absolute SHAP values, i.e. the average magnitude by which each feature affects the prediction across all test samples.  

- The results show that age_in_years is the most important feature, with a mean absolute SHAP value of 0.82624. This clearly indicates that age has the strongest overall influence on the model’s predictions. The feature sex has a mean absolute SHAP value of 0.08918, roughly a tenth of the value for age, suggesting that sex has only a minor influence on the model’s output. The smallest mean absolute SHAP value is associated with episode_number, at 0.00859, showing that this feature has almost no effect on the predictions and is effectively ignored by the model.  

- Overall, the SHAP analysis shows that age_in_years is the dominant predictor used by the model, sex has minimal influence, and episode_number contributes almost nothing to the final prediction. This provides a clear understanding of which features drive the model and confirms that, for this logistic regression model, age is the primary factor in predicting sepsis survival outcomes.  