ASSIGNMENT 4:
Objectives:
- Use Permutation Importance for global feature contributions.
- Apply SHAP to visualize why some patients are predicted malignant/benign.
- Use LIME to generate local explanations for two patients.
- Compare alignment and differences between methods.
Assignment Details:
- Goal: Interpret how features influence breast cancer classification across multiple
  explanation techniques.
- Data: sklearn.datasets.load_breast_cancer()
- Model: GradientBoostingClassifier
- Steps:
1. Train GradientBoostingClassifier.
2. Permutation Importance: Rank features by prediction impact.
3. SHAP: Create global summary plot and local force plots for one malignant and one
benign case.
4. LIME: Generate local explanations for the same two cases.
5. Compare results across methods.
- Deliverables:
  - Permutation Importance plot.
  - SHAP summary + 2 force plots.
  - LIME explanations for 2 cases.
  - Comparative analysis across PI, SHAP, and LIME.
Submission Requirements:
- Short methods summary (3–5 lines).
- Clean, runnable code/notebook.
- All required plots (PI, SHAP global + local, LIME local).
- 5–10 bullet insights highlighting consistencies and differences.

In [2]:
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m143.4/275.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=45abc25034880bca63019507322ae237e99183d160817d700030486f3260645b
  Stored in directory: /root/.cache/pip/wheels/e7/5d/0e/4b4fff9a47468fed5633211fb3b76d1db43fe806a17fb7486a
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [8]:
#from lime.lime_tabular import LimeTabularExplainer

In [9]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import shap
from lime.lime_tabular import LimeTabularExplainer
import joblib

# Single folder that receives every artefact written by this script
# (model dump, CSV tables, PNG plots, HTML force plots, text summaries).
OUT_DIR = "interpretability_outputs"
# exist_ok=True makes re-running the cell safe when the folder already exists.
os.makedirs(name=OUT_DIR, exist_ok=True)

# Load the breast-cancer dataset and wrap it in pandas containers so the
# feature names travel with the data. Further down, the script treats label 0
# as "malignant" and label 1 as "benign".
data = load_breast_cancer()
target_names = data.target_names
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Hold out 25% of the rows for evaluation; stratify on y so both splits keep
# the class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Train a gradient-boosting classifier with default hyper-parameters (seeded
# for reproducibility) and persist the fitted estimator next to the plots.
model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
model_path = os.path.join(OUT_DIR, "gbc_model.joblib")
joblib.dump(model, model_path)

# --- Permutation importance (global view) ----------------------------------
# Each feature is shuffled 30 times on the held-out test set and the drop in
# accuracy is recorded; a larger mean drop means the model leans on that
# feature more.
perm = permutation_importance(
    model,
    X_test,
    y_test,
    scoring="accuracy",
    n_repeats=30,
    random_state=42,
    n_jobs=-1,
)

# Tabulate mean and spread per feature, most important first, and save it.
perm_df = pd.DataFrame(
    {
        "feature": X.columns,
        "importance_mean": perm.importances_mean,
        "importance_std": perm.importances_std,
    }
).sort_values("importance_mean", ascending=False)
perm_df.to_csv(os.path.join(OUT_DIR, "permutation_importance.csv"), index=False)

# barh draws the first row at the bottom, so sort ascending to put the most
# important feature at the top of the chart.
perm_plot_df = perm_df.sort_values("importance_mean", ascending=True)
plt.figure(figsize=(8, 10))
plt.barh(
    perm_plot_df["feature"],
    perm_plot_df["importance_mean"],
    # Show the spread across repeats: many features have a mean that is
    # smaller than its standard deviation, i.e. indistinguishable from zero.
    xerr=perm_plot_df["importance_std"],
    capsize=3,
)
# Zero reference line makes it easy to see which error bars straddle "no effect".
plt.axvline(0, color="black", linewidth=0.8)
plt.xlabel("Decrease in accuracy (mean +/- std over repeats)")
plt.title("Permutation Importance (test set)")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "permutation_importance.png"))
plt.close()

# --- SHAP (global view) ------------------------------------------------------
# Build the explainer from the fitted model, handing it the training frame as
# its data argument, then attribute every test-set prediction.
explainer = shap.Explainer(model, X_train, feature_names=X.columns)
shap_values = explainer(X_test)

# Summary plot over all test rows; show=False keeps the figure open so it can
# be written to disk instead of being displayed.
shap_summary_path = os.path.join(OUT_DIR, "shap_summary.png")
plt.figure(figsize=(8, 6))
shap.summary_plot(shap_values, X_test, show=False)
plt.tight_layout()
plt.savefig(shap_summary_path)
plt.close()

# --- Pick one predicted-malignant and one predicted-benign test case ---------
preds = model.predict(X_test)
# Column 1 of predict_proba is the probability of class 1 (benign).
probs = model.predict_proba(X_test)[:, 1]

# Positions (within the test set) of each predicted class.
idx_malignant = np.flatnonzero(preds == 0)
idx_benign = np.flatnonzero(preds == 1)

# Fallbacks for the degenerate case where a class is never predicted: take the
# row with the lowest / highest benign probability instead.
if idx_malignant.size == 0:
    idx_malignant = [int(np.argmin(probs))]
if idx_benign.size == 0:
    idx_benign = [int(np.argmax(probs))]

# Use the first candidate of each class as the case to explain.
i_m, i_b = int(idx_malignant[0]), int(idx_benign[0])

# Record which test rows were chosen, their true labels and the model's
# benign probability, so the saved plots can be traced back to the data.
# Resulting keys, in order: malignant_index_in_test, benign_index_in_test,
# malignant_true_label, benign_true_label, malignant_pred_prob_benign,
# benign_pred_prob_benign.
selected_cases = (("malignant", i_m), ("benign", i_b))
case_info = {
    **{f"{tag}_index_in_test": pos for tag, pos in selected_cases},
    **{f"{tag}_true_label": int(y_test.iloc[pos]) for tag, pos in selected_cases},
    **{f"{tag}_pred_prob_benign": float(probs[pos]) for tag, pos in selected_cases},
}

case_info_path = os.path.join(OUT_DIR, "case_info.json")
with open(case_info_path, "w") as f:
    f.write(json.dumps(case_info, indent=2))

# --- SHAP (local view): one waterfall plot per selected case ----------------
# Malignant case first, then benign; each plot gets its own figure and file.
for case_pos, png_name in (
    (i_m, "shap_malignant_waterfall.png"),
    (i_b, "shap_benign_waterfall.png"),
):
    plt.figure(figsize=(6, 4))
    # show=False leaves the figure open so it can be saved below.
    shap.plots.waterfall(shap_values[case_pos], show=False)
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, png_name))
    plt.close()

# --- SHAP (local view): interactive force plots saved as standalone HTML ----
force_malignant = shap.plots.force(shap_values[i_m])
shap.save_html(os.path.join(OUT_DIR, "shap_force_malignant.html"), force_malignant)

force_benign = shap.plots.force(shap_values[i_b])
shap.save_html(os.path.join(OUT_DIR, "shap_force_benign.html"), force_benign)

# --- LIME (local view) -------------------------------------------------------
# The explainer is built from the training matrix (plain ndarray, as LIME
# expects), with feature and class names supplied for readable output.
lime_explainer = LimeTabularExplainer(
    X_train.values,
    feature_names=X.columns.tolist(),
    class_names=target_names.tolist(),
    discretize_continuous=True,
    random_state=42,
)


def _predict_proba_named(samples):
    """Return class probabilities for a bare array of LIME-perturbed samples.

    The model was fitted on a DataFrame, so scikit-learn warns about missing
    feature names whenever it is queried with a plain ndarray. LIME only ever
    passes ndarrays, so the column names are re-attached here before
    predicting. The probabilities themselves are unchanged.
    """
    return model.predict_proba(pd.DataFrame(samples, columns=X.columns))


# Explain the same two test rows that were used for the SHAP local plots,
# keeping the 10 most influential features for each.
exp_m = lime_explainer.explain_instance(X_test.values[i_m], _predict_proba_named, num_features=10)
exp_b = lime_explainer.explain_instance(X_test.values[i_b], _predict_proba_named, num_features=10)

# Render each LIME explanation as a bar chart, title it with the case it
# belongs to, save it as PNG and release the figure.
fig_m = exp_m.as_pyplot_figure()
fig_b = exp_b.as_pyplot_figure()
for lime_fig, case_name in ((fig_m, "malignant"), (fig_b, "benign")):
    lime_fig.suptitle(f"LIME explanation ({case_name} case)")
    lime_fig.savefig(os.path.join(OUT_DIR, f"lime_{case_name}.png"), bbox_inches="tight")
    plt.close(lime_fig)

# Export the (rule, weight) pairs behind each LIME plot as CSV tables.
lime_m_list = exp_m.as_list()
lime_b_list = exp_b.as_list()
lime_columns = ["feature_and_rule", "weight"]
for lime_rows, csv_name in (
    (lime_m_list, "lime_malignant.csv"),
    (lime_b_list, "lime_benign.csv"),
):
    lime_table = pd.DataFrame(lime_rows, columns=lime_columns)
    lime_table.to_csv(os.path.join(OUT_DIR, csv_name), index=False)

# Short methods summary deliverable; adjacent literals are concatenated by the
# parser into one single-line string.
methods_summary = (
    "Trained GradientBoostingClassifier on sklearn breast cancer dataset. "
    "Permutation importance for global ranking. "
    "SHAP for global and local attributions. "
    "LIME for local linear surrogate explanations."
)
summary_path = os.path.join(OUT_DIR, "methods_summary.txt")
with open(summary_path, "w") as f:
    f.write(methods_summary)

# Comparative insights across the three explanation methods (deliverable).
comparative_bullets = [
    "Permutation Importance (PI) gives global feature ranking by how shuffling each feature reduces accuracy.",
    "SHAP provides consistent additive contributions per feature (global and local), showing both direction and magnitude.",
    "LIME fits a local linear surrogate — quick, interpretable, but can vary with sampling and discretization.",
    "Top features (PI) often match top average contributors in SHAP summary.",
    "Locally, SHAP gives feature pushes toward/away from the predicted class; LIME often agrees on top contributors but can differ.",
    "If PI is high but SHAP/LIME show low local contribution for a case, that feature is globally important but not relevant for that patient.",
    "Use PI for global view, SHAP for stable local+global attribution, and LIME as a fast sanity-check local surrogate.",
]
# The text contains a non-ASCII em dash, so pin the encoding explicitly instead
# of relying on the platform/locale default (which may not be UTF-8).
with open(os.path.join(OUT_DIR, "comparative_bullets.txt"), "w", encoding="utf-8") as f:
    # One markdown-style "- " bullet per line, no trailing newline.
    f.write("\n".join(f"- {bullet}" for bullet in comparative_bullets))

# --- Console report ----------------------------------------------------------
# List everything that was written, in alphabetical order.
print(f"Saved outputs in: {OUT_DIR}")
for saved_name in sorted(os.listdir(OUT_DIR)):
    print(f"- {saved_name}")

# perm_df is already sorted by descending mean importance.
print("\nTop 10 features by permutation importance:")
top_features = perm_df.head(10)
print(top_features.to_string(index=False))

print("\nSelected test-case indices and probs:")
print(json.dumps(case_info, indent=2))




Saved outputs in: interpretability_outputs
- case_info.json
- comparative_bullets.txt
- gbc_model.joblib
- lime_benign.csv
- lime_benign.png
- lime_malignant.csv
- lime_malignant.png
- methods_summary.txt
- permutation_importance.csv
- permutation_importance.png
- shap_benign_waterfall.png
- shap_force_benign.html
- shap_force_malignant.html
- shap_malignant_waterfall.png
- shap_summary.png

Top 10 features by permutation importance:
             feature  importance_mean  importance_std
worst concave points         0.055478        0.015525
       worst texture         0.012121        0.008833
        mean texture         0.007226        0.005560
          area error         0.005594        0.004196
           mean area         0.003497        0.004691
    worst smoothness         0.002331        0.003297
        radius error         0.001399        0.004912
        worst radius         0.001166        0.006276
   worst compactness         0.000932        0.002377
    smoothness error  