<a href="https://colab.research.google.com/github/2303A52430/explainable-AI-LAB/blob/main/xai_4_ass_2430.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [4]:
import shap
from lime import lime_tabular

In [6]:
import os

# Every artefact this notebook writes (PNG charts, HTML force plot) goes into
# this folder, given relative to the kernel's working directory.
OUTDIR = "outputs"

# exist_ok=True keeps the cell re-runnable: an already-present folder is not an error.
os.makedirs(name=OUTDIR, exist_ok=True)

In [19]:
# Load the dataset from a local CSV copy, if one sits next to the notebook.
# A missing file is only reported, not raised, so in that case `df` is NOT
# assigned by this cell.
try:
    local_frame = pd.read_csv('energy_efficiency_data.csv')
except FileNotFoundError:
    print("Error: 'energy_efficiency_data.csv' not found. Please upload the dataset or provide the correct path.")
else:
    df = local_frame

Error: 'energy_efficiency_data.csv' not found. Please upload the dataset or provide the correct path.


In [21]:
# Download the UCI Energy Efficiency workbook (ENB2012_data.xlsx) into `df`.
#
# The original fallback retried the exact same URL as the primary request, so
# it could never succeed where the first attempt had failed. The fallback now
# targets the zipped copy of the dataset instead, and the primary error is
# reported rather than silently discarded.
import io
import urllib.request
import zipfile

UCI_XLSX_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/" \
 "ENB2012_data.xlsx"
# NOTE(review): zipped download path used by the redesigned UCI site for
# dataset id 242 -- confirm it is still current before relying on it.
ALT_URL = "https://archive.ics.uci.edu/static/public/242/energy+efficiency.zip"

try:
    df = pd.read_excel(UCI_XLSX_URL, engine="openpyxl")
except Exception as primary_err:
    # Broad catch is deliberate: network, HTTP and parser failures should all
    # trigger the fallback. Show what went wrong so the cause is not hidden.
    print(f"Primary URL failed ({primary_err!r}); trying alternate...")
    with urllib.request.urlopen(ALT_URL, timeout=60) as response:
        archive = zipfile.ZipFile(io.BytesIO(response.read()))
    # Locate the workbook by extension instead of hard-coding its member name.
    xlsx_members = [m for m in archive.namelist() if m.lower().endswith(".xlsx")]
    if not xlsx_members:
        raise RuntimeError(
            f"No .xlsx workbook found in archive downloaded from {ALT_URL}"
        ) from primary_err
    # Read the member fully into memory so openpyxl gets a seekable buffer.
    df = pd.read_excel(io.BytesIO(archive.read(xlsx_members[0])), engine="openpyxl")

In [23]:
# --- Tidy the raw frame -------------------------------------------------
# Header labels are cast to str, trimmed, and have inner spaces turned into
# underscores so they can be matched reliably below.
df.columns = [str(label).strip().replace(" ", "_") for label in df.columns]

# The sheet labels its columns X1..X8 (features) and Y1/Y2 (targets).
# Map those short codes to descriptive names; codes that are absent from the
# frame are simply ignored by `rename`.
rename_map = dict(
    X1="Relative_Compactness",
    X2="Surface_Area",
    X3="Wall_Area",
    X4="Roof_Area",
    X5="Overall_Height",
    X6="Orientation",
    X7="Glazing_Area",
    X8="Glazing_Area_Distribution",
    Y1="Heating_Load",
    Y2="Cooling_Load",
)
df = df.rename(columns=rename_map)

# Some copies of the workbook carry extra, completely empty columns:
# collect every column whose values are all missing and drop them.
na_all_cols = [name for name in df.columns if df[name].isna().all()]
df = df.drop(columns=na_all_cols)

# Restrict the frame to the ten expected columns (in this order), keeping
# only those that are actually present.
expected_cols = [
    "Relative_Compactness",
    "Surface_Area",
    "Wall_Area",
    "Roof_Area",
    "Overall_Height",
    "Orientation",
    "Glazing_Area",
    "Glazing_Area_Distribution",
    "Heating_Load",
    "Cooling_Load",
]
available_cols = [name for name in expected_cols if name in df.columns]
df = df.loc[:, available_cols]
# ===============
# 2) Define features/target (focus on Heating_Load Y1 for this assignment)
# ===============
# The eight building-geometry / glazing descriptors used as model inputs.
feature_cols = [
    "Relative_Compactness",
    "Surface_Area",
    "Wall_Area",
    "Roof_Area",
    "Overall_Height",
    "Orientation",
    "Glazing_Area",
    "Glazing_Area_Distribution",
]
# Regression target; swap in "Cooling_Load" to study the other output.
target_col = "Heating_Load"

# Copies keep later edits to X / y from touching `df`.
X = df.loc[:, feature_cols].copy()
y = df.loc[:, target_col].copy()

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Quick visual check of the first training rows and their targets.
display(X_train.head(), y_train.head())

Unnamed: 0,Relative_Compactness,Surface_Area,Wall_Area,Roof_Area,Overall_Height,Orientation,Glazing_Area,Glazing_Area_Distribution
60,0.82,612.5,318.5,147.0,7.0,2,0.1,1
618,0.64,784.0,343.0,220.5,3.5,4,0.4,2
346,0.86,588.0,294.0,147.0,7.0,4,0.25,2
294,0.9,563.5,318.5,122.5,7.0,4,0.25,1
231,0.66,759.5,318.5,220.5,3.5,5,0.1,4


Unnamed: 0,Heating_Load
60,23.53
618,18.9
346,29.27
294,32.84
231,11.43


In [24]:
 # Random-forest regressor for the heating-load target: 600 trees with no
 # depth cap, a fixed seed for repeatability, and all CPU cores (n_jobs=-1).
 forest_kwargs = {
     "n_estimators": 600,
     "max_depth": None,
     "min_samples_leaf": 1,
     "random_state": 42,
     "n_jobs": -1,
 }
 rfr = RandomForestRegressor(**forest_kwargs)
 # Fit on the training split; as the cell's last expression, the fitted
 # estimator is also what the notebook displays.
 rfr.fit(X_train, y_train)

In [26]:
# ===============
# 4) Permutation Importance (PI)
# ===============
# Shuffle each feature 20 times on the held-out split and record how much the
# model's score changes; the seed makes the shuffles repeatable.
pi = permutation_importance(
    rfr,
    X_test,
    y_test,
    n_repeats=20,
    random_state=42,
    n_jobs=-1,
)

# Mean importance per feature, largest first, labelled with the feature names.
pi_means = pd.Series(pi.importances_mean, index=feature_cols).sort_values(ascending=False)

# Bar chart written to disk only (the figure is closed, not shown inline).
pi_path = os.path.join(OUTDIR, "pi_bar_heating.png")
fig, ax = plt.subplots(figsize=(8, 5))
pi_means.plot(kind="bar", ax=ax)
ax.set_title("Permutation Importance– Heating Load (RandomForest)")
ax.set_ylabel("Mean decrease in R^2 on shuffle")
fig.tight_layout()
fig.savefig(pi_path, dpi=180)
plt.close(fig)
print(f"Saved PI bar chart to: {pi_path}")

Saved PI bar chart to: outputs/pi_bar_heating.png


In [30]:
# ===============
# 5) SHAP– global (summary) & local (force) explanations
# ===============
# Tree-specific explainer for the fitted forest; SHAP values are computed on
# the training split and reused by both global plots below.
explainer = shap.TreeExplainer(rfr)
shap_values = explainer.shap_values(X_train)

# Output locations for the two global views.
shap_beeswarm_path = os.path.join(OUTDIR, "shap_summary_beeswarm.png")
shap_bar_path = os.path.join(OUTDIR, "shap_summary_bar.png")

# Global view 1: beeswarm summary. show=False leaves the figure open so it can
# be saved; it is closed right after to keep it out of the cell output.
plt.figure()
shap.summary_plot(shap_values, X_train, show=False)
plt.tight_layout()
plt.savefig(shap_beeswarm_path, dpi=180)
plt.close()
print(f"Saved SHAP beeswarm to: {shap_beeswarm_path}")

# Global view 2: the same SHAP values drawn as a bar summary.
plt.figure()
shap.summary_plot(shap_values, X_train, plot_type="bar", show=False)
plt.tight_layout()
plt.savefig(shap_bar_path, dpi=180)
plt.close()
print(f"Saved SHAP bar summary to: {shap_bar_path}")
# Local explanation: SHAP force plot for a single "typical" building, chosen
# as the test-set row whose prediction lies closest to the median prediction.
preds = rfr.predict(X_test)
median_pred = np.median(preds)
gap_to_median = np.abs(preds - median_pred)
idx_median = int(gap_to_median.argsort()[0])  # positional index into X_test

# One-row frame, true target and prediction for the chosen building. They are
# not used further in this cell -- presumably later cells read them.
x_instance = X_test.iloc[[idx_median]]
y_true_inst = y_test.iloc[idx_median]
y_pred_inst = preds[idx_median]

# The force plot is given the explainer's base value plus one row of SHAP
# values and the matching feature row.
shap_values_test = explainer.shap_values(X_test)
force = shap.force_plot(
    explainer.expected_value,
    shap_values_test[idx_median],
    X_test.iloc[idx_median],
    matplotlib=False,
)

# matplotlib=False yields the interactive (JS) plot, so it is stored as HTML.
force_path = os.path.join(OUTDIR, "shap_force_local.html")
shap.save_html(force_path, force)
print(f"Saved SHAP local force plot to: {force_path}")

Saved SHAP beeswarm to: outputs/shap_summary_beeswarm.png
Saved SHAP bar summary to: outputs/shap_summary_bar.png
Saved SHAP local force plot to: outputs/shap_force_local.html


In [29]:
# LIME explainer for tabular regression, built from the training split as a
# plain NumPy array. Continuous features are discretized, and the fixed seed
# is passed so LIME's sampling is repeatable.
explainer_lime = lime_tabular.LimeTabularExplainer(
    X_train.to_numpy(),
    mode="regression",
    feature_names=feature_cols,
    discretize_continuous=True,
    verbose=False,
    random_state=42,
)