In [5]:
import pandas as pd
import numpy as np

# Build a synthetic Energy Efficiency regression table and write it to disk.
# The legacy global NumPy RNG is seeded once; every draw below consumes that
# stream in sequence, so the order of these statements fixes the generated values.
np.random.seed(42)
n = 1000

# Feature columns, drawn one after another (insertion order == column order).
data = {}
data["Relative_Compactness"] = np.random.uniform(0.5, 1.0, size=n)
data["Surface_Area"] = np.random.uniform(500, 1000, size=n)
data["Wall_Area"] = np.random.uniform(200, 500, size=n)
data["Roof_Area"] = np.random.uniform(100, 300, size=n)
data["Overall_Height"] = np.random.choice([3.5, 7], size=n)
data["Orientation"] = np.random.randint(2, 6, size=n)  # upper bound is exclusive
data["Glazing_Area"] = np.random.uniform(0, 0.4, size=n)
data["Glazing_Area_Distribution"] = np.random.randint(1, 5, size=n)  # upper bound is exclusive

# Target: heating load as a simplified linear function of the features plus
# Gaussian noise. Terms are accumulated in a fixed left-to-right order.
tall_building = (data["Overall_Height"] == 7).astype(int)  # 1 where height is 7, else 0
noise = np.random.normal(0, 2, size=n)  # last draw taken from the seeded stream

heating_load = 30 - 10 * data["Relative_Compactness"]
heating_load += 0.02 * data["Surface_Area"]
heating_load += 0.03 * data["Wall_Area"]
heating_load -= 0.05 * data["Roof_Area"]
heating_load += 2 * tall_building
heating_load += noise

data["Heating_Load"] = heating_load

df_energy = pd.DataFrame(data)

# Persist without the index so the CSV holds only the feature and target columns.
df_energy.to_csv("energy_efficiency.csv", index=False)

print("✅ energy_efficiency.csv created")
print(df_energy.head())


✅ energy_efficiency.csv created
   Relative_Compactness  Surface_Area   Wall_Area   Roof_Area  Overall_Height  \
0              0.687270    592.566464  278.511705  234.540599             7.0   
1              0.975357    770.950474  274.093640  259.336279             3.5   
2              0.865997    936.472918  471.876374  150.093580             7.0   
3              0.799329    866.112443  274.863860  224.974820             3.5   
4              0.578009    903.280574  281.584918  214.349197             3.5   

   Orientation  Glazing_Area  Glazing_Area_Distribution  Heating_Load  
0            2      0.157454                          1     33.059491  
1            5      0.189374                          1     30.892892  
2            2      0.341819                          3     49.020297  
3            3      0.136002                          1     36.521786  
4            2      0.347860                          1     40.887618  


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import shap

# =============================
# 0. Helpers
# =============================
def _save_figure(fig, filename):
    """Write ``fig`` to ``filename``, close it, and print a confirmation line.

    Parameters
    ----------
    fig : matplotlib.figure.Figure
        The figure to save.
    filename : str
        Output path; saved at 300 dpi with a tight bounding box.
    """
    fig.savefig(filename, dpi=300, bbox_inches="tight")
    plt.close(fig)  # close explicitly so the next plot does not draw onto this figure
    print(f"✅ Saved {filename}")


# =============================
# 1. Load Dataset
# =============================
# Reads the CSV from the working directory; it must already exist there.
df = pd.read_csv("energy_efficiency.csv")   # use the file you just created

X = df.drop(columns="Heating_Load")
y = df["Heating_Load"]

# Train-test split: 20% of rows held out, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# =============================
# 2. Train Random Forest Regressor
# =============================
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Compute MSE once and derive RMSE from it instead of re-scoring the predictions.
mse = mean_squared_error(y_test, y_pred)

print("=== Regression Report ===")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("R² Score:", r2_score(y_test, y_pred))

# =============================
# 3. Feature Importance (RF built-in)
# =============================
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, largest importance first

# Explicit figure/axes so the chart does not depend on pyplot's implicit current figure.
fig, ax = plt.subplots()
positions = np.arange(len(indices))
ax.bar(positions, importances[indices])
ax.set_xticks(positions)
ax.set_xticklabels(X.columns[indices], rotation=45, ha="right")
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
ax.set_title("Random Forest Feature Importance")
_save_figure(fig, "rf_feature_importance.png")

# =============================
# 4. SHAP Explainability
# =============================
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_test)

# SHAP Summary Plot
# show=False keeps the plot on the current pyplot figure so it can be saved below.
shap.summary_plot(shap_values, X_test, show=False)
_save_figure(plt.gcf(), "shap_summary.png")

# SHAP Waterfall Plot (first test sample)
shap.plots.waterfall(shap_values[0], show=False)
_save_figure(plt.gcf(), "shap_waterfall.png")

print("\n🎯 All outputs ready: rf_feature_importance.png, shap_summary.png, shap_waterfall.png")


=== Regression Report ===
MAE: 1.864728447711012
MSE: 5.431209048935386
RMSE: 2.3304954513869762
R² Score: 0.8211466168005007
✅ Saved rf_feature_importance.png


  shap.summary_plot(shap_values, X_test, show=False)


✅ Saved shap_summary.png
✅ Saved shap_waterfall.png

🎯 All outputs ready: rf_feature_importance.png, shap_summary.png, shap_waterfall.png
