# 06 - Explainability & Interpretability (XAI)

Understand model predictions using built-in feature importances and SHAP (summary, dependence, and force plots).

In [None]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load Model and Data

In [None]:
# Load the engineered dataset produced by the earlier pipeline steps.
df = pd.read_csv('../data/processed/featured_dataset.csv')

# Load the persisted model and scaler.
# NOTE(security): pickle.load can execute arbitrary code. Only load artifacts
# that were produced by this project's own training notebooks.
with open('../models/best_model.pkl', 'rb') as f:
    best_model = pickle.load(f)

with open('../models/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

print(f"* Model loaded: {type(best_model).__name__}")

# Separate the features from the regression target.
target_col = 'quantity'
X = df.drop(columns=[target_col])
y = df[target_col]

# Same split parameters as used here originally (20% hold-out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply the *already fitted* scaler loaded from disk. Calling fit_transform
# here would refit it and overwrite the statistics the model was trained
# with, so the explanations could be computed on differently scaled inputs.
# (Assumes scaler.pkl holds a fitted scaler - TODO confirm against the
# training notebook.)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"* Data prepared: {X_test_scaled.shape}")

## Feature Importance Analysis

In [None]:
# Built-in importances are only exposed by some estimators; for any other
# model this cell just reports that they are unavailable.
if hasattr(best_model, 'feature_importances_'):
    importances = np.asarray(best_model.feature_importances_, dtype=float)

    # Normalise to shares of the total. Guard the all-zero case so the
    # column holds zeros instead of NaN from a 0/0 division.
    total_importance = importances.sum()
    if total_importance > 0:
        importances_normalized = importances / total_importance
    else:
        importances_normalized = np.zeros_like(importances)

    feature_importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': importances,
        'importance_normalized': importances_normalized
    }).sort_values('importance', ascending=False)

    top_15 = feature_importance_df.head(15)

    print("Top 15 Important Features:")
    print(top_15.to_string())

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 6))
    top_15.plot(x='feature', y='importance', kind='barh', ax=ax, legend=False)
    # barh draws the first row at the bottom; flip the axis so the most
    # important feature appears at the top of the chart.
    ax.invert_yaxis()
    ax.set_xlabel('Importance')
    ax.set_ylabel('Features')
    ax.set_title('Top 15 Feature Importances')
    fig.tight_layout()
    fig.savefig('../reports/xai_feature_importance.png', dpi=100)
    plt.show()
else:
    print("Model does not have built-in feature importances")

## SHAP Values Analysis

In [None]:
# Create SHAP explainer
print("Computing SHAP values... (this may take a moment)")

# Try TreeExplainer first and fall back to the model-agnostic KernelExplainer
# if it fails. Only the SHAP computation sits inside the try block, so an
# unrelated error (e.g. while printing) cannot silently trigger the slow
# fallback, and `except Exception` lets KeyboardInterrupt propagate.
try:
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test_scaled)
except Exception as exc:
    print("Could not compute SHAP for this model type. Trying KernelExplainer...")
    print(f"  ({type(exc).__name__}: {exc})")
    # The fallback uses the first 100 training rows as background data and
    # explains only the first 100 test rows, so later cells must align
    # their feature matrix to len(shap_values).
    explainer = shap.KernelExplainer(best_model.predict, X_train_scaled[:100])
    shap_values = explainer.shap_values(X_test_scaled[:100])
    print("✓ SHAP values computed (using KernelExplainer)")
else:
    print("✓ SHAP values computed")
    # np.shape also works if the explainer returns a list instead of an array.
    print(f"  SHAP shape: {np.shape(shap_values)}")

## SHAP Summary Plot

In [None]:
# Summary plot (mean |SHAP| per feature, as a bar chart)

# shap_values may cover only the leading test rows (the KernelExplainer
# fallback explains the first 100), so slice the feature matrix to the same
# number of rows. Assumes shap_values is a 2-D (rows x features) array, as
# for a single-output regressor - TODO confirm for other model types.
n_explained = len(shap_values)
X_explained = X_test_scaled[:n_explained]

plt.figure()
# summary_plot resizes the current figure itself (plot_size defaults to
# "auto"), so the size has to be requested through plot_size rather than
# plt.figure(figsize=...).
shap.summary_plot(
    shap_values,
    X_explained,
    feature_names=list(X.columns),
    show=False,
    plot_type='bar',
    plot_size=(12, 6),
)
plt.tight_layout()
plt.savefig('../reports/shap_summary_bar.png', dpi=100)
plt.show()

## SHAP Dependence Plots

In [None]:
# Dependence plots for the three features with the highest built-in
# importance. Skipped when the model exposes no feature_importances_.
if hasattr(best_model, 'feature_importances_'):
    top_features = feature_importance_df.head(3)['feature'].tolist()
    feature_names = list(X.columns)

    # shap_values may cover only the leading test rows (the KernelExplainer
    # fallback explains the first 100); keep the feature matrix aligned.
    X_explained = X_test_scaled[:len(shap_values)]

    # squeeze=False keeps `axes` two-dimensional even for a single feature,
    # so the row can always be iterated.
    fig, axes = plt.subplots(1, len(top_features), figsize=(15, 4), squeeze=False)

    for ax, feature in zip(axes[0], top_features):
        # Without ax=, dependence_plot opens a new figure per call and the
        # subplot grid created above stays empty.
        shap.dependence_plot(
            feature,
            shap_values,
            X_explained,
            feature_names=feature_names,
            ax=ax,
            show=False,
        )

    fig.tight_layout()
    fig.savefig('../reports/shap_dependence_plots.png', dpi=100)
    plt.show()

## Individual Prediction Explanation

In [None]:
# Explain a single prediction with a SHAP force plot.
sample_idx = 0
sample_shap = shap_values[sample_idx]
sample_features = X_test_scaled[sample_idx]

# With matplotlib=True, force_plot builds its own figure, so a preceding
# plt.figure(...) would only leave an empty extra figure. The size is
# requested through force_plot's figsize argument instead.
shap.force_plot(
    explainer.expected_value,
    sample_shap,
    sample_features,
    feature_names=list(X.columns),
    matplotlib=True,
    show=False,
    figsize=(12, 6),
)
plt.tight_layout()
plt.savefig('../reports/shap_force_plot.png', dpi=100)
plt.show()

# Compare the model output with the ground truth for the explained sample.
sample_prediction = best_model.predict(X_test_scaled[sample_idx:sample_idx+1])[0]

print(f"Explanation for sample {sample_idx}:")
print(f"  Actual value: {y_test.iloc[sample_idx]:.2f}")
print(f"  Predicted value: {sample_prediction:.2f}")

## Model Behavior Insights

In [None]:
# Text summary: top features plus a comparison of predicted vs. actual
# target distributions on the test split.
print("\n" + "="*60)
print("KEY INSIGHTS FROM MODEL EXPLAINABILITY")
print("="*60)

if hasattr(best_model, 'feature_importances_'):
    print("\nTop 5 Most Important Features:")
    top_5 = feature_importance_df.head(5)
    # feature_importance_df is sorted by importance but keeps its original
    # index labels, so the rank must come from enumerate, not the index.
    for rank, (feature, share) in enumerate(
        zip(top_5['feature'], top_5['importance_normalized']), start=1
    ):
        print(f"  {rank}. {feature}: {share*100:.2f}%")

# Predict once and reuse the result for both statistics.
test_predictions = np.asarray(best_model.predict(X_test_scaled))

print("\nModel Behavior:")
print(f"  - Average prediction: {test_predictions.mean():.2f}")
print(f"  - Average actual: {y_test.mean():.2f}")
# pandas' Series.std uses ddof=1 while NumPy defaults to ddof=0; use ddof=1
# for the predictions too so the two spreads are directly comparable.
print(f"  - Prediction std: {test_predictions.std(ddof=1):.2f}")
print(f"  - Actual std: {y_test.std():.2f}")

print("\n" + "="*60)

## Recommendations

In [None]:
# Closing cell: static, hand-written recommendations and next steps.
# Nothing here is computed from the model or the data.
recommendations_text = """
BUSINESS INSIGHTS & RECOMMENDATIONS:

1. **Top Drivers**: Focus on understanding and monitoring the top 5 important features
2. **Feature Engineering**: Consider creating interactions between top features
3. **Data Collection**: Ensure high-quality data for important features
4. **Monitoring**: Set up alerts for anomalies in key predictors
5. **Retraining**: Regularly retrain the model as data patterns evolve

NEXT STEPS:
- Deploy the model to production
- Set up monitoring dashboards
- Implement feedback loops for continuous improvement
- Document findings for stakeholders
"""
print(recommendations_text)