### Using SHAP for Feature Drift Analysis
**Description**: Utilize SHapley Additive exPlanations (SHAP) values to analyze feature
importance changes over time, indicating feature drift.

In [1]:
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# -------------------------------
# Simulate old and new datasets
# -------------------------------
np.random.seed(0)
data_old = pd.DataFrame({
    'feature_1': np.random.normal(0, 1, 500),
    'feature_2': np.random.normal(5, 2, 500),
    'feature_3': np.random.normal(-2, 1, 500),
    'target': np.random.choice([0, 1], 500)
})

data_new = pd.DataFrame({
    'feature_1': np.random.normal(1, 1, 500),
    'feature_2': np.random.normal(5, 2, 500),
    'feature_3': np.random.normal(-3, 1, 500),
    'target': np.random.choice([0, 1], 500)
})

# -------------------------------
# Train a model on old data
# -------------------------------
X_old = data_old.drop('target', axis=1)
y_old = data_old['target']

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_old, y_old)

# -------------------------------
# SHAP explainer and values
# -------------------------------
explainer = shap.TreeExplainer(model)

# Handle binary classification (only class 1 SHAP values)
shap_values_old = explainer.shap_values(X_old)[1]
shap_values_new = explainer.shap_values(data_new.drop('target', axis=1))[1]

# -------------------------------
# Mean SHAP values
# -------------------------------
mean_shap_old = np.mean(np.abs(shap_values_old), axis=0)
mean_shap_new = np.mean(np.abs(shap_values_new), axis=0)

# Confirm all arrays have same length
print("✅ SHAP old:", len(mean_shap_old), "| SHAP new:", len(mean_shap_new), "| Features:", X_old.shape[1])

# -------------------------------
# Create summary DataFrame
# -------------------------------
features = X_old.columns.tolist()

shap_df = pd.DataFrame({
    'Feature': features,
    'SHAP Mean (Old)': mean_shap_old,
    'SHAP Mean (New)': mean_shap_new,
    'Change (%)': ((mean_shap_new - mean_shap_old) / (mean_shap_old + 1e-8) * 100).round(2)
})

# -------------------------------
# Output
# -------------------------------
print("\n📊 Feature Drift Analysis using SHAP:\n")
print(shap_df.sort_values(by='Change (%)', ascending=False))


  from .autonotebook import tqdm as notebook_tqdm


✅ SHAP old: 2 | SHAP new: 2 | Features: 3


ValueError: All arrays must be of the same length