# SHAP Explainer (sLDA Features)

SHAP values and feature importance interpretation for best model trained on sLDA features.

In [1]:
import joblib
import pandas as pd
import shap

# 1) Load model
best_model = joblib.load("../models/best_slda_model.joblib") # Changed to sLDA model


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Summarize the best model
# 1.1) Summarize best model type and parameters
print("🔍 Best Model Summary")
print("-" * 30)
print(f"Model class: {type(best_model).__name__}")
print("\nParameters:")
for name, value in best_model.get_params().items():
    print(f"  • {name}: {value}")

# If available, show feature importances
if hasattr(best_model, "feature_importances_"):
    print("\nFeature importances (first 10):")
    import numpy as np
    fi = best_model.feature_importances_
    # Get feature names from X_test, which will be defined in the next cell
    # This assumes X_test will be created before this cell is fully interpreted in a typical notebook execution flow
    # For a direct script run, X_test would need to be defined earlier or passed here.
    try:
        feature_names = X_test.columns # Placeholder, X_test defined in next cell
        sorted_indices = np.argsort(fi)[::-1]
        for i in sorted_indices[:10]:
            print(f"  • {feature_names[i]}: {fi[i]:.4f}")
    except NameError: # Fallback if X_test is not yet defined (e.g. running cell by cell)
        print("  (Feature names will be available after X_test is defined in the next cell)")
        for idx, imp in enumerate(fi[:10]):
            print(f"  {idx:2d}: {imp:.4f}")

🔍 Best Model Summary
------------------------------
Model class: BayesianRidge

Parameters:
  • alpha_1: 1e-06
  • alpha_2: 1e-06
  • alpha_init: None
  • compute_score: False
  • copy_X: True
  • fit_intercept: True
  • lambda_1: 1e-06
  • lambda_2: 1e-06
  • lambda_init: None
  • max_iter: 300
  • tol: 0.001
  • verbose: False


In [3]:

# 2) Load features and split into train/test (or load pre-split X_test)
df = pd.read_csv("../data/processed/features_slda_monthly.csv", parse_dates=["month"]) # Changed to sLDA features
# … re-create X_test as in your training script …
features = [c for c in df.columns if c.startswith("topic_") or c.startswith("publication_")] + ["sentiment"]
split_idx = int(len(df) * 0.8)
X_test = df[features].iloc[split_idx:]

# 3) Compute SHAP values
explainer = shap.Explainer(best_model)
shap_values = explainer(X_test)

# 4) Visualize
shap.plots.beeswarm(shap_values)

TypeError: The passed model is not callable and cannot be analyzed directly with the given masker! Model: BayesianRidge()

In [None]:
shap.plots.beeswarm(shap_values, max_display=len(X_test.columns))
