In [58]:
# Execute the shared figure/config utilities script inside this kernel's interactive namespace.
# NOTE(review): later cells use x_test_centered, mlr_loaded and xgbrf_loaded without defining
# them; presumably this script provides them — confirm.
# NOTE(review): this cell's execution count (58) is higher than those of the cells below it
# (18-26), so it was last run after them; re-check with Restart Kernel -> Run All.
%run '/home/christianl/Zhang-Lab/Zhang Lab Code/Boilerplate_datahandling/Remote boilerplate/Fig_config_utilities.py'


<class 'numpy.ndarray'> (3187, 16101)
<class 'numpy.ndarray'> (3187, 16101)


In [59]:
# Execute the SHAP storage script inside this kernel's interactive namespace.
# NOTE(review): presumably this defines SHAPValueStorage, which a later cell instantiates — confirm.
%run '/home/christianl/Zhang-Lab/Zhang Lab Code/Boilerplate_datahandling/Remote boilerplate/SHAP_storage.py'

In [18]:
from shap_model_comparison import SHAPModelComparator
import os
import json
import pandas as pd
import numpy as np 

# Make sure the output directory for saved SHAP values exists.
# exist_ok=True turns an already-present directory into a no-op instead of an error.
os.makedirs(
    name='/home/christianl/Zhang-Lab/Zhang Lab Data/Saved SHAP values',
    exist_ok=True,
)

In [19]:
# loading testing set
# NOTE(review): x_test_centered is not defined in this notebook; presumably it comes from the
# Fig_config_utilities.py %run cell — confirm that cell has been executed in this kernel.
x_test_centered_df = pd.DataFrame(x_test_centered)

# Fixed-seed subsample (random_state=42) so the same rows are drawn on every run.
# The sample size is capped at the number of available rows so the cell does not raise
# when the test set holds fewer than 1000 rows.
subsetted_x_test_centered = x_test_centered_df.sample(
    n=min(1000, len(x_test_centered_df)),
    random_state=42,
)

# loading feature names (the column labels of the sampled frame), assigned once
feature_names = subsetted_x_test_centered.columns.tolist()


In [None]:
# fixing compatibility issue between version 2.0+ XGBoost class and SHAP package
# likely non impactful given I'm using KernelExplainer on select values instead of using TreeExplainer on XGBRF

def fix_xgboost_for_shap(model):
    """Rewrite a bracketed ``base_score`` in an XGBoost config as a plain float string.

    The booster is obtained from ``model.get_booster()`` when that method exists;
    otherwise ``model`` itself is used as the booster. When the stored
    ``base_score`` string is wrapped in square brackets (for example ``'[5E-1]'``),
    it is replaced by ``str(float(...))`` of the inner value and the modified
    config is loaded back into the booster.

    Any exception raised along the way is caught and reported with ``print``;
    nothing is re-raised, so the call is best-effort.

    Args:
        model: an object exposing ``get_booster()``, or a booster-like object
            exposing ``save_config()`` / ``load_config()``.

    Returns:
        The same ``model`` object that was passed in.
    """
    try:
        if hasattr(model, 'get_booster'):
            booster = model.get_booster()
        else:
            booster = model

        cfg = json.loads(booster.save_config())
        learner_params = cfg['learner']['learner_model_param']
        raw_score = learner_params['base_score']

        is_bracketed = raw_score.startswith('[') and raw_score.endswith(']')
        if is_bracketed:
            # Parse first: if float() fails, the config is left untouched.
            plain_score = float(raw_score.strip('[]'))
            learner_params['base_score'] = str(plain_score)
            booster.load_config(json.dumps(cfg))
            print("✓ Fixed XGBoost model for SHAP compatibility")
    except Exception as e:
        print(f"Note: XGBoost fix not needed or failed: {e}")
    return model

# Apply the base_score patch to the loaded XGBRF model. The helper returns the object it was
# given, so re-running this cell rebinds the name to the same model.
xgbrf_loaded = fix_xgboost_for_shap(model=xgbrf_loaded)

In [20]:
# Registry of trained models, keyed by the label used when requesting SHAP values.
# TODO: add 'LEMBAS-RNN': rnn once the RNN has been retrained (note dated 06/01/26).
models = {}
models['MLR'] = mlr_loaded
models['XGBRF'] = xgbrf_loaded

In [None]:
# for when RNN is retrained and needs to be included 

import torch

class PyTorchRNNWrapper:
    """Adapter that gives a PyTorch model a NumPy-in / NumPy-out ``predict`` method.

    On construction the wrapped model is moved to ``device`` and switched to eval
    mode, so it sits on the same device as the input tensors built in ``predict``.
    """

    def __init__(self, model, device='cpu'):
        """Store the model and the device used for inference.

        Args:
            model: object that exposes ``eval()`` and is callable on a tensor.
            device: device (string or ``torch.device``) used for the model and its inputs.
        """
        self.device = device
        # Inputs are created on `device` in predict(); the model must be there too,
        # otherwise the forward pass fails with a device mismatch.
        self.model = model.to(device) if hasattr(model, 'to') else model
        self.model.eval()

    def predict(self, X):
        """Run the model on ``X`` and return a flat NumPy array of outputs.

        Args:
            X: a pandas DataFrame or an array-like of numeric values. A 2-D input
                gets a new axis inserted at position 1 before the forward pass.

        Returns:
            The model output moved to CPU, converted to NumPy and flattened to 1-D.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values

        # as_tensor builds a float32 tensor directly on the target device.
        X_tensor = torch.as_tensor(X, dtype=torch.float32, device=self.device)

        if X_tensor.ndim == 2:
            X_tensor = X_tensor.unsqueeze(1)  # Add sequence dimension

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            output = self.model(X_tensor)

        return output.cpu().numpy().flatten()

# Load and wrap
# The checkpoint is used as a whole model object (it is handed straight to the wrapper, which
# calls .eval() on it), so it must be unpickled with weights_only=False; recent PyTorch releases
# default torch.load to weights_only=True, which rejects such files.
# SECURITY: weights_only=False executes pickle code — only load checkpoints from a trusted source.
# map_location='cpu' matches the wrapper's default device, so a checkpoint saved from a GPU
# also loads on a CPU-only machine.
# NOTE(review): this is a relative path, unlike the absolute paths used elsewhere in the notebook.
rnn_base_model = torch.load(
    'models/lembas_rnn.pth',
    map_location='cpu',
    weights_only=False,
)
rnn_model = PyTorchRNNWrapper(rnn_base_model)

# Test: smoke-check the wrapper on the first five sampled rows
test_pred = rnn_model.predict(subsetted_x_test_centered.iloc[:5])
print(f"✓ RNN loaded and wrapped. Test predictions: {test_pred[:3]}")

In [21]:
# Section banner (blank line, rule, title, rule) so this step is easy to find in the output.
print("\n" + "=" * 80, "Initializing SHAP Comparator", "=" * 80, sep="\n")

# Build the comparator over the sampled test rows and the registered models.
comparator = SHAPModelComparator(
    models_dict=models,
    X_data=subsetted_x_test_centered,
    feature_names=feature_names,
    background_samples=100,  # make lower if SHAP is slow
)

# Report how many models and how much data the comparator was given.
print(f"✓ Comparator initialized with {len(models)} models")
print(f"✓ Data: {subsetted_x_test_centered.shape[0]} samples, {subsetted_x_test_centered.shape[1]} features")


Initializing SHAP Comparator
✓ Comparator initialized with 2 models
✓ Data: 1000 samples, 1198 features


In [22]:
print("\n" + "="*80)
print("Computing MLR SHAP Values (this may take a few minutes...)")
print("="*80 + "\n")

# computing for MLR
# Run the computation, then read the result from the comparator's per-model store.
# The call's return value is deliberately not bound, so mlr_shap_values is assigned exactly once.
comparator.compute_shap_values('MLR', 'linear')
mlr_shap_values = comparator.shap_values['MLR']

print("\n✓ SHAP computation complete!")


Computing MLR SHAP Values (this may take a few minutes...)

Computing SHAP values for MLR...
✓ SHAP values computed for MLR

✓ SHAP computation complete!


In [26]:
# Display the MLR SHAP array as this cell's output (bare last expression).
# NOTE(review): the saved output is a full array repr; showing mlr_shap_values.shape
# would be more compact.
mlr_shap_values

array([[[ 6.87678878e-02,  1.16853361e-02, -1.73930026e-04, ...,
          5.04794743e-04, -2.36792920e-04,  2.96545570e-05],
        [ 3.51957870e-16, -5.45876105e-02, -7.87519517e-03, ...,
         -2.20250815e-04, -1.08223485e-02,  2.51890571e-02],
        [-2.29723285e-16,  4.72547504e-03, -9.06017948e-03, ...,
         -1.63843300e-02,  1.18156938e-02,  1.72146683e-03],
        ...,
        [ 1.50122343e-16,  1.32639846e-02, -8.22179118e-03, ...,
         -9.60091120e-03, -9.32970297e-03, -4.50851713e-03],
        [-9.00103362e-18,  2.44495014e-03, -2.26547228e-02, ...,
         -2.23530065e-02,  9.91671970e-03, -2.05070646e-03],
        [ 7.93876158e-17, -2.65408994e-02,  2.35601143e-02, ...,
         -7.12959403e-04,  1.93468187e-02, -5.17547521e-03]],

       [[ 5.70471356e-01,  9.69369536e-02, -1.44285510e-03, ...,
          4.18757869e-03, -1.96434095e-03,  2.46002544e-04],
        [-7.23997047e-18,  1.12289771e-03,  1.61997174e-04, ...,
          4.53068260e-06,  2.22621768e

In [60]:
# Storage helper pointed at the directory used for saved SHAP values.
# NOTE(review): SHAPValueStorage is not defined in this notebook; presumably it comes from
# the SHAP_storage.py %run cell — confirm.
storage = SHAPValueStorage(
    base_path='/home/christianl/Zhang-Lab/Zhang Lab Data/Saved SHAP values',
)

In [None]:
# Provenance recorded alongside the saved MLR SHAP array.
# NOTE(review): date_created and test_samples are hard-coded; verify they still match the
# run that produced mlr_shap_values.
mlr_metadata = {
    'date_created': '2026-01-13',
    'test_samples': 1000,
    'model_version': 'v2',
    'description': 'SHAP values for Multiple Linear Regression model',
}

# Persist the array through the storage helper under the 'MLR' model name.
storage.save_as_memmap(
    shap_values=mlr_shap_values,
    model_name='MLR',
    metadata=mlr_metadata,
)

In [None]:
# Section banner: blank line, rule, title, rule, blank line.
print(
    "\n" + "=" * 80,
    "Computing XGBRF SHAP Values (this may take a few minutes...)",
    "=" * 80 + "\n",
    sep="\n",
)

# Compute SHAP values for the XGBRF model, requesting the 'kernel' method, and keep
# whatever the call returns.
# NOTE(review): the MLR cell reads comparator.shap_values['MLR'] instead of the return
# value — confirm both routes give the same array.
xgbrf_shap_values = comparator.compute_shap_values('XGBRF', 'kernel')

print("\n✓ SHAP computation complete!")