In [1]:
# 1. Imports
import pandas as pd
import numpy as np
import shap
import joblib
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

# 2. Load Dataset
# Every column except the 'RUL' target and the 'filename' column is a model input.
df = pd.read_csv('C:/Users/ammar/SHAP_ML/outputs/bearing_rul.csv')
y = df['RUL']
X = df.drop(columns=['RUL', 'filename'])

# 3. Setup
# One row per trial, one column per feature; the trial loop fills these in.
N_TRIALS = 25
n_features = len(X.columns)
importance_shape = (N_TRIALS, n_features)
shap_xgb_matrix = np.zeros(importance_shape)
shap_rf_matrix = np.zeros(importance_shape)

# 4. Run Trials
def _mean_abs_shap(shap_values, expected_features):
    """Return the per-feature mean of |SHAP| over all explained samples.

    Args:
        shap_values: Object exposing a ``values`` array, as produced by
            calling a ``shap.Explainer`` on a set of samples.
        expected_features: Number of feature columns ``values`` must have.

    Returns:
        1-D array of length ``expected_features``.

    Raises:
        ValueError: If ``values`` is not 2-D with ``expected_features`` columns.
    """
    values = shap_values.values
    # Check the rank first so a non-2-D result reports a clear error instead
    # of an IndexError from shape[1].
    if values.ndim != 2 or values.shape[1] != expected_features:
        raise ValueError(f"Unexpected SHAP shape: {values.shape}")
    return np.abs(values).mean(axis=0)


# Derive the banner from N_TRIALS so it cannot drift from the actual loop count.
print(f"Running {N_TRIALS} trials for Bearings RUL...\n")
for i in range(N_TRIALS):
    print(f"Trial {i+1}/{N_TRIALS}")
    # A different seed per trial gives a different 80/20 split each time.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42 + i
    )

    # XGBoost
    # Seeded per trial like the Random Forest below, so the trial stays
    # reproducible if stochastic options (e.g. subsampling) are enabled.
    xgb_model = xgb.XGBRegressor(random_state=42 + i)
    xgb_model.fit(X_train, y_train)
    explainer_xgb = shap.Explainer(xgb_model, X_train)
    shap_vals_xgb = explainer_xgb(X_test)
    shap_xgb_matrix[i, :] = _mean_abs_shap(shap_vals_xgb, n_features)

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42 + i)
    rf_model.fit(X_train, y_train)
    explainer_rf = shap.Explainer(rf_model, X_train)
    shap_vals_rf = explainer_rf(X_test)
    # Same shape validation as XGBoost; both models share one code path.
    shap_rf_matrix[i, :] = _mean_abs_shap(shap_vals_rf, n_features)

# 5. Average and Save
# Collapse the trial axis so each feature gets one importance score per model.
xgb_mean = np.mean(shap_xgb_matrix, axis=0)
rf_mean = np.mean(shap_rf_matrix, axis=0)

feature_names = X.columns
shap_xgb_df = pd.DataFrame({'feature': feature_names, 'mean_abs_shap': xgb_mean})
shap_rf_df = pd.DataFrame({'feature': feature_names, 'mean_abs_shap': rf_mean})

output_dir = 'C:/Users/ammar/SHAP_ML/trials_v/outputs/Bearings_RUL'
os.makedirs(output_dir, exist_ok=True)

# Write each table ranked from most to least important feature.
for csv_name, importance_df in (
    ('shap_avg_xgb.csv', shap_xgb_df),
    ('shap_avg_rf.csv', shap_rf_df),
):
    ranked = importance_df.sort_values(by='mean_abs_shap', ascending=False)
    ranked.to_csv(os.path.join(output_dir, csv_name), index=False)

# 6. Plot Top 10
def plot_top10(df, model_name):
    """Save a horizontal bar chart of the ten largest mean |SHAP| features.

    Args:
        df: DataFrame with 'feature' and 'mean_abs_shap' columns.
        model_name: Label shown in the chart title and, lower-cased, used in
            the output file name.

    The PNG is written into the module-level ``output_dir`` at 300 dpi.
    """
    ranked = df.sort_values(by='mean_abs_shap', ascending=False)
    top10 = ranked.head(10)
    # Reverse the order so the highest-scoring feature ends up as the top bar.
    labels = top10['feature'][::-1]
    scores = top10['mean_abs_shap'][::-1]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.barh(labels, scores)
    ax.set_xlabel('Mean |SHAP value|')
    ax.set_title(f'Bearings RUL - Top 10 Features ({model_name})')
    fig.tight_layout()

    target = os.path.join(output_dir, f'top10_{model_name.lower()}_bar.png')
    fig.savefig(target, dpi=300)
    plt.close(fig)

# Render one chart per model, XGBoost first, then Random Forest.
for importance_df, label in (
    (shap_xgb_df, "XGBoost"),
    (shap_rf_df, "RandomForest"),
):
    plot_top10(importance_df, label)

print("Bearings RUL SHAP averaging complete.")

Running 25 trials for Bearings RUL...

Trial 1/25
Trial 2/25
Trial 3/25
Trial 4/25
Trial 5/25
Trial 6/25
Trial 7/25
Trial 8/25
Trial 9/25
Trial 10/25
Trial 11/25
Trial 12/25
Trial 13/25
Trial 14/25
Trial 15/25
Trial 16/25
Trial 17/25
Trial 18/25
Trial 19/25
Trial 20/25
Trial 21/25
Trial 22/25
Trial 23/25
Trial 24/25
Trial 25/25
Bearings RUL SHAP averaging complete.
