In [19]:
# SECOM: SHAP Averaging over 25 Trials (XGBoost + Random Forest)

import pandas as pd
import numpy as np
import shap
import joblib
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

# === 1. Load and preprocess SECOM data ===
df_data = pd.read_csv('C:/Users/ammar/SHAP_ML/datasets/secom.data', sep=' ', header=None)
df_labels = pd.read_csv('C:/Users/ammar/SHAP_ML/datasets/secom_labels.data', sep=' ', header=None)
df_data.columns = [f'feature_{i}' for i in range(df_data.shape[1])]
df_data['label'] = df_labels[0]

# Drop features with >50% missing
df_clean = df_data.loc[:, df_data.isna().mean() < 0.5].copy()
df_clean = df_clean.fillna(df_clean.mean())

X = df_clean.drop(columns=['label'])
y = df_clean['label'].replace(-1, 0)
n_features = X.shape[1]

# === 2. SHAP averaging config ===
N_TRIALS = 25
shap_rf_matrix = np.zeros((N_TRIALS, n_features))
shap_xgb_matrix = np.zeros((N_TRIALS, n_features))

# === 3. Run trials ===
print("Running 25 trials for SECOM...\n")
for i in range(N_TRIALS):
    print(f"Trial {i+1}/{N_TRIALS}")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42 + i
    )

    # === XGBoost ===
    xgb_model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
    xgb_model.fit(X_train, y_train)
    xgb_explainer = shap.Explainer(xgb_model, X_train)
    shap_vals_xgb = xgb_explainer(X_test)
    shap_xgb_matrix[i, :] = np.abs(shap_vals_xgb.values).mean(axis=0)

    # === Random Forest SHAP: FINAL FIX ===
rf_model = RandomForestClassifier(n_estimators=100, random_state=100 + i)
rf_model.fit(X_train, y_train)

# Use SHAP's TreeExplainer
explainer_rf = shap.TreeExplainer(rf_model)
shap_vals_rf_all = explainer_rf.shap_values(X_test)

# Extract class 1 SHAP values if multiclass
if isinstance(shap_vals_rf_all, list) and len(shap_vals_rf_all) == 2:
    shap_vals_rf = shap_vals_rf_all[1]  # class 1
elif isinstance(shap_vals_rf_all, np.ndarray) and shap_vals_rf_all.ndim == 3:
    shap_vals_rf = shap_vals_rf_all[:, :, 1]  # shape: (samples, features, classes)
else:
    shap_vals_rf = shap_vals_rf_all  # already (samples, features)

# Final shape check
if shap_vals_rf.shape[1] != n_features:
    raise ValueError(f"Expected SHAP values with shape (*, {n_features}), but got {shap_vals_rf.shape}")

# Average across samples (axis=0 → feature-level mean)
shap_rf_matrix[i, :] = np.abs(shap_vals_rf).mean(axis=0)

# === 4. Average and save ===
xgb_mean = shap_xgb_matrix.mean(axis=0)
rf_mean = shap_rf_matrix.mean(axis=0)

shap_xgb_df = pd.DataFrame({'feature': X.columns, 'mean_abs_shap': xgb_mean})
shap_rf_df = pd.DataFrame({'feature': X.columns, 'mean_abs_shap': rf_mean})

# Create output folder if missing
os.makedirs('C:/Users/ammar/SHAP_ML/trials_v/outputs', exist_ok=True)

shap_xgb_df.sort_values(by='mean_abs_shap', ascending=False).to_csv(
    'C:/Users/ammar/SHAP_ML/trials_v/outputs/shap_avg_xgb.csv', index=False)
shap_rf_df.sort_values(by='mean_abs_shap', ascending=False).to_csv(
    'C:/Users/ammar/SHAP_ML/trials_v/outputs/shap_avg_rf.csv', index=False)

# === 5. Plot Top 10 ===
def plot_top10(df, model_name):
    top10 = df.sort_values(by='mean_abs_shap', ascending=False).head(10)
    plt.figure(figsize=(10, 5))
    plt.barh(top10['feature'][::-1], top10['mean_abs_shap'][::-1])
    plt.xlabel('Mean |SHAP value|')
    plt.title(f'SECOM - Top 10 Features ({model_name})')
    plt.tight_layout()
    plt.savefig(f"C:/Users/ammar/SHAP_ML/trials_v/outputs/top10_{model_name.lower()}_bar.png", dpi=300)
    plt.close()

plot_top10(shap_xgb_df, "XGBoost")
plot_top10(shap_rf_df, "RandomForest")

print("\nSECOM SHAP averaging complete. CSV and PNG files saved.")

Running 25 trials for SECOM...

Trial 1/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 2/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 3/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 4/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 5/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 6/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 7/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 8/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 9/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 10/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 11/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 12/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 13/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 14/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 15/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 16/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 17/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 18/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 19/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 20/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 21/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 22/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 23/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 24/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Trial 25/25


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



SECOM SHAP averaging complete. CSV and PNG files saved.
