In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Load the Ionosphere dataset
data = pd.read_csv("ionosphere.data.csv")

# Check the first few rows
data.head()

# Separate features and target
X = data.drop('Class', axis="columns")
y = data['Class']

# Encode the target variable (Class)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train a RandomForestRegressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 4: Choose an instance to explain (keep it as DataFrame to retain feature names)
instance_to_explain = X_test.iloc[0:1]  # Keep as DataFrame
print("Instance to explain:\n", instance_to_explain)


# Define a function to predict (for regression, we directly return the predicted value)
def model_predict(instance):
    return model.predict(instance)[0]

# Function to calculate SHAP values
def calculate_shapley_values(model, instance, X_train):
    num_features = instance.shape[1]
    shap_values = np.zeros(num_features)

    # Predict the baseline (average prediction over the training data)
    baseline_prediction = np.mean([model_predict(X_train.iloc[[i]]) for i in range(len(X_train))])  # Average prediction for all instances
    print("Baseline prediction:", baseline_prediction)

    for i in range(num_features):
        # Create a modified instance with the i-th feature removed (set to 0)
        modified_instance = instance.copy()
        modified_instance.iloc[0, i] = 0  # Set the i-th feature to 0
        
        # Calculate the contribution of the i-th feature
        with_feature = model_predict(instance)  # Prediction with the i-th feature
        without_feature = model_predict(modified_instance)  # Prediction with the i-th feature set to 0
        
        # SHAP value contribution (difference in prediction with and without the feature)
        shap_values[i] = with_feature - without_feature

    return shap_values


# Calculate SHAP values for the chosen instance
shap_values = calculate_shapley_values(model, instance_to_explain, X_train)

# Print SHAP values (Feature Importance)
print("\nSHAP Values (Feature Importance):")
for i, feature in enumerate(X.columns):
    print(f"{feature}: {shap_values[i]:.4f}")


Instance to explain:
      A  B       C        D        E        F        G        H        I  \
157  1  0  0.4709  0.22751  0.42328  0.33598  0.25661  0.47619  0.01852   

           J  ...       Y        Z       AA       AB       AC       AD  \
157  0.49471  ... -0.2328  0.00265  0.03574 -0.31739  0.15873 -0.21693   

          AE       AF      AG       AH  
157  0.24868 -0.24339  0.2672  0.04233  

[1 rows x 34 columns]
Baseline prediction: 0.6539285714285714

SHAP Values (Feature Importance):
A: 0.0100
B: 0.0000
C: 0.2700
D: 0.0000
E: 0.7800
F: 0.0000
G: 0.0300
H: 0.0100
I: 0.0000
J: 0.0000
K: 0.0000
L: 0.0000
M: 0.0000
N: -0.0200
O: 0.0100
P: 0.0000
Q: 0.0000
R: 0.0000
S: 0.0000
T: 0.0000
U: 0.0000
V: 0.0000
W: 0.0000
X: 0.0200
Y: -0.0300
Z: 0.0000
AA: 0.0000
AB: 0.0000
AC: 0.0000
AD: -0.0100
AE: 0.0000
AF: 0.0000
AG: 0.0000
AH: 0.0000
