In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Read the red-wine quality table; the file is semicolon-separated.
data = pd.read_csv("winequality-red.csv", sep=';')

# Preview the leading rows as a quick sanity check.
print(data.head())

# The 'quality' column is the regression target (left numeric, no encoding);
# every other column is used as a feature.
y = data['quality']
X = data.drop(columns='quality')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit the random-forest regressor (fit returns the estimator itself).
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Explain the first test row. Slicing (rather than scalar indexing) keeps a
# one-row DataFrame, so the feature names travel with the values.
instance_to_explain = X_test.iloc[:1]
print("Instance to explain:\n", instance_to_explain)

# Prediction helper: for regression the model output is used as-is.
def model_predict(instance):
    """Return the module-level ``model``'s prediction for the first row of ``instance``."""
    predictions = model.predict(instance)
    return predictions[0]

# Function to calculate per-feature contributions ("SHAP-style" values)
def calculate_shapley_values(model, instance, X_train):
    """Estimate per-feature contributions for one instance by zero-ablation.

    For each feature, the prediction for ``instance`` is compared with the
    prediction obtained after that single feature is overwritten with 0; the
    difference is reported as that feature's contribution.

    NOTE: this is a one-feature-at-a-time ablation, not an exact Shapley
    value (there is no averaging over feature coalitions), and 0 is not
    necessarily a realistic value for a feature. Treat the results as a rough
    sensitivity measure.

    Args:
        model: Fitted estimator exposing ``predict``; all predictions are made
            with this object rather than with any module-level model.
        instance: One-row DataFrame to explain. It is copied before each
            ablation and never modified.
        X_train: Training features; used only to compute and print the
            baseline (mean prediction over the training data).

    Returns:
        1-D NumPy array with one contribution per column of ``instance``.
    """
    num_features = instance.shape[1]
    shap_values = np.zeros(num_features)

    # Baseline = average prediction over the training data, computed with a
    # single batched predict call instead of one call per row.
    baseline_prediction = np.mean(model.predict(X_train))
    print("Baseline prediction:", baseline_prediction)

    # The unmodified prediction does not depend on the loop index, so it is
    # computed once up front.
    with_feature = model.predict(instance)[0]

    for i in range(num_features):
        # "Remove" the i-th feature by overwriting it with 0 on a copy.
        modified_instance = instance.copy()
        modified_instance.iloc[0, i] = 0

        without_feature = model.predict(modified_instance)[0]

        # Contribution = change in prediction caused by zeroing the feature.
        shap_values[i] = with_feature - without_feature

    return shap_values

# Compute the per-feature contributions for the selected test row.
shap_values = calculate_shapley_values(model, instance_to_explain, X_train)

# Report each feature name next to its contribution, to four decimals.
print("\nSHAP Values (Feature Importance):")
for feature, contribution in zip(X.columns, shap_values):
    print(f"{feature}: {contribution:.4f}")



   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 