In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Load the downloaded dataset (ensure the file path is correct).
# The file is semicolon-separated, so the delimiter must be given explicitly.
data = pd.read_csv("winequality-red.csv", delimiter=';')

# Show the column names exactly as read, before any clean-up, so stray
# whitespace in the header is visible.
print("Columns in the dataset:", data.columns)

# Clean column names (remove leading/trailing spaces if any)
data.columns = data.columns.str.strip()

# Separate features and target; raises KeyError if 'quality' is missing
# after the clean-up above.
X = data.drop('quality', axis="columns")
y = data['quality']

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier.
# Fit on a plain NumPy array rather than the DataFrame: every later
# predict_proba call in this script passes NumPy arrays, and a model fitted
# with feature names emits a "X does not have valid feature names"
# UserWarning on each such call. The training data is unchanged.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train.to_numpy(), y_train)

# Step 4: Choose an instance to explain
# Take the first test row as a 1-D NumPy array so it matches the array
# inputs used for perturbation and prediction below.
instance_to_explain = X_test.iloc[0].to_numpy()
print("Instance to explain:", instance_to_explain)

# Step 5: Generate perturbed samples
def generate_perturbations(instance, num_samples=100, scale=0.5):
    """Return noisy copies of *instance*, one per row.

    Each copy is ``instance`` plus independent Gaussian noise with mean 0
    and standard deviation ``scale``, drawn from NumPy's global random
    state (so ``np.random.seed`` controls reproducibility).

    Args:
        instance: Array-like of numeric feature values to perturb.
        num_samples: Number of perturbed copies to generate; must be >= 0.
        scale: Standard deviation of the noise. A scalar applies the same
            spread to every feature; an array broadcastable to
            ``instance`` gives each feature its own spread.

    Returns:
        A float array of shape ``(num_samples, *instance.shape)``. With
        ``num_samples == 0`` the result is still 2-D (zero rows) so that
        row-wise operations downstream keep working.

    Raises:
        ValueError: If ``num_samples`` is negative.
    """
    if num_samples < 0:
        raise ValueError("num_samples must be non-negative")

    instance = np.asarray(instance, dtype=float)
    # One vectorized draw replaces the per-sample loop; the leading axis
    # indexes the samples and the remaining axes mirror the instance.
    noise = np.random.normal(0.0, scale, size=(num_samples, *instance.shape))
    return instance + noise

# Seed NumPy's global random state before sampling: generate_perturbations
# draws from np.random, so without a seed the perturbed samples (and every
# number derived from them) change on each run, even though the data split
# and the forest use a fixed random_state of 42.
np.random.seed(42)
perturbed_data = generate_perturbations(instance_to_explain)

# Step 6: Get predictions for the perturbed data
# One row of class probabilities per perturbed sample.
perturbed_predictions = rf.predict_proba(perturbed_data)

# Step 7: Calculate weights based on distance to original instance (Euclidean distance)
def calculate_weights(original_instance, perturbed_data):
    """Return a proximity weight for each row of *perturbed_data*.

    The weight of a row is ``exp(-d)``, where ``d`` is the Euclidean
    distance between that row and ``original_instance``. A row identical
    to the original gets weight 1, and the weight decays towards 0 as the
    row moves away.
    """
    offsets = perturbed_data - original_instance
    row_distances = np.linalg.norm(offsets, axis=1)
    return np.exp(np.negative(row_distances))

# Proximity weight of every perturbed sample relative to the original instance
weights = calculate_weights(instance_to_explain, perturbed_data)

# Step 8: Fit a simple linear regression model
# The regression target is the forest's predicted probability, on each
# perturbed sample, for the class it rates most likely at the original
# instance.
original_probabilities = rf.predict_proba([instance_to_explain])[0]
explained_class_index = np.argmax(original_probabilities)
y_perturbed = perturbed_predictions[:, explained_class_index]
lr = LinearRegression().fit(perturbed_data, y_perturbed, sample_weight=weights)

# Step 9: Print the explanation (Feature importance)
# Each coefficient is the fitted local slope for the matching feature column.
print("\nFeature importance (coefficients):")
for feature_name, coefficient in zip(X.columns, lr.coef_):
    print(f"{feature_name}: {coefficient:.4f}")


Columns in the dataset: Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')
Instance to explain: [ 7.7     0.56    0.08    2.5     0.114  14.     46.      0.9971  3.24
  0.66    9.6   ]





Feature importance (coefficients):
fixed acidity: 0.0104
volatile acidity: 0.0488
citric acid: -0.0208
residual sugar: -0.0067
chlorides: 0.0188
free sulfur dioxide: 0.0031
total sulfur dioxide: -0.0331
density: 0.0430
pH: 0.0023
sulphates: -0.0299
alcohol: -0.0211
