In [1]:
import numpy as np
import pandas as pd

### Creating the complex model (f)

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Not spending any more time making a good model since the goal is to explain the model and that's most interesting with a bad model

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

# Black Box Model 'f' - using default parameters as per LIME's model-agnostic philosophy.
black_box_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
black_box_model.fit(X_train, y_train)

print(f"Model Accuracy: {black_box_model.score(X_test, y_test):.2f}")

Model Accuracy: 0.96


### Picking an instance of this data to be explained (x)

In [5]:
# This is 'x', the instance we want to explain
instance_to_explain = X_test[0]
true_label = y_test[0]
model_prediction = black_box_model.predict_proba(instance_to_explain.reshape(1, -1))[0]

print(f"Instance Features: {instance_to_explain}")
print(f"True Label: {true_label}")
print(f"Model Prediction (Probabilities): {model_prediction}")

Instance Features: [1.247e+01 1.860e+01 8.109e+01 4.819e+02 9.965e-02 1.058e-01 8.005e-02
 3.821e-02 1.925e-01 6.373e-02 3.961e-01 1.044e+00 2.497e+00 3.029e+01
 6.953e-03 1.911e-02 2.701e-02 1.037e-02 1.782e-02 3.586e-03 1.497e+01
 2.464e+01 9.605e+01 6.779e+02 1.426e-01 2.378e-01 2.671e-01 1.015e-01
 3.014e-01 8.750e-02]
True Label: 1
Model Prediction (Probabilities): [8.62720920e-04 9.99137279e-01]


### Creating an interpretable representation of the data (x')

In [6]:
from sklearn.preprocessing import KBinsDiscretizer

# Create a discretizer that will bin each feature into 4 categories
# We use 'uniform' strategy for simplicity
discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
discretizer.fit(X_train)

In [7]:
# The raw numbers
print("Original Raw Instance:", instance_to_explain)

# The new "interpretable" instance. The numbers now represent bins (0, 1, 2, 3)
interpretable_instance = discretizer.transform(instance_to_explain.reshape(1, -1))[0]
print("New Interpretable Instance (bins):", interpretable_instance)

Original Raw Instance: [1.247e+01 1.860e+01 8.109e+01 4.819e+02 9.965e-02 1.058e-01 8.005e-02
 3.821e-02 1.925e-01 6.373e-02 3.961e-01 1.044e+00 2.497e+00 3.029e+01
 6.953e-03 1.911e-02 2.701e-02 1.037e-02 1.782e-02 3.586e-03 1.497e+01
 2.464e+01 9.605e+01 6.779e+02 1.426e-01 2.378e-01 2.671e-01 1.015e-01
 3.014e-01 8.750e-02]
New Interpretable Instance (bins): [0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 1. 1.]


### Creating perturbations around this instance (z')

In [8]:
# Calculate the mean of each feature from the training data
feature_means = X_train.mean(axis=0)

def create_lime_perturbation(original_instance, num_samples):
    num_features = original_instance.shape[0]
    # Create random binary vectors (0s and 1s)
    # 1 = keep the original feature value, 0 = replace with mean
    perturbations = np.random.binomial(1, 0.5, size=(num_samples, num_features))
    
    # Create the dataset of fake samples in the original numerical space
    perturbed_data = []
    for p in perturbations:
        # Create a new sample by replacing "off" features with the mean
        new_sample = np.where(p == 1, original_instance, feature_means)
        perturbed_data.append(new_sample)
        
    return np.array(perturbed_data), perturbations

# Generate 5000 fake samples
perturbed_numerical_data, perturbed_interpretable_data = create_lime_perturbation(instance_to_explain, 5000)

# Look at one example
print("Original Instance:\n", instance_to_explain[:5])
print("\nAn interpretable perturbation (binary on/off):\n", perturbed_interpretable_data[0, :5])
print("\nThe same perturbation recovered in numerical space (note which values changed):\n", perturbed_numerical_data[0, :5])
print("\nFeature means for comparison:\n", feature_means[:5])

Original Instance:
 [1.247e+01 1.860e+01 8.109e+01 4.819e+02 9.965e-02]

An interpretable perturbation (binary on/off):
 [0 0 1 0 1]

The same perturbation recovered in numerical space (note which values changed):
 [1.41176352e+01 1.91850330e+01 8.10900000e+01 6.54377582e+02
 9.96500000e-02]

Feature means for comparison:
 [1.41176352e+01 1.91850330e+01 9.18822418e+01 6.54377582e+02
 9.57440220e-02]


### Generating predictions for the perturbations

In [9]:
perturbed_predictions= black_box_model.predict(perturbed_numerical_data)

perturbed_predictions_prob = black_box_model.predict_proba(perturbed_numerical_data)


print(perturbed_predictions_prob)

[[1.36320820e-02 9.86367918e-01]
 [9.09494776e-04 9.99090505e-01]
 [2.86934922e-03 9.97130651e-01]
 ...
 [6.09661322e-03 9.93903387e-01]
 [2.72205781e-03 9.97277942e-01]
 [7.64684088e-03 9.92353159e-01]]


### Calculating the proximity (πx(z)) using Euclidean distance (better for higher dimensions like for images)

In [10]:
from sklearn.metrics.pairwise import euclidean_distances

# Your original_instance needs to be reshaped to (1, n_features) to work with the function
# Your perturbed_numerical_data is already (5000, n_features)

# This calculates the distance from the one original point to all 5000 perturbed points
# The result will be an array of shape (1, 5000), so we take the first element [0]
distances = euclidean_distances(instance_to_explain.reshape(1, -1), perturbed_numerical_data)[0]

print("Shape of distances array:", distances.shape) # Should be (5000,)
print("Example distances:", distances[:5])

Shape of distances array: (5000,)
Example distances: [263.82953561   0.39528748 172.84493414 199.93808651 263.41935017]


In [11]:

# Define the kernel width
num_features = X_train.shape[1]
kernel_width = 0.75 * np.sqrt(num_features)

# Convert the distances to proximity scores (these are your sample weights)
# This is the πx(z) for every perturbation z
proximity_scores = np.exp(-(distances ** 2) / (kernel_width ** 2))

print("Shape of proximity_scores array:", proximity_scores.shape) # Should be (5000,)
print("Example proximity scores:", proximity_scores[:5])

Shape of proximity_scores array: (5000,)
Example proximity scores: [0.         0.99078335 0.         0.         0.        ]


### Creating a training set for the interpretable model (g)

In [23]:
# Select the probability of the class we are interested in (e.g., class 1: malignant)
# The shape will now be (5000,) which pandas can handle with one column name.
y_interpretable = perturbed_predictions_prob[:, 1] 

#Converting to dataframe
df_interpretable = pd.DataFrame(perturbed_interpretable_data, columns=[f"feature_{i}" for i in range(perturbed_interpretable_data.shape[1])])
df_probs = pd.DataFrame(y_interpretable, columns=["prob_malignant"]) # More descriptive name
df_weights = pd.DataFrame(proximity_scores, columns=["proximity"])

# Concatenate safely
training_data = pd.concat([df_interpretable, df_probs, df_weights], axis=1)

print(training_data.head())

   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0          0          0          1          0          1          1   
1          1          1          1          1          1          1   
2          0          0          1          0          0          1   
3          0          1          0          1          1          0   
4          0          0          1          0          1          1   

   feature_6  feature_7  feature_8  feature_9  ...  feature_22  feature_23  \
0          0          1          1          0  ...           0           0   
1          1          1          1          1  ...           1           1   
2          0          1          0          1  ...           0           1   
3          0          0          1          0  ...           0           0   
4          1          1          1          0  ...           1           0   

   feature_24  feature_25  feature_26  feature_27  feature_28  feature_29  \
0           0           0  

### Training the interpretable model (g)

In [13]:
from sklearn.linear_model import LinearRegression

sample_weights_1d = df_weights['proximity']


reg = LinearRegression().fit(df_interpretable, df_probs, sample_weight=sample_weights_1d)


### Interpreting the coefficients of g

In [21]:
# The coefficients are often in a nested list, e.g., [[c1, c2, ...]], so we select the first element.
coefficients = reg.coef_[0] 

df_interpretable = pd.DataFrame(perturbed_interpretable_data, columns=feature_names)

# Then use the same when creating instance_df
instance_df = pd.DataFrame(instance_to_explain.reshape(1, -1), columns=feature_names)


# 2. Pair the feature names with their learned local weights
feature_importance_pairs = list(zip(feature_names, coefficients))

# 3. Sort the pairs by the absolute value of the weight, in descending order
# We use a lambda function to tell sorted() to look at the second item (the weight) in each pair
sorted_features = sorted(feature_importance_pairs, key=lambda x: abs(x[1]), reverse=True)

local_model_prob = reg.predict(instance_df)[0][0] 
# 4. Present the top 5 most influential features for this prediction
print(f"Explanation for instance {0}:")
print(f"Black box model prediction: {perturbed_predictions[0]}")
print(f"Local interpretable model prediction: {local_model_prob:.4f}\n")
print("Top 5 features influencing this prediction:")

for feature, weight in sorted_features[:5]:
    if weight > 0:
        print(f"- '{feature}' ({weight:.4f}) pushed the prediction towards 'malignant'.")
    else:
        print(f"- '{feature}' ({weight:.4f}) pushed the prediction away from 'malignant' (towards 'benign').")

Explanation for instance 0:
Black box model prediction: 1
Local interpretable model prediction: -0.2201

Top 5 features influencing this prediction:
- 'worst perimeter' (-0.0021) pushed the prediction away from 'malignant' (towards 'benign').
- 'area error' (-0.0009) pushed the prediction away from 'malignant' (towards 'benign').
- 'mean concave points' (-0.0003) pushed the prediction away from 'malignant' (towards 'benign').
- 'worst smoothness' (0.0001) pushed the prediction towards 'malignant'.
- 'worst concave points' (-0.0001) pushed the prediction away from 'malignant' (towards 'benign').


In [15]:
print(feature_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
