In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Fetch the bundled Iris data and expose it as a feature frame plus a target vector
data = load_iris()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = data["target"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Fit a logistic regression classifier on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Score the fitted model on the held-out test split
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))
print(f"Baseline Accuracy: {baseline_accuracy:.4f}")

Baseline Accuracy: 1.0000


In [4]:
def permutation_importance(model, X_test, y_test, metric=accuracy_score, n_repeats=10, random_state=None):
    """
    Compute permutation feature importance.

    Each feature column is shuffled in turn while all other columns are left
    intact, the model is re-scored on the shuffled data, and the difference
    (baseline score - shuffled score) is recorded. The differences are
    averaged over `n_repeats` independent shuffles of that column.

    Parameters:
    - model: Trained machine learning model exposing a `predict(X)` method.
    - X_test: Test feature dataset (pandas DataFrame). It is copied, never modified.
    - y_test: True labels for the test set.
    - metric: Function called as `metric(y_true, y_pred)` to evaluate model
      performance (default is accuracy_score). Because importance is computed
      as baseline minus shuffled score, a positive importance means the score
      dropped when the feature was shuffled.
    - n_repeats: Number of times to shuffle each feature and recompute the score.
      Must be at least 1.
    - random_state: Optional seed (or existing numpy Generator) for the shuffles.
      When None (the default) the global `np.random` state is used, so results
      are only repeatable if the caller seeded it with `np.random.seed`.

    Returns:
    - importance: Dictionary mapping feature names to importance scores
      (the mean score difference over the repetitions).

    Raises:
    - ValueError: If `n_repeats` is smaller than 1 (there would be nothing to average).
    """
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be at least 1, got {n_repeats}")

    # Keep the legacy global-state behaviour when no seed is given; otherwise
    # draw from a dedicated generator so the call is reproducible in isolation.
    if random_state is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation

    baseline_score = metric(y_test, model.predict(X_test))
    avg_importance = {}

    for col in X_test.columns:
        # One working copy per feature is enough: only `col` is overwritten,
        # and every repeat re-permutes the original (unshuffled) values.
        X_test_shuffled = X_test.copy()
        original_values = X_test[col].to_numpy()
        score_drops = []

        for _ in range(n_repeats):
            # permute() returns a shuffled copy, so the caller's data is untouched
            X_test_shuffled[col] = permute(original_values)

            # Recompute the score with the shuffled feature
            shuffled_score = metric(y_test, model.predict(X_test_shuffled))

            # Record the drop in performance relative to the baseline
            score_drops.append(baseline_score - shuffled_score)

        # Average the importance scores over the repetitions
        avg_importance[col] = np.mean(score_drops)

    return avg_importance

# Seed NumPy's global random state so the column shuffles (and therefore the
# importances printed below) come out the same on every Restart & Run All
np.random.seed(42)

# Compute permutation importance
importances = permutation_importance(model, X_test, y_test)

# Display feature importances
print("\nFeature Importances (Permutation):")
for feature, importance in importances.items():
    print(f"{feature}: {importance:.4f}")


Feature Importances (Permutation):
sepal length (cm): 0.0200
sepal width (cm): 0.0000
petal length (cm): 0.6200
petal width (cm): 0.2000
