In [1]:
import numpy as np
from sklearn.base import clone
from sklearn.dummy import DummyRegressor
from joblib import Parallel, delayed

class LOCI:
    """Leave-One-Covariate-In (LOCI) feature importance.

    For each feature, a fresh clone of ``estimator`` is fitted on that single
    training column and evaluated on the matching test column. The reported
    importance is the loss of a constant mean-prediction baseline minus the
    loss of the single-feature model, so a positive value means the feature
    on its own achieves a lower loss than the baseline.

    Parameters
    ----------
    estimator : object
        Estimator passed to ``sklearn.base.clone``; each clone is used through
        ``fit`` and ``predict``.
    random_state : object, default=None
        Stored on the instance. No method of this class reads it, so seed the
        estimator itself when reproducibility matters.
    loss : callable or None, default=None
        Called as ``loss(y_true, y_pred)``. When ``None``, mean squared error
        is used.
    n_jobs : int, default=1
        Forwarded to ``joblib.Parallel`` when scoring the features.

    Attributes
    ----------
    feature_names_ : column index or None
        Column labels of the training data; ``None`` until ``fit`` is called.
    X_train_, y_train_ : object or None
        Training data kept by ``fit`` so ``score`` can refit per feature.
    """

    def __init__(self, estimator, random_state=None, loss=None, n_jobs=1):
        self.estimator = estimator
        self.random_state = random_state
        self.loss = loss
        self.n_jobs = n_jobs
        self.feature_names_ = None
        self.X_train_ = None
        self.y_train_ = None

    @staticmethod
    def _mean_squared_error(y_true, y_pred):
        """Default loss: mean of the squared differences."""
        y_true = np.asarray(y_true, dtype=float)
        # Align shapes explicitly so (n, 1) vs (n,) cannot broadcast to (n, n).
        y_pred = np.asarray(y_pred, dtype=float).reshape(y_true.shape)
        return float(np.mean((y_true - y_pred) ** 2))

    def _resolve_loss(self):
        """Return the user-supplied loss, or the MSE fallback when it is None."""
        return self._mean_squared_error if self.loss is None else self.loss

    @staticmethod
    def _as_frame(X, columns=None):
        """Return ``X`` unchanged if it has ``columns``, else wrap it in a DataFrame.

        Array-likes are labelled with ``columns`` when given (positionally),
        otherwise with generated names ``x0, x1, ...``.
        """
        if hasattr(X, "columns"):
            return X

        # Imported lazily: only needed when the caller did not pass a frame.
        import pandas as pd

        values = np.asarray(X)
        if values.ndim != 2:
            raise ValueError(
                "Expected 2-dimensional feature data, got an array with "
                f"{values.ndim} dimension(s)."
            )
        if columns is None:
            columns = [f"x{i}" for i in range(values.shape[1])]
        elif values.shape[1] != len(columns):
            raise ValueError(
                f"X has {values.shape[1]} feature(s), but {len(columns)} "
                "were seen during fit."
            )
        return pd.DataFrame(values, columns=columns)

    def fit(self, X, y):
        """Store the training data and its column labels.

        Parameters
        ----------
        X : DataFrame-like or 2-D array-like
            Training features. Objects without a ``columns`` attribute are
            wrapped in a DataFrame with generated column names.
        y : array-like
            Training targets, stored as given.

        Returns
        -------
        LOCI
            The instance itself.
        """
        X = self._as_frame(X)
        self.X_train_ = X
        self.y_train_ = y
        self.feature_names_ = X.columns
        return self

    def _score_single_feature(self, j, X_test, y_test, v0):
        """Fit a clone on feature ``j`` alone and return ``(name, v0 - loss)``."""
        fname = self.feature_names_[j]
        model_j = clone(self.estimator)
        # Double brackets keep the single column two-dimensional for the estimator.
        model_j.fit(self.X_train_[[fname]], self.y_train_)
        preds_j = model_j.predict(X_test[[fname]])
        vj = self._resolve_loss()(y_test, preds_j)
        return fname, v0 - vj

    def score(self, X_test, y_test):
        """Compute the LOCI importance of every feature seen during ``fit``.

        Parameters
        ----------
        X_test : DataFrame-like or 2-D array-like
            Test features. Array-likes are labelled positionally with the
            training column names.
        y_test : array-like
            Test targets passed to the loss.

        Returns
        -------
        dict
            Maps each feature name to baseline loss minus single-feature loss.

        Raises
        ------
        ValueError
            If ``fit`` has not been called, or if ``X_test`` does not provide
            every feature seen during ``fit``.
        """
        if self.X_train_ is None:
            raise ValueError("You must call `fit` before `score`.")

        # Validate up front so a column mismatch fails here with a clear
        # message instead of inside a parallel worker.
        X_test = self._as_frame(X_test, columns=self.feature_names_)
        missing = [name for name in self.feature_names_ if name not in X_test.columns]
        if missing:
            raise ValueError(
                f"X_test is missing feature(s) seen during fit: {missing}"
            )

        loss = self._resolve_loss()

        # Baseline: constant (mean) model
        dummy = DummyRegressor(strategy="mean")
        dummy.fit(self.X_train_, self.y_train_)
        # Only the number of rows matters for the constant prediction.
        pred_dummy = dummy.predict(np.zeros((len(X_test), 1)))
        v0 = loss(y_test, pred_dummy)

        # LOCI: leave-one-covariate-in, iterating over the fitted features
        results = Parallel(n_jobs=self.n_jobs)(
            delayed(self._score_single_feature)(j, X_test, y_test, v0)
            for j in range(len(self.feature_names_))
        )

        return dict(results)


In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import pandas as pd

# Diabetes regression data: labelled feature frame plus target vector
data = load_diabetes()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# LOCI clones this random forest and refits it once per feature
loci = LOCI(
    estimator=RandomForestRegressor(random_state=42),
    random_state=42,
    loss=mean_squared_error,
    n_jobs=-1,
)

# fit() returns the LOCI instance, so scoring is chained directly onto it
loci_importance = loci.fit(X_train, y_train).score(X_test, y_test)

print("LOCI Importances:\n", loci_importance)


LOCI Importances:
 {'age': -270.82926799915003, 'sex': 27.756080827185542, 'bmi': 1230.2753625796322, 'bp': -23.115972397773476, 's1': -2365.93678873304, 's2': -2575.9084327976316, 's3': 205.40949575640116, 's4': 872.8721525349347, 's5': 1225.332414583554, 's6': 528.2932637845697}


In [7]:
# Importance values as a flat array, in the dict's insertion order
np.asarray([*loci_importance.values()])

array([ -270.829268  ,    27.75608083,  1230.27536258,   -23.1159724 ,
       -2365.93678873, -2575.9084328 ,   205.40949576,   872.87215253,
        1225.33241458,   528.29326378])