In this notebook, we will estimate the computational cost of all the methods presented in the report.
This code uses a Cython-optimised version of UFI, available in the following fork of scikit-learn: https://github.com/GaetandeCast/scikit-learn/tree/unbiased-feature-importance

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import time
from sage import MarginalImputer, PermutationEstimator

from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from data_generator import generate_gaussian_causal_network
from oob_methods import compute_oob_fis


In [2]:
# Causal graph used for every simulation (edges point downwards):
#           x_0
#            |
#       x_1 x_2
#    x_3  \  | \
#      \   \ |  \
#       x_4 x_5  x_6
#         \ /
#         x_7    x_8
# The target is x_5. Its Markov blanket is (x_1, x_2, x_4, x_7): its parents
# x_1 and x_2, its child x_7, and x_4, the other parent of x_7. These are the
# features we want to receive the highest importance.
# The weight, bias and variance values below are arbitrary.

# Bias term of each node, ordered x0 .. x8.
b = np.array([1, -1, 0, 2, 1, 2, -2, 3, 5])

# Every node except the target is a feature.
n_feature = b.size - 1
idx_target = 5
idx_features = list(range(n_feature + 1))
idx_features.remove(idx_target)

# Variance term of each node: the same value for all nine nodes.
v = np.full(b.shape, 0.5)

# Edge weights: w[i, j] is the weight of the edge x_j -> x_i in the DAG above.
# Only the seven edges of the graph are non-zero.
w = np.zeros((b.size, b.size))
w[2, 0] = 1.0  # x0 -> x2
w[4, 3] = 1.0  # x3 -> x4
w[5, 1] = 0.5  # x1 -> x5
w[5, 2] = 0.5  # x2 -> x5
w[6, 2] = 0.5  # x2 -> x6
w[7, 4] = 0.5  # x4 -> x7
w[7, 5] = 0.5  # x5 -> x7

# Single seeded random state shared by all the cells of the notebook.
rng = np.random.RandomState(42)

In [33]:
# Benchmark configuration: 50 repetitions, each on a freshly simulated dataset
# split into 500 training points and 500 test points.
n_simulations = 50
train_size, test_size = (500, 500)
n_estimators = 50

# One duration (in seconds) per simulation, for each method.
mdi_res = list()
ufi_high_level_res = list()
ufi_low_level_res = list()
sage_res = list()
permut_res = list()

for sim in range(n_simulations):
    # Simulate a new dataset from the causal network; mu and Sigma are
    # returned by the generator but are not used in this benchmark.
    Xy, mu, Sigma = generate_gaussian_causal_network(
        n_sample=train_size + test_size,
        b=b,
        v=v,
        w=w,
        random_state=rng,
    )
    X, y = Xy[:, idx_features], Xy[:, idx_target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=rng
    )

    # Fitting the forest is deliberately left outside of every timed section.
    regressor = RandomForestRegressor(
        n_estimators=n_estimators,
        oob_score=True,
        n_jobs=-1,
        random_state=rng,
    ).fit(X_train, y_train)

    # All durations are measured with time.perf_counter(): unlike time.time(),
    # it is monotonic and uses the highest-resolution clock available, which
    # matters for the shortest measurements taken here.

    # MDI: read the forest's unnormalized feature importances.
    start_time = time.perf_counter()
    _ = regressor._unnormalized_feature_importances
    mdi_res.append(time.perf_counter() - start_time)

    # UFI, high level: through the compute_oob_fis helper from oob_methods.
    start_time = time.perf_counter()
    _ = compute_oob_fis(
        regressor,
        X_train,
        y_train,
        mean_squared_error,
        ["UFI"],
    )
    ufi_high_level_res.append(time.perf_counter() - start_time)

    # UFI, low level: through the forest's own method (presumably the
    # Cython-optimised implementation from the scikit-learn fork -- confirm).
    start_time = time.perf_counter()
    _ = regressor._compute_ufi_and_oob_pred(X_train, y_train, sample_weight=None)
    ufi_low_level_res.append(time.perf_counter() - start_time)

    # SAGE on the test set; building the imputer and the estimator is part of
    # the measured time.
    start_time = time.perf_counter()
    imputer = MarginalImputer(regressor, X_test)
    estimator = PermutationEstimator(imputer, "mse", random_state=rng)
    _ = estimator(X_test, y_test, thresh=0.1, bar=False).values
    sage_res.append(time.perf_counter() - start_time)

    # Permutation importance on the test set, with 5 repeats.
    start_time = time.perf_counter()
    _ = permutation_importance(
        regressor, X_test, y_test, n_repeats=5, random_state=rng
    ).importances_mean
    permut_res.append(time.perf_counter() - start_time)


In [34]:
# Report, for each method, the mean and standard deviation of the recorded
# durations, converted from seconds to milliseconds.
methods = {
    "MDI": mdi_res,
    "UFI high level": ufi_high_level_res,
    "UFI low level": ufi_low_level_res,
    "Permutation": permut_res,
    "SAGE": sage_res,
}
print("Feature importance computation times")
print("=" * 50)
print(f"Number of train/test points: {(train_size, test_size)}")

for label, durations in methods.items():
    mean_ms = np.mean(durations) * 1000
    std_ms = np.std(durations) * 1000
    print(f"{label:15} |  {mean_ms:8.1f} ms (+/- {std_ms:8.1f})")


Feature importance computation times
Number of train/test points: (500, 500)
MDI             |      16.8 ms (+/-      1.3)
UFI high level  |    5506.7 ms (+/-    329.9)
UFI low level   |     192.6 ms (+/-     26.5)
Permutation     |     872.8 ms (+/-    121.4)
SAGE            |    2835.7 ms (+/-    966.2)


In [7]:
# Benchmark configuration: 50 repetitions, each on a freshly simulated dataset
# split into 1000 training points and 1000 test points.
n_simulations = 50
train_size, test_size = (1000, 1000)
n_estimators = 50

# One duration (in seconds) per simulation, for each method.
mdi_res = list()
ufi_high_level_res = list()
ufi_low_level_res = list()
sage_res = list()
permut_res = list()

for sim in range(n_simulations):
    # Simulate a new dataset from the causal network; mu and Sigma are
    # returned by the generator but are not used in this benchmark.
    Xy, mu, Sigma = generate_gaussian_causal_network(
        n_sample=train_size + test_size,
        b=b,
        v=v,
        w=w,
        random_state=rng,
    )
    X, y = Xy[:, idx_features], Xy[:, idx_target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=rng
    )

    # Fitting the forest is deliberately left outside of every timed section.
    regressor = RandomForestRegressor(
        n_estimators=n_estimators,
        oob_score=True,
        n_jobs=-1,
        random_state=rng,
    ).fit(X_train, y_train)

    # All durations are measured with time.perf_counter(): unlike time.time(),
    # it is monotonic and uses the highest-resolution clock available, which
    # matters for the shortest measurements taken here.

    # MDI: read the forest's unnormalized feature importances.
    start_time = time.perf_counter()
    _ = regressor._unnormalized_feature_importances
    mdi_res.append(time.perf_counter() - start_time)

    # UFI, high level: through the compute_oob_fis helper from oob_methods.
    start_time = time.perf_counter()
    _ = compute_oob_fis(
        regressor,
        X_train,
        y_train,
        mean_squared_error,
        ["UFI"],
    )
    ufi_high_level_res.append(time.perf_counter() - start_time)

    # UFI, low level: through the forest's own method (presumably the
    # Cython-optimised implementation from the scikit-learn fork -- confirm).
    start_time = time.perf_counter()
    _ = regressor._compute_ufi_and_oob_pred(X_train, y_train, sample_weight=None)
    ufi_low_level_res.append(time.perf_counter() - start_time)

    # SAGE on the test set; building the imputer and the estimator is part of
    # the measured time.
    start_time = time.perf_counter()
    imputer = MarginalImputer(regressor, X_test)
    estimator = PermutationEstimator(imputer, "mse", random_state=rng)
    _ = estimator(X_test, y_test, thresh=0.1, bar=False).values
    sage_res.append(time.perf_counter() - start_time)

    # Permutation importance on the test set, with 5 repeats.
    start_time = time.perf_counter()
    _ = permutation_importance(
        regressor, X_test, y_test, n_repeats=5, random_state=rng
    ).importances_mean
    permut_res.append(time.perf_counter() - start_time)


In [32]:
# Report, for each method, the mean and standard deviation of the recorded
# durations, converted from seconds to milliseconds.
methods = {
    "MDI": mdi_res,
    "UFI high level": ufi_high_level_res,
    "UFI low level": ufi_low_level_res,
    "Permutation": permut_res,
    "SAGE": sage_res,
}
print("Feature importance computation times")
print("=" * 50)
print(f"Number of train/test points: {(train_size, test_size)}")

for label, durations in methods.items():
    mean_ms = np.mean(durations) * 1000
    std_ms = np.std(durations) * 1000
    print(f"{label:15} |  {mean_ms:8.1f} ms (+/- {std_ms:8.1f})")


Feature importance computation times
Number of train/test points: (1000, 1000)
MDI             |      17.4 ms (+/-      1.4)
UFI high level  |   13606.6 ms (+/-   1149.4)
UFI low level   |     355.1 ms (+/-     62.7)
Permutation     |    1294.8 ms (+/-    170.2)
SAGE            |    7028.1 ms (+/-   1657.0)
