In [1]:
import pandas as pd
### toy feature matrix: one row per observation, columns are age / height / weight
X = pd.DataFrame(
    data=[
        [25, 182, 75],
        [32, 176, 71],
        [47, 174, 78],
        [51, 168, 72],
        [62, 181, 86],
    ],
    columns=['age', 'height', 'weight'],
)
X

Unnamed: 0,age,height,weight
0,25,182,75
1,32,176,71
2,47,174,78
3,51,168,72
4,62,181,86


In [2]:
### target values, one per row of the feature matrix
y = pd.Series(data=[20, 32, 45, 55, 61], name='income')
y

0    20
1    32
2    45
3    55
4    61
Name: income, dtype: int64

In [3]:
import numpy as np
### fix the global numpy seed so the shuffles below are repeatable
np.random.seed(42)
### shadow features: every column of X shuffled independently,
### with a 'shadow_' prefix on each column name
X_shadow = X.apply(np.random.permutation).add_prefix('shadow_')
### X_boruta = real features followed by their shadow counterparts
X_boruta = pd.concat([X, X_shadow], axis=1)
X_boruta

Unnamed: 0,age,height,weight,shadow_age,shadow_height,shadow_weight
0,25,182,75,32,168,71
1,32,176,71,62,176,75
2,47,174,78,47,174,72
3,51,168,72,25,182,86
4,62,181,86,51,181,78


In [4]:
from sklearn.ensemble import RandomForestRegressor

### train a random forest on real + shadow features (suggested max_depth between 3 and 7)
forest = RandomForestRegressor(random_state=42, max_depth=5)
forest.fit(X_boruta, y)

### the first len(X.columns) importances belong to the real features,
### the remaining ones to the shadow features
n_real = len(X.columns)
importances = forest.feature_importances_
feat_imp_X, feat_imp_shadow = importances[:n_real], importances[n_real:]

### a real feature scores a hit when it beats the strongest shadow feature
best_shadow = feat_imp_shadow.max()
hits = feat_imp_X > best_shadow
hits

array([ True, False, False])

In [5]:
### show real-feature importances, then shadow-feature importances, one per line
print(feat_imp_X, feat_imp_shadow, sep='\n')

[0.28112381 0.15808892 0.0925464 ]
[0.10162149 0.21182001 0.15479937]


In [6]:
### initialize hits counter (one slot per real feature)
n_features = len(X.columns)
hits = np.zeros(n_features)
### repeat 20 times
for iter_ in range(20):
    ### make X_shadow by randomly permuting each column of X
    ### (a different seed per trial gives a different shuffle each time)
    np.random.seed(iter_)
    ### prefix the shadow columns so X_boruta never carries duplicate column
    ### labels -- without this both halves would be named age/height/weight
    X_shadow = X.apply(np.random.permutation).add_prefix('shadow_')
    X_boruta = pd.concat([X, X_shadow], axis = 1)
    ### fit a random forest (suggested max_depth between 3 and 7)
    forest = RandomForestRegressor(max_depth = 5, random_state = 42)
    forest.fit(X_boruta, y)
    ### store feature importance: real features first, shadow features after
    feat_imp_X = forest.feature_importances_[:n_features]
    feat_imp_shadow = forest.feature_importances_[n_features:]
    ### compute hits for this trial and add to counter
    ### (a hit = real feature more important than the best shadow feature)
    hits += (feat_imp_X > feat_imp_shadow.max())
print(hits)

[18.  5.  0.]


In [7]:
### import the stats submodule explicitly: a bare `import scipy` does not
### guarantee that `scipy.stats` is loaded on every SciPy version
import scipy.stats

### number of Boruta trials
trials = 20
### binomial pmf with success probability 0.5 for every hit count 0..trials
pmf = [scipy.stats.binom.pmf(x, trials, .5) for x in range(trials + 1)]
pmf

[9.5367431640625e-07,
 1.9073486328125e-05,
 0.0001811981201171875,
 0.0010871887207031235,
 0.004620552062988286,
 0.014785766601562505,
 0.03696441650390626,
 0.07392883300781253,
 0.12013435363769544,
 0.16017913818359386,
 0.1761970520019531,
 0.16017913818359392,
 0.12013435363769544,
 0.07392883300781251,
 0.03696441650390626,
 0.014785766601562505,
 0.004620552062988286,
 0.0010871887207031233,
 0.00018119812011718745,
 1.9073486328124997e-05,
 9.5367431640625e-07]