In [26]:
import pandas as pd
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# FOR REGRESSION

In [74]:
data = pd.read_csv('data/housing.csv')
data.dropna(inplace=True)
data.head()

Unnamed: 0,per_capita_crime_rate,proportion_of_residential_land_over_25000_sq.ft.,proportion_of_non-retail_business_acres_per_town,Charles_River_dummy_variable_(1_if_tract_bounds_river;_0_otherwise),nitric_oxides_concentration_(parts_per_10_million),average_number_of_rooms_per_dwelling,proportion_of_owner-occupied_units_built_prior_to_1940,weighted_distances_to_five_Boston_employment_centres,index_of_accessibility_to_radial_highways,full-value_property-tax_rate_per_$10000,pupil-teacher_ratio_by_town,1000(Bk _0.63)^2_where_Bk_is_the_proportion_of_blacks_by_town,%_lower_status_population,Median_House_Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9


In [75]:
target = 'Median_House_Price'

In [76]:
forest = RandomForestRegressor(
   n_jobs = -1, 
   max_depth = 5
)

boruta = BorutaPy(
   estimator = forest, 
   n_estimators = 'auto',
   max_iter = 100 # number of trials to perform
)

# fit boruta

In [85]:
%%time
model = boruta.fit(data[[column for column in data.columns if column!=target]].values, data[[target]].values.ravel())

Wall time: 27.3 s


# results

In [86]:
model.ranking_, data.columns.values.tolist()

(array([1, 5, 3, 5, 1, 1, 1, 1, 4, 1, 1, 2, 1]),
 ['per_capita_crime_rate',
  'proportion_of_residential_land_over_25000_sq.ft.',
  'proportion_of_non-retail_business_acres_per_town',
  'Charles_River_dummy_variable_(1_if_tract_bounds_river;_0_otherwise)',
  'nitric_oxides_concentration_(parts_per_10_million)',
  'average_number_of_rooms_per_dwelling',
  'proportion_of_owner-occupied_units_built_prior_to_1940',
  'weighted_distances_to_five_Boston_employment_centres',
  'index_of_accessibility_to_radial_highways',
  'full-value_property-tax_rate_per_$10000',
  'pupil-teacher_ratio_by_town',
  '1000(Bk _0.63)^2_where_Bk_is_the_proportion_of_blacks_by_town',
  '%_lower_status_population',
  'Median_House_Price'])

In [87]:
green_area = data[[column for column in data.columns if column!=target]].columns[boruta.support_].to_list()
blue_area = data[[column for column in data.columns if column!=target]].columns[boruta.support_weak_].to_list()
print('features in the green area:', green_area)
print('\nfeatures in the blue area:', blue_area)

features in the green area: ['per_capita_crime_rate', 'nitric_oxides_concentration_(parts_per_10_million)', 'average_number_of_rooms_per_dwelling', 'proportion_of_owner-occupied_units_built_prior_to_1940', 'weighted_distances_to_five_Boston_employment_centres', 'full-value_property-tax_rate_per_$10000', 'pupil-teacher_ratio_by_town', '%_lower_status_population']

features in the blue area: ['1000(Bk _0.63)^2_where_Bk_is_the_proportion_of_blacks_by_town']


# FOR CLASSIFICATION

In [88]:
data = pd.read_csv('data/iris_data.csv')
data.dropna(inplace=True)
data.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [89]:
target = 'class'

In [90]:
forest = RandomForestClassifier(
   n_jobs = -1, 
   max_depth = 5
)

boruta = BorutaPy(
   estimator = forest, 
   n_estimators = 'auto',
   max_iter = 100 # number of trials to perform
)

# fit boruta

In [91]:
%%time
model = boruta.fit(data[[column for column in data.columns if column!=target]].values, data[[target]].values.ravel())

Wall time: 1.97 s


# results

In [92]:
model.ranking_, data.columns.values.tolist()

(array([1, 1, 1, 1]),
 ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'])

In [93]:
green_area = data[[column for column in data.columns if column!=target]].columns[boruta.support_].to_list()
blue_area = data[[column for column in data.columns if column!=target]].columns[boruta.support_weak_].to_list()
print('features in the green area:', green_area)
print('\nfeatures in the blue area:', blue_area)

features in the green area: ['sepal-length', 'sepal-width', 'petal-length', 'petal-width']

features in the blue area: []
