# Feature selection with impurity and permutation importance on the Ames Housing dataset.
In this notebook, we compare unbiased feature importance (UFI), mean decrease in impurity (MDI) and permutation importance on their ability to perform feature selection on the high-dimensional Ames Housing dataset, as well as their associated computational cost.

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import warnings

from sklearn.compose import make_column_transformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Data fetching

In [2]:
# Download the Ames Housing dataset from OpenML (data_id=42165) as pandas objects.
# The fetched Bunch gets its own name so that `ames_housing` only ever refers to
# the feature DataFrame instead of being rebound from a Bunch to a DataFrame.
ames_housing_bunch = fetch_openml(data_id=42165, as_frame=True, return_X_y=False)
# Target used by the regressors below, cast to float.
y = ames_housing_bunch["target"].astype(float)
# Feature frame without the "Id" column, so that column is never used as a feature.
ames_housing = ames_housing_bunch["data"].drop(columns="Id")
ames_housing.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


# Data preprocessing

In [3]:
# Partition the columns by dtype: numeric columns versus everything else.
numerical_features = ames_housing.select_dtypes("number").columns
categorical_features = ames_housing.columns.difference(numerical_features)

# Non-numeric columns: fill missing entries with the most frequent value, then
# map categories to ordinal codes (categories unseen at fit time encode as -1).
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

# Numeric columns: fill missing entries with the column mean.
preprocessor = make_column_transformer(
    (categorical_pipeline, categorical_features),
    (SimpleImputer(strategy="mean"), numerical_features),
    remainder="passthrough",
)

# Column labels follow the order in which the transformers are declared above:
# the categorical block first, then the numerical block.
feature_names = [*categorical_features, *numerical_features]

ames_housing_transformed = preprocessor.fit_transform(ames_housing)

ames_housing_preprocessed = pd.DataFrame(
    data=ames_housing_transformed,
    index=ames_housing.index,
    columns=feature_names,
)

# Augment the dataset with random features of varying cardinality

In [4]:
random_cat_sizes = [2, 5, 10, 20, 50, 100, 200]
n_random_num = 10
n_sample = len(y)
rng = np.random.RandomState(42)

# Draw the noise columns in a fixed order (integer-coded categorical columns
# first, then standard-normal columns) so the seeded generator is consumed in
# the same sequence on every run.
random_columns = {
    f"random_cat_{cat_size}": rng.randint(0, cat_size, size=n_sample)
    for cat_size in random_cat_sizes
}
random_columns.update(
    (f"random_num_{i}", rng.normal(size=n_sample)) for i in range(n_random_num)
)

# Names of the noise columns, in insertion order.
random_features = list(random_columns)

# `assign` returns a new frame, leaving `ames_housing_preprocessed` untouched.
X = ames_housing_preprocessed.assign(**random_columns)

X.head()

Unnamed: 0,Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,...,random_num_0,random_num_1,random_num_2,random_num_3,random_num_4,random_num_5,random_num_6,random_num_7,random_num_8,random_num_9
0,0.0,0.0,3.0,3.0,2.0,5.0,2.0,1.0,2.0,2.0,...,0.725073,-1.223779,-0.240672,-1.333454,0.861844,-0.267616,-1.212832,-1.11856,0.020618,-0.285014
1,0.0,0.0,3.0,1.0,0.0,5.0,2.0,1.0,1.0,2.0,...,1.443513,-0.390764,-0.848696,-1.763622,0.73704,-1.273313,0.857255,-0.912569,0.132836,-1.998611
2,0.0,0.0,3.0,2.0,2.0,5.0,2.0,1.0,2.0,2.0,...,-0.6081,-1.174722,0.551793,1.091203,0.301727,0.384974,1.645823,-0.337125,1.258517,-0.868401
3,0.0,0.0,1.0,3.0,0.0,5.0,3.0,1.0,2.0,2.0,...,0.780452,-0.377384,-0.435901,0.66294,1.316805,-0.133807,-1.632656,0.118621,-1.470126,1.174689
4,0.0,0.0,3.0,0.0,2.0,5.0,2.0,1.0,2.0,2.0,...,-0.712072,-0.465242,0.644522,0.698413,0.197395,-0.178086,-0.020094,0.499037,-0.493514,-0.599126


# Benchmark an RF on the full dataset

In [5]:
# Five random splits holding out 20% of the rows each, with a fixed seed.
cv = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)

# Reference run: a random forest trained on every column, scored with R^2.
baseline_cv_results = cross_validate(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=0),
    X=X,
    y=y,
    cv=cv,
    scoring="r2",
)

# Cross-validate the performance of an RFECV that uses MDI for feature selection

In [6]:
# Lower bound on the number of features RFECV may keep: 25% of all columns,
# truncated to an integer.
min_features_to_select = int(0.25 * X.shape[1])

# Recursive feature elimination ranked by the forest's `feature_importances_`,
# evaluated with the same splitter for the outer and the inner loop.
mdi_cv_results = cross_validate(
    estimator=RFECV(
        estimator=RandomForestRegressor(n_jobs=-1, random_state=0),
        min_features_to_select=min_features_to_select,
        cv=cv,
        importance_getter="feature_importances_",
    ),
    X=X,
    y=y,
    cv=cv,
    scoring="r2",
    return_estimator=True,
)

# Cross-validate the performance of an RFECV that uses UFI for feature selection

In [7]:
# Ignore the "number of unique classes" warning from here on.
# NOTE(review): presumably raised while fitting on the continuous target - confirm.
warnings.filterwarnings(
    action="ignore",
    message=r"The number of unique classes is greater than 50% .*",
)

# Recursive feature elimination ranked by `unbiased_feature_importances_`;
# the forest is built with `oob_score=True` for this run.
ufi_cv_results = cross_validate(
    estimator=RFECV(
        estimator=RandomForestRegressor(n_jobs=-1, random_state=0, oob_score=True),
        min_features_to_select=min_features_to_select,
        cv=cv,
        importance_getter="unbiased_feature_importances_",
    ),
    X=X,
    y=y,
    cv=cv,
    scoring="r2",
    return_estimator=True,
)

# Summarize the results

In [12]:
def summarize_cv_results(cv_results, method_name):
    """Print a short report for a dictionary returned by ``cross_validate``.

    Parameters
    ----------
    cv_results : dict
        Mapping with at least the ``"test_score"`` and ``"fit_time"`` arrays.
        When it also holds a non-empty ``"estimator"`` sequence of fitted
        ``RFECV`` objects, feature-selection statistics are printed as well.
    method_name : str
        Label inserted in the printed report.

    Notes
    -----
    Reads the notebook-level ``X`` (column names and feature count) and
    ``random_features`` (names of the random columns added to ``X``).
    """
    test_scores = cv_results["test_score"]
    # `fit_time` holds one duration per CV split, so its mean is the average
    # cost of a single fit, not a total; the label states that explicitly.
    print(
        f"Average cross val score for {method_name}: {test_scores.mean():.3f} +/- {test_scores.std():.3f} std.\n"
        f"Mean fit time per CV split: {cv_results['fit_time'].mean():.1f} seconds.\n"
    )

    estimators = cv_results.get("estimator")
    # Nothing more to report without fitted selectors; a missing or empty
    # "estimator" entry must not raise.
    if estimators is None or len(estimators) == 0:
        return
    if not isinstance(estimators[0], RFECV):
        return

    # Set membership avoids rescanning the list for every selected column.
    random_feature_names = set(random_features)

    # Column names kept by each split's selector.
    selected_per_split = [
        X.columns[feature_selector.support_].to_list()
        for feature_selector in estimators
    ]
    mean_selected = np.mean([len(selected) for selected in selected_per_split])
    mean_random_features = np.mean(
        [
            sum(column in random_feature_names for column in selected)
            for selected in selected_per_split
        ]
    )

    print(
        f"Selected {mean_selected:.1f} features on average (total of {X.shape[1]}).\n"
        f"Average number of random features kept by the procedure: {mean_random_features:.1f} ({len(random_features)} in total).\n"
    )
    for selected in selected_per_split:
        print(selected)

In [13]:
# Report for the forest trained on every column.
summarize_cv_results(
    cv_results=baseline_cv_results,
    method_name="RF with all features",
)

Average cross val score for RF with all features: 0.834 +/- 0.018 std.
Total run time: 0.4 seconds.



In [14]:
# Report for the selector ranked by `feature_importances_`.
summarize_cv_results(
    cv_results=mdi_cv_results,
    method_name="RFECV with MDI",
)

Average cross val score for RFECV with MDI: 0.833 +/- 0.019 std.
Total run time: 147.6 seconds.

Selected 62.6 features on average (total of 96).
Average number of random features kept by the procedure: 14.6 (17 in total).

['BsmtFinType1', 'BsmtQual', 'CentralAir', 'ExterQual', 'GarageFinish', 'GarageType', 'KitchenQual', 'LotShape', 'MSZoning', 'Neighborhood', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'random_cat_10', 'random_cat_20', 'random_cat_50', 'random_cat_100', 'random_cat_200', 'random_num_0', 'random_num_1', 'random_num_2', 'random_num_3', 'random_num_4', 'random_num_5', 'random_num_6', 'random_num_7', 'random_num_8', 'random_num_9']
['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtQ

In [15]:
# Report for the selector ranked by `unbiased_feature_importances_`.
summarize_cv_results(
    cv_results=ufi_cv_results,
    method_name="RFECV with UFI",
)

Average cross val score for RFECV with UFI: 0.852 +/- 0.018 std.
Total run time: 197.6 seconds.

Selected 29.4 features on average (total of 96).
Average number of random features kept by the procedure: 0.6 (17 in total).

['BsmtFinType1', 'BsmtQual', 'CentralAir', 'ExterQual', 'GarageFinish', 'GarageType', 'KitchenQual', 'LotShape', 'MSZoning', 'Neighborhood', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'random_num_4']
['BsmtExposure', 'BsmtQual', 'CentralAir', 'ExterQual', 'Exterior2nd', 'GarageFinish', 'GarageType', 'KitchenQual', 'MSZoning', 'Neighborhood', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', '