# Feature selection with impurity and permutation importance on the Ames Housing dataset.
In this notebook, we compare UFI (unbiased feature importance), MDI (mean decrease in impurity) and permutation importance in terms of their ability to perform feature selection on the high-dimensional Ames Housing dataset, as well as their computational cost.

# Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import warnings

from sklearn.compose import make_column_transformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Data fetching

In [2]:
# Fetch the dataset (OpenML data_id=42165) as a Bunch of pandas objects.
# The Bunch gets its own name so that `ames_housing` only ever refers to the
# feature DataFrame.
ames_bunch = fetch_openml(data_id=42165, as_frame=True, return_X_y=False)

# Regression target, cast to float.
y = ames_bunch["target"].astype(float)

# Feature table without the "Id" column.
ames_housing = ames_bunch["data"].drop(columns="Id")
ames_housing.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


# Data preprocessing

In [3]:
# Split the columns by dtype: numeric columns on one side, every other column
# is handled as categorical.
numerical_features = ames_housing.select_dtypes("number").columns
categorical_features = ames_housing.columns.difference(numerical_features)

# Categorical columns: fill missing values with the most frequent category,
# then map each category to an ordinal code (-1 for categories unseen at fit time).
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

# Numerical columns: fill missing values with the column mean.
preprocessor = make_column_transformer(
    (categorical_pipeline, categorical_features),
    (SimpleImputer(strategy="mean"), numerical_features),
    remainder="passthrough",
)

ames_housing_transformed = preprocessor.fit_transform(ames_housing)

# Column labels for the transformed array, listed in the order the transformers
# are declared above: categorical block first, numerical block second.
feature_names = [*categorical_features, *numerical_features]

ames_housing_preprocessed = pd.DataFrame(
    data=ames_housing_transformed,
    index=ames_housing.index,
    columns=feature_names,
)

# Add random features of varying cardinality

In [4]:
# Cardinalities of the purely random integer-coded columns appended to the data.
random_cat_sizes = [2, 5, 10, 20, 50, 100]
n_sample = len(y)
rng = np.random.RandomState(42)

X = ames_housing_preprocessed.copy()

# Names of every injected noise column, collected for later bookkeeping.
random_features = []
for cat_size in random_cat_sizes:
    column_name = f"random_cat_{cat_size}"
    # Uniform integer codes in [0, cat_size), independent of the target.
    X[column_name] = rng.randint(0, cat_size, size=n_sample)
    random_features.append(column_name)

# One continuous noise column drawn from a standard normal distribution.
random_features.append("random_num")
X["random_num"] = rng.normal(size=n_sample)

X.head()

Unnamed: 0,Alley,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtQual,CentralAir,Condition1,Condition2,...,MiscVal,MoSold,YrSold,random_cat_2,random_cat_5,random_cat_10,random_cat_20,random_cat_50,random_cat_100,random_num
0,0.0,0.0,3.0,3.0,2.0,5.0,2.0,1.0,2.0,2.0,...,0.0,2.0,2008.0,0,4,7,19,36,68,-1.583187
1,0.0,0.0,3.0,1.0,0.0,5.0,2.0,1.0,1.0,2.0,...,0.0,5.0,2007.0,1,4,0,17,45,53,0.114476
2,0.0,0.0,3.0,2.0,2.0,5.0,2.0,1.0,2.0,2.0,...,0.0,9.0,2008.0,0,4,4,9,23,53,1.573412
3,0.0,0.0,1.0,3.0,0.0,5.0,3.0,1.0,2.0,2.0,...,0.0,2.0,2006.0,0,2,2,8,15,27,0.558137
4,0.0,0.0,3.0,0.0,2.0,5.0,2.0,1.0,2.0,2.0,...,0.0,12.0,2008.0,0,2,0,7,12,55,-0.10579


# Train and optimize a `RandomForestRegressor`.

In [5]:
# Hyperparameter grid: minimum leaf size, number of features considered per
# split, and number of trees in the forest.
param_grid = {
    "min_samples_leaf": [1, 5, 20, 100],
    "max_features": ["log2", "sqrt", 1.0],
    "n_estimators": [10, 50, 100, 200],
}

# Initialize Random Forest
rf = RandomForestRegressor(random_state=rng, n_jobs=-1)
# NOTE(review): GridSearchCV fits clones of `rf`, so this fit is not used by the
# search below. It is kept so that `rf` stays a fitted estimator for any cell
# that may rely on it; drop it if nothing does.
rf.fit(X, y)

# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring="r2",
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

# Fit the grid search
grid_search.fit(X, y)

# Get the best parameters and the corresponding mean cross-validated score
best_params = grid_search.best_params_
best_score_gs = grid_search.best_score_

print(f"Best params: {best_params}")
print(f"Best score: {best_score_gs:.3f}")

# Build the shared forest parameters as a NEW dict: assigning `best_params`
# directly would alias `grid_search.best_params_`, and adding "n_jobs" would
# then silently modify the search results.
common_params = {**best_params, "n_jobs": -1}



Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best params: {'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 200}
Best score: 0.857


# Feature selection procedure

In [6]:
def feature_selection_RFECV(method, common_params, X, y, n_fold, random_state):
    """Benchmark RFECV feature selection driven by one forest importance measure.

    For each of ``n_fold`` random train/test splits (half of the samples held
    out), a random forest is first fitted on all features and scored on the
    held-out half. ``RFECV`` then eliminates features recursively, ranking them
    with the importance attribute selected by ``method``, and the model
    retained by ``RFECV`` is scored on the same held-out half.

    Parameters
    ----------
    method : {"UFI", "MDI"}
        Importance used for ranking. "UFI" reads
        ``unbiased_feature_importances_`` from a forest built with
        ``oob_score=True``; "MDI" reads ``feature_importances_`` from a forest
        built with ``oob_score=False``.
    common_params : dict
        Keyword arguments forwarded to ``RandomForestRegressor``. The dict is
        copied; the caller's object is left untouched.
    X : pandas.DataFrame
        Feature matrix (``X.columns`` and ``DataFrame.to_numpy`` are used).
    y : array-like
        Regression target aligned with ``X``.
    n_fold : int
        Number of random train/test splits to evaluate.
    random_state : int
        Seed passed to ``np.random.RandomState``; the resulting generator is
        shared by the splits and the forest.

    Returns
    -------
    dict
        Mean and standard deviation across splits of the held-out and
        cross-validated R2 scores for the full and the selected model, the
        mean number of random features kept, the mean number of features
        removed, and the total wall-clock ``run_time`` in seconds.

    Raises
    ------
    ValueError
        If ``method`` is neither "UFI" nor "MDI".

    Notes
    -----
    Reads the notebook-level list ``random_features`` to count how many noise
    columns survive selection, and adds an "ignore" entry to the global
    warnings filters.
    """
    # method -> (importance attribute read by RFECV, value of oob_score)
    importance_config = {
        # With oob_score=True, rf will compute UFI during fit
        "UFI": ("unbiased_feature_importances_", True),
        "MDI": ("feature_importances_", False),
    }
    if method not in importance_config:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {sorted(importance_config)}."
        )
    getter, use_oob_score = importance_config[method]

    warnings.filterwarnings(
        "ignore", message=r"The number of unique classes is greater than 50% .*"
    )

    rng = np.random.RandomState(random_state)
    # Work on a copy so that repeated calls with the same dict do not leak
    # "oob_score" / "random_state" entries back to the caller.
    rf_params = {**common_params, "oob_score": use_oob_score, "random_state": rng}
    # Define a rf regressor
    rf = RandomForestRegressor(**rf_params)

    fold_records = []
    start_time = time.perf_counter()
    for _ in range(n_fold):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=int(0.5 * X.shape[0]), random_state=rng
        )
        # Baseline: forest trained on every feature, scored on the held-out half.
        init_score = r2_score(y_test, rf.fit(X_train, y_train).predict(X_test))

        feature_selector = RFECV(
            estimator=rf,
            scoring="r2",
            n_jobs=-1,
            importance_getter=getter,
        )
        feature_selector.fit(X_train, y_train)
        cv_results = feature_selector.cv_results_

        # Score the forest retained by RFECV on the selected columns only.
        best_score = r2_score(
            y_test,
            feature_selector.estimator_.predict(
                X_test.to_numpy()[:, feature_selector.support_]
            ),
        )
        fold_records.append(
            dict(
                init_test_score=init_score,
                best_test_score=best_score,
                # Last entry of the CV curve is taken as the full-feature score.
                init_cv_score=cv_results["mean_test_score"][-1],
                best_cv_score=np.max(cv_results["mean_test_score"]),
                n_random_features=sum(
                    column in random_features
                    for column in X.columns[feature_selector.support_]
                ),
                n_removed_features=len(X.columns) - feature_selector.n_features_,
            )
        )
    run_time = time.perf_counter() - start_time

    # Aggregate across splits; key names and order match the reporting cells.
    summary = {}
    for metric in (
        "best_test_score",
        "init_test_score",
        "init_cv_score",
        "best_cv_score",
    ):
        values = [record[metric] for record in fold_records]
        summary[f"{metric}_mean"] = np.mean(values)
        summary[f"{metric}_std"] = np.std(values)
    for metric in ("n_random_features", "n_removed_features"):
        summary[f"{metric}_mean"] = np.mean(
            [record[metric] for record in fold_records]
        )
    summary["run_time"] = run_time
    return summary


In [7]:
# Number of random train/test splits evaluated by feature_selection_RFECV.
n_fold = 5
# Seed handed to feature_selection_RFECV, which builds its RandomState from it.
random_state = 42

# MDI for feature selection

In [8]:
# Run the RFECV benchmark with MDI (`feature_importances_`) as the ranking criterion.
res_MDI = feature_selection_RFECV("MDI", common_params, X, y, n_fold, random_state)

# Dictionary keys use single quotes: reusing the enclosing double quote inside
# an f-string replacement field is a SyntaxError before Python 3.12.
print(
    f"MDI results across {n_fold} folds:\n"
    f"Total run time: {res_MDI['run_time']:.1f}.\n"
    f"Removed {res_MDI['n_removed_features_mean']} features on average. ({X.shape[1]} in total)\n"
    f"Average cross val score of the full model: {res_MDI['init_cv_score_mean']:.3f} +/- {res_MDI['init_cv_score_std']:.3f} std.\n"
    f"Average test score of the full model: {res_MDI['init_test_score_mean']:.3f} +/- {res_MDI['init_test_score_std']:.3f} std.\n"
    f"Average cross val score of the best model: {res_MDI['best_cv_score_mean']:.3f} +/- {res_MDI['best_cv_score_std']:.3f} std.\n"
    f"Average test score of the best model: {res_MDI['best_test_score_mean']:.3f} +/- {res_MDI['best_test_score_std']:.3f} std.\n"
    f"Average number of random features kept by the procedure {res_MDI['n_random_features_mean']:.3f} ({len(random_features)} in total)."
)

MDI results across 5 folds:
Total run time: 395.4.
Removed 57.2 features on average. (86 in total)
Average cross val score of the full model: 0.848 +/- 0.019 std.
Average test score of the full model: 0.826 +/- 0.017 std.
Average cross val score of the best model: 0.860 +/- 0.021 std.
Average test score of the best model: 0.829 +/- 0.021 std.
Average number of random features kept by the procedure 2.200 (7 in total).


# UFI for feature selection

In [9]:
# Run the RFECV benchmark with UFI (`unbiased_feature_importances_`) as the ranking criterion.
res_UFI = feature_selection_RFECV("UFI", common_params, X, y, n_fold, random_state)

# Dictionary keys use single quotes: reusing the enclosing double quote inside
# an f-string replacement field is a SyntaxError before Python 3.12.
print(
    f"UFI results across {n_fold} folds:\n"
    f"Total run time: {res_UFI['run_time']:.1f}.\n"
    f"Removed {res_UFI['n_removed_features_mean']} features on average. ({X.shape[1]} in total)\n"
    f"Average cross val score of the full model: {res_UFI['init_cv_score_mean']:.3f} +/- {res_UFI['init_cv_score_std']:.3f} std.\n"
    f"Average test score of the full model: {res_UFI['init_test_score_mean']:.3f} +/- {res_UFI['init_test_score_std']:.3f} std.\n"
    f"Average cross val score of the best model: {res_UFI['best_cv_score_mean']:.3f} +/- {res_UFI['best_cv_score_std']:.3f} std.\n"
    f"Average test score of the best model: {res_UFI['best_test_score_mean']:.3f} +/- {res_UFI['best_test_score_std']:.3f} std.\n"
    f"Average number of random features kept by the procedure {res_UFI['n_random_features_mean']:.3f} ({len(random_features)} in total)."
)

UFI results across 5 folds:
Total run time: 653.1.
Removed 57.8 features on average. (86 in total)
Average cross val score of the full model: 0.848 +/- 0.019 std.
Average test score of the full model: 0.826 +/- 0.017 std.
Average cross val score of the best model: 0.861 +/- 0.019 std.
Average test score of the best model: 0.833 +/- 0.023 std.
Average number of random features kept by the procedure 0.000 (7 in total).
