# Feature selection and shap clustering

In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
# Directory (relative to the working directory set below) that is put on
# ``sys.path`` so the project packages can be imported.
PATH_YIELD_ENGINE = 'src'


def set_paths() -> None:
    """Set the working directory and make ``PATH_YIELD_ENGINE`` importable.

    On the first call (detected by the absence of a module-level ``cwd``
    variable) the working directory is moved three levels up from the current
    one and remembered in the global ``cwd``; later calls leave the working
    directory untouched, so re-running the cell does not keep climbing.

    ``PATH_YIELD_ENGINE`` is inserted at the front of ``sys.path`` unless it
    is already listed there. The printed message states which of the two
    cases applied.
    """
    import os
    import sys

    global cwd

    if 'cwd' not in globals():
        cwd = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
        os.chdir(cwd)
    print(f"working dir is '{os.getcwd()}'")

    if PATH_YIELD_ENGINE in sys.path:
        # Nothing was inserted, so do not claim that sys.path[0] was "added".
        print(f"`{PATH_YIELD_ENGINE}` already on python paths")
    else:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")


set_paths()

In [None]:
import pandas as pd
#from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

from gamma import Sample
from gamma.viz.dendrogram import DendrogramDrawer, DendrogramFeatMapStyle, DendrogramLineStyle, DendrogramReportStyle
from gamma.sklearndf.pipeline import PipelineDF, ModelPipelineDF
from gamma.sklearndf.transformation import SimpleImputerDF, MissingIndicatorDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.model.inspection import ModelInspector
from gamma.model.prediction import PredictorFitCV
from gamma.model.selection import ModelGrid, ModelRanker, summary_report
from gamma.model.validation import CircularCrossValidator
from gamma.yieldengine.simulation import UnivariateSimulator
from gamma.yieldengine.partition import ContinuousRangePartitioning

In [None]:
# Name of the column handed to Sample as the prediction target.
TARGET = "SalePrice"

# Read the training data; the path is relative to the current working directory.
df = pd.read_csv('data/ames-housing-dataset/train.csv')
# NOTE: this leaves `df` unchanged; a notebook only displays a cell's last expression.
df.head()
# Remove the `Id`, `YrSold` and `MoSold` columns.
df = df.drop(columns=['Id', 'YrSold', 'MoSold'])
# Keep the numeric columns only, then wrap the frame in a Sample.
num_features = df.select_dtypes('number').columns
df = df[num_features.tolist()]
sample = Sample(observations=df, target_name=TARGET)

## BorutaDF

In [None]:
# Selection pipeline with two steps: an imputer ('preprocess') followed by
# Boruta feature selection ('boruta') driven by a random forest regressor.
boruta_selector = PipelineDF(
    steps=[
        ('preprocess', SimpleImputerDF()),
        (
            'boruta',
            BorutaDF(
                estimator=RandomForestRegressor(),
                n_estimators=10,
                verbose=2,
                max_iter=10,
                random_state=0,
                perc=90,
            ),
        ),
    ]
)

In [None]:
# Fit the imputer + Boruta pipeline on the sample's features and target.
# The trailing semicolon keeps the notebook from echoing the returned object.
boruta_selector.fit(sample.features, sample.target);

In [None]:
# Display the fitted pipeline's `columns_original` attribute. The following
# cell passes it to `sample.select_features`, so it presumably identifies the
# input columns retained by Boruta -- TODO confirm against the sklearndf docs.
boruta_selector.columns_original

In [None]:
# Restrict the sample to the columns reported by the fitted selector pipeline.
sample_post_boruta = sample.select_features(boruta_selector.columns_original)

## Shap clustering visualization

In [None]:
# Model to inspect: imputation followed by a small, seeded random forest.
model = ModelPipelineDF(
    preprocessing=SimpleImputerDF(),
    predictor=RandomForestRegressorDF(n_estimators=10, random_state=0),
)
# Cross-validation scheme: 6 splits, each with a test ratio of one third.
cv = CircularCrossValidator(test_ratio=1 / 3, num_splits=6)
# Combine model, cross-validator and the Boruta-reduced sample, then build
# the inspector on top of it.
predictor = PredictorFitCV(model=model, cv=cv, sample=sample_post_boruta)
inspector = ModelInspector(predictor_fit=predictor)

In [None]:
# Cluster the features via the inspector; the result is what the following
# cells pass to DendrogramDrawer as `linkage_tree`.
# NOTE(review): going by the section heading, the clustering is presumably
# based on SHAP values -- confirm against ModelInspector.
linkage_tree = inspector.cluster_dependent_features()

In [None]:
# Draw the linkage tree with the feature-map style on a fresh 10x10 figure.
ax = plt.figure(figsize=(10, 10)).add_subplot(1, 1, 1)
style = DendrogramFeatMapStyle(ax)
DendrogramDrawer(
    title=TARGET,
    linkage_tree=linkage_tree,
    style=style,
).draw()

In [None]:
# Draw the same linkage tree with the line style on a fresh 10x10 figure.
ax = plt.figure(figsize=(10, 10)).add_subplot(1, 1, 1)
style = DendrogramLineStyle(ax)
DendrogramDrawer(
    title=TARGET,
    linkage_tree=linkage_tree,
    style=style,
).draw()

In [None]:
# Render the linkage tree with the report style, which is constructed
# without a matplotlib axes.
style = DendrogramReportStyle()
DendrogramDrawer(
    title=TARGET,
    linkage_tree=linkage_tree,
    style=style,
).draw()

## Shap clustering iteration

The next step is to use the dendrogram visualization to discard features.
A strategy here is to pick, as candidates for removal, features that are strongly correlated with others and have lower feature importance.
Based on the [heat map dendrogram](#HeatMap-dendrogram-style) we could decide to discard 
the features ``1stFlrSF`` and ``2ndFlrSF``.

In [None]:
# Features chosen for removal.
black_list = ["1stFlrSF", "2ndFlrSF"]
# Every remaining feature of the post-Boruta sample, in sorted order.
white_list = sorted(set(sample_post_boruta.feature_names).difference(black_list))
# Sample restricted to the features that are kept.
new_sample = sample_post_boruta.select_features(white_list)

In [None]:
# Re-use the same model and cross-validator on the reduced sample and build
# a new inspector for it.
predictor_new = PredictorFitCV(
    model=model,
    cv=cv,
    sample=new_sample,
)
inspector_new = ModelInspector(predictor_fit=predictor_new)

And we can visualize the new shap clustering:

In [None]:
# Cluster the features of the reduced sample; the next cell passes the
# result to DendrogramDrawer as `linkage_tree`.
linkage_tree_new = inspector_new.cluster_dependent_features()

In [None]:
# Draw the new linkage tree with the feature-map style on a fresh 10x10 figure.
ax = plt.figure(figsize=(10, 10)).add_subplot(1, 1, 1)
style = DendrogramFeatMapStyle(ax)
DendrogramDrawer(
    title=TARGET,
    linkage_tree=linkage_tree_new,
    style=style,
).draw()

If necessary, we can keep going and make a new round of feature selection based on shap clustering.