# Introduction

Our input is the dataset resulting from the Exploratory Data Analysis and the Feature Engineering performed on the original datasets.

# Notebooks & dashboard

There are 5 different notebooks:
* __P7_EDA__: focusing on Exploratory Data Analysis,
* __P7_FE__: focusing on Feature engineering and a first model-agnostic Feature Selection,
* __Current: P7_FS__: focusing on Feature Selection,
* __P7_Model__ : focusing on scoring with model evaluation,
* __P7_Interpretation__ : focusing on model interpretation, <br/>
And a dashboard Python file.

# Foreword
In this FS notebook, we perform a model-agnostic Feature Selection in order to focus on the most valuable features. Many alternative techniques exist; we chose this one because it yields a reduced dataset that provides models of any kind with the same inputs. We could also consider a more precise feature selection step dedicated to the best model found.

In [1]:
import pickle

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from BorutaShap import BorutaShap

In [3]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd
# raise the row limit pandas uses before truncating displayed frames
pd.set_option('display.max_rows', 999)

## Result of Feature Engineering

In [4]:
# Load the dataset resulting from the feature-engineering step.
# NOTE: pickle.load can execute arbitrary code contained in the file -- only
# open pickles that were generated by this project's own notebooks.
# The handle gets its own name so that `df` is not left bound to a closed file.
with open("Data/application_miss.pkl", mode="rb") as pkl_file:
    application = pickle.load(pkl_file)

In [5]:
# training rows are those with a known TARGET; rows without a label are left out
train = application.loc[application['TARGET'].notna()]

In [6]:
# move the application id (SK_ID_CURR) into the row index so it is no longer a column
train = train.set_index(keys='SK_ID_CURR')

# "Model agnostic" Boruta Feature Selection

In [7]:
# split the labelled frame into the target vector and the feature matrix
y_ = train['TARGET']
X_ = train.drop(columns='TARGET')

In [8]:
# drop the reference to the labelled frame: X_ and y_ were derived from it above
del train

In [9]:
# Work on a stratified sample: the 90% "test" share is discarded (`_`) and only
# the remaining share is kept for feature selection; the seed fixes the draw.
X_sample, _, y_sample, _ = train_test_split(
    X_, y_,
    test_size=0.9,
    stratify=y_,
    random_state=42,
)

In [10]:
# only the sample is used from here on; drop the full feature matrix and target
del X_, y_

In [11]:
# Replace missing values with the per-column mean.
# Caveat: the means are learned from this sample, so the imputed values
# depend on which rows were drawn.
# The result is a NumPy array (column names are no longer attached).
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X_sample = imputer.fit_transform(X_sample)

In [12]:
# Base estimator for Boruta: a shallow random forest using all available cores.
# class_weight='balanced' weights the classes inversely to their frequency in y
# (it re-weights samples, it does not resample them).
rf = RandomForestClassifier(
    max_depth=4,
    class_weight='balanced',
    n_jobs=-1,
)

In [13]:
# Boruta feature selector wrapped around the forest above:
# - n_estimators='auto' lets Boruta choose the number of trees,
# - at most 25 iterations are run, so some features may remain tentative,
# - verbose=2 prints the confirmed/tentative/rejected counts at every iteration.
feat_selector = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    max_iter=25,
    random_state=1,
    verbose=2,
)

In [14]:
# run Boruta on the imputed sample; every feature ends up confirmed, tentative or rejected
feat_selector.fit(X=X_sample, y=y_sample)

Iteration: 	1 / 25
Confirmed: 	0
Tentative: 	950
Rejected: 	0
Iteration: 	2 / 25
Confirmed: 	0
Tentative: 	950
Rejected: 	0
Iteration: 	3 / 25
Confirmed: 	0
Tentative: 	950
Rejected: 	0
Iteration: 	4 / 25
Confirmed: 	0
Tentative: 	950
Rejected: 	0
Iteration: 	5 / 25
Confirmed: 	0
Tentative: 	950
Rejected: 	0
Iteration: 	6 / 25
Confirmed: 	0
Tentative: 	950
Rejected: 	0
Iteration: 	7 / 25
Confirmed: 	0
Tentative: 	950
Rejected: 	0
Iteration: 	8 / 25
Confirmed: 	258
Tentative: 	176
Rejected: 	516
Iteration: 	9 / 25
Confirmed: 	258
Tentative: 	176
Rejected: 	516
Iteration: 	10 / 25
Confirmed: 	258
Tentative: 	176
Rejected: 	516
Iteration: 	11 / 25
Confirmed: 	258
Tentative: 	176
Rejected: 	516
Iteration: 	12 / 25
Confirmed: 	265
Tentative: 	137
Rejected: 	548
Iteration: 	13 / 25
Confirmed: 	265
Tentative: 	137
Rejected: 	548
Iteration: 	14 / 25
Confirmed: 	265
Tentative: 	137
Rejected: 	548
Iteration: 	15 / 25
Confirmed: 	265
Tentative: 	137
Rejected: 	548
Iteration: 	16 / 25
Confirmed: 	

BorutaPy(estimator=RandomForestClassifier(class_weight='balanced', max_depth=4,
                                          n_estimators=686, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x1AAE99DB6A8),
         max_iter=25, n_estimators='auto',
         random_state=RandomState(MT19937) at 0x1AAE99DB6A8, verbose=2)

In [15]:
# Build the feature ranking table and persist the raw Boruta ranks.
# Feature names are read from the column index directly instead of copying the
# whole frame through set_index().drop(); the order is unchanged because only
# SK_ID_CURR and TARGET are removed.
features = application.columns.drop(['SK_ID_CURR', 'TARGET'])
ranks = feat_selector.ranking_

# Guard before writing anything: ranks are positional, so they only map back to
# names if no column was lost on the way (e.g. dropped by the imputer because
# it was entirely missing in the sample).
if len(features) != len(ranks):
    raise ValueError(
        f"{len(ranks)} Boruta ranks for {len(features)} feature names: "
        "columns were dropped between the dataset and the selector input"
    )

with open("Data/boruta_ranking_mean.pkl", mode="wb") as ranking_file:
    pickle.dump(ranks, ranking_file)

# one row per feature, lowest (best) rank first
df = pd.DataFrame({'feature': features, 'rank': ranks})
df = df.sort_values(by=['rank'], ascending=True)

In [16]:
# rebuild the (unsorted) feature/rank table from the names and ranks computed above
df = pd.DataFrame(dict(feature=features, rank=ranks))

In [17]:
# order the table by rank, lowest rank first
df = df.sort_values(by='rank')

In [18]:
# sanity check: `df` should be the ranking DataFrame at this point
type(df)

pandas.core.frame.DataFrame

In [19]:
# export the sorted ranking as CSV (the row index is written as the first column)
df.to_csv(path_or_buf='Data/boruta_ranking_mean.csv')

In [20]:
# Keep only the features Boruta confirmed and store the reduced dataset.
# Feature names are read from the column index directly (no copy of the full
# frame); feat_selector.support_ is then used as a mask over those names.
feature_names = application.columns.drop(['SK_ID_CURR', 'TARGET'])
support = feature_names[feat_selector.support_]
with open("Data/boruta_support_mean.pkl", mode="wb") as support_file:
    pickle.dump(support, support_file)

# id and target first, then the confirmed features; all rows of `application`
# are kept, including those without a TARGET
selected = ['SK_ID_CURR', 'TARGET'] + support.tolist()
sel_app = application.filter(selected)

# the handle gets its own name: binding it to `df` would overwrite the ranking
# DataFrame with a closed file object
with open("Data/sel_app_mean.pkl", mode="wb") as sel_app_file:
    pickle.dump(sel_app, sel_app_file)

In [21]:
# preview the reduced dataset (id, target and the selected features)
sel_app

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_EMPLOYED,DAYS_REGISTRATION,...,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_ATM_CURRENT_VAR,CC_CNT_DRAWINGS_CURRENT_MAX,CC_CNT_DRAWINGS_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_SUM,CC_CNT_DRAWINGS_CURRENT_VAR,CC_CNT_DRAWINGS_POS_CURRENT_MAX,CC_CNT_DRAWINGS_POS_CURRENT_MEAN,N_CC_USE_LATE_MONTH_6,N_CC_USE_LATE_MONTH_2
0,100002,1.0,1,202500.0,406597.5,24700.5,351000.0,0.018801,-637.0,-3648.0,...,,,,,,,,,,
1,100003,0.0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-1188.0,-1186.0,...,,,,,,,,,,
2,100004,0.0,1,67500.0,135000.0,6750.0,135000.0,0.010032,-225.0,-4260.0,...,,,,,,,,,,
3,100006,0.0,0,135000.0,312682.5,29686.5,297000.0,0.008019,-3039.0,-9833.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000
4,100007,0.0,1,121500.0,513000.0,21865.5,513000.0,0.028663,-3038.0,-4311.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356246,456221,,0,121500.0,412560.0,17473.5,270000.0,0.002042,-5169.0,-9094.0,...,,,,,,,,,,
356247,456222,,0,157500.0,622413.0,31909.5,495000.0,0.035792,-1149.0,-3015.0,...,,,,,,,,,,
356248,456223,,0,202500.0,315000.0,33205.5,315000.0,0.026392,-3037.0,-2681.0,...,,,,,,,,,,
356249,456224,,1,225000.0,450000.0,25128.0,450000.0,0.018850,-2731.0,-1461.0,...,,,,,,,,,,


In [22]:
# both frames have been written to disk; drop the references to them
del sel_app, application