# IMPORTS

## Libraries

In [36]:
import warnings

import numpy  as np
import pandas as pd

import seaborn           as sns
import matplotlib.pyplot as plt

from scipy import stats as ss

from IPython.display      import Image
from IPython.core.display import HTML

from boruta                  import BorutaPy
from sklearn.ensemble        import RandomForestClassifier
from sklearn.preprocessing   import RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

## Load Dataset

In [37]:
# Read the training set from its Feather file (path is relative to the notebook's working directory)
dfRawTrain = pd.read_feather(path='00-Data/FeatherData/trainDatasetScaling.feather')

## Helper Functions

#### Jupyter Settings

In [38]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML('<style>.container { width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    seed = 0
    np.random.seed(seed)
    
    warnings.filterwarnings("ignore")
    
    sns.set()

In [39]:
# Apply the notebook-wide settings defined in jupyter_settings() above
jupyter_settings()  

Populating the interactive namespace from numpy and matplotlib


#### Functions

# FEATURE SELECTION

In [40]:
# Work on an independent (deep) copy so the loaded frame stays untouched
df06 = dfRawTrain.copy(deep=True)

## X,y Split

In [41]:
# Train: target is the 'Exited' column, predictors are every other column
yTrain = df06['Exited']
XTrain = df06.drop(columns='Exited')

## Boruta Feature Selection

In [42]:
# Training data for Boruta, passed as plain NumPy arrays
XTrainN = XTrain.to_numpy()
yTrainN = yTrain.values.ravel()

# define RandomForestClassifier (base estimator for Boruta)
# The number of trees is deliberately NOT set here: BorutaPy overrides the
# estimator's n_estimators with its own `n_estimators` parameter, so a value
# given to the forest would be silently ignored (the fitted repr reports
# n_estimators=1000).
rf = RandomForestClassifier(n_jobs=-1)

# define Boruta
# n_estimators is passed explicitly so the tree count actually used is visible
# in the code; 1000 is the value the fit was already running with.
boruta = BorutaPy(rf, n_estimators=1000, random_state=42)
boruta.fit(XTrainN, yTrainN)

BorutaPy(estimator=RandomForestClassifier(n_estimators=1000, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x281591BE740),
         random_state=RandomState(MT19937) at 0x281591BE740)

In [43]:
# Boruta's support mask as a plain list of booleans, used below to pick columns of XTrain
colsSelected = boruta.support_.tolist()

# Best Features: names of the columns flagged by the mask, in their original order
XTrainFS = XTrain
colsSelectedBoruta = XTrainFS.columns[boruta.support_].tolist()

# Every remaining column name (np.setdiff1d returns them sorted and de-duplicated)
colsNotSelectBoruta = list(np.setdiff1d(XTrainFS.columns, colsSelectedBoruta))

In [44]:
# Display the names of the features kept by Boruta
colsSelectedBoruta

['Age', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'AgeGroup_Midlife']