#### Implementing Forward Stepwise selection
Refernce: https://www.kdnuggets.com/2018/06/step-forward-feature-selection-python.html

In [2]:
from __future__ import division, print_function, unicode_literals
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns #visualization library
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.linear_model import LogisticRegression #problem will be solved with scikit
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis #linear discriminant analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis #quadratic discriminant analysis

import statsmodels.api as sm #to compute p-values
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

In [3]:
credit_df=pd.read_csv('../../data/Credit.csv',usecols=[i for i in range(1,12)] )

In [17]:
#credit_df.describe(include='object')
credit_df.describe()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,45.218885,4735.6,354.94,2.9575,55.6675,13.45,520.015
std,35.244273,2308.198848,154.724143,1.371275,17.249807,3.125207,459.758877
min,10.354,855.0,93.0,1.0,23.0,5.0,0.0
25%,21.00725,3088.0,247.25,2.0,41.75,11.0,68.75
50%,33.1155,4622.5,344.0,3.0,56.0,14.0,459.5
75%,57.47075,5872.75,437.25,4.0,70.0,16.0,863.0
max,186.634,13913.0,982.0,9.0,98.0,20.0,1999.0


In [4]:
credit_df['Active'] = np.where(credit_df['Balance']>0, 'Yes', 'No')  
credit_df.Active.describe()

count     400
unique      2
top       Yes
freq      310
Name: Active, dtype: object

In [6]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestClassifier

In [26]:
credit_df

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance,Active
0,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333,Yes
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903,Yes
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580,Yes
3,148.924,9504,681,3,36,11,Female,No,No,Asian,964,Yes
4,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.096,4100,307,3,32,13,Male,No,Yes,Caucasian,560,Yes
396,13.364,3838,296,5,65,17,Male,No,No,African American,480,Yes
397,57.872,4171,321,5,67,12,Female,No,Yes,Caucasian,138,Yes
398,37.728,2525,192,1,44,13,Male,No,Yes,Caucasian,0,No


In [20]:
feature_cols=["Income","Limit","Rating","Cards","Age","Education","Balance"]
predict_col=["Active"]
X_train, X_test, y_train, y_test = train_test_split(
    #credit_df.values[:,:-1],
    credit_df[feature_cols],
    credit_df[predict_col],
    #credit_df.values[:,-1:],
    test_size=0.25,
    random_state=42)

#y_train = y_train.ravel()
#y_test = y_test.ravel()

print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)


Training dataset shape: (300, 7) (300, 1)
Testing dataset shape: (100, 7) (100, 1)


In [10]:
## Splitting in Bins using Numba
## Reference:
# https://stackoverflow.com/questions/45273731/binning-column-with-python-pandas

from numba import njit

@njit
def cut(arr):
    bins = np.empty(arr.shape[0])
    for idx, x in enumerate(arr):
        if (x >= 0) & (x <= 580):
            bins[idx] = 0
        elif (x >= 580) & (x < 670):
            bins[idx] = 1
        elif (x >= 670) & (x < 740):
            bins[idx] = 2
        elif (x >= 740) & (x < 800):
            bins[idx] = 3
        elif (x >= 800) & (x < 850):
            bins[idx] = 4
    return bins
#credit_df['binned_balance']=pd.cut
cut(credit_df['Balance'].to_numpy())

array([ 0.00000000e+000, -1.36311572e+057,  0.00000000e+000,
       -1.36311572e+057,  0.00000000e+000, -1.36311572e+057,
        0.00000000e+000, -1.36311572e+057,  0.00000000e+000,
       -1.36311572e+057, -1.36311572e+057,  0.00000000e+000,
        0.00000000e+000, -1.36311572e+057,  0.00000000e+000,
        0.00000000e+000,  0.00000000e+000,  0.00000000e+000,
       -1.36311572e+057, -1.36311572e+057,  0.00000000e+000,
       -1.36311572e+057,  0.00000000e+000,  0.00000000e+000,
        0.00000000e+000,  2.00000000e+000,  1.00000000e+000,
        0.00000000e+000, -1.36311572e+057, -1.36311572e+057,
       -1.36311572e+057,  0.00000000e+000,  0.00000000e+000,
        0.00000000e+000,  0.00000000e+000,  0.00000000e+000,
        3.00000000e+000,  0.00000000e+000,  0.00000000e+000,
        0.00000000e+000,  0.00000000e+000,  0.00000000e+000,
        0.00000000e+000,  0.00000000e+000,              nan,
        0.00000000e+000,              nan,  3.00000000e+000,
        0.00000000e+000,

In [11]:
credit_df['Active'].min()

'No'

In [21]:
# Build RF classifier to use in feature selection
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# Build step forward feature selection
sfs1 = sfs(clf,
           k_features=5,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)

# Perform SFFS
sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train

In [22]:
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

[0, 3, 4, 5, 6]


In [29]:
feat_cols=list(sfs1.k_feature_names_)
print(feat_cols)


['Income', 'Cards', 'Age', 'Education', 'Balance']


In [40]:
X_train.loc[:,feat_cols]

Unnamed: 0,Income,Cards,Age,Education,Balance
247,36.364,3,50,19,0
110,34.537,3,57,17,47
16,53.598,3,73,17,0
66,113.829,4,38,13,1388
153,92.112,3,32,17,0
...,...,...,...,...,...
71,58.781,2,81,12,1103
106,16.819,2,74,15,0
270,15.866,1,39,13,136
348,13.433,3,70,14,0


In [41]:
# Build full model with selected features
clf = RandomForestClassifier(n_estimators=1000, random_state=42, max_depth=4)
clf.fit(X_train.loc[:, feat_cols], y_train)

y_train_pred = clf.predict(X_train.loc[:, feat_cols])
print('Training accuracy on selected features: %.3f' % accuracy_score(y_train, y_train_pred))

y_test_pred = clf.predict(X_test.loc[:, feat_cols])
print('Testing accuracy on selected features: %.3f' % accuracy_score(y_test, y_test_pred))


  This is separate from the ipykernel package so we can avoid doing imports until


Training accuracy on selected features: 1.000
Testing accuracy on selected features: 0.990
