### Forward selection – 

* This method is an iterative approach where we initially start with an empty set of features and keep adding a feature which best improves our model after each iteration. 
* The stopping criterion is till the addition of a new variable does not improve the performance of the model.


In [1]:
# We have to firt install mlxtend 

! pip install mlxtend



In [2]:
# importing libraries

import pandas as pd

from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv("wine.csv")
df.sample(3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
8,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045,0
127,11.79,2.13,2.78,28.5,92,2.13,2.24,0.58,1.76,3.0,0.97,2.44,466,1
61,12.64,1.36,2.02,16.8,100,2.02,1.41,0.53,0.62,5.75,0.98,1.59,450,1


1. We are going to see out of 13 independent varaibles how many of them are significant for our target variable and we are going   to find those features which can influence the model with respect to  the target variable.

In [4]:
df.shape

(178, 14)

In [5]:
df.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
target                          0
dtype: int64

In [6]:
# Checking for any null value in a row 
df[df.isna().any(axis = 1)]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target


### Note
    * If we write y = df[['target']] we will get an error while fitting it into the Sequential Feature Selector cuz 
    It will be Converted into 2D array so we need that it should be in 1D array so that we can fit it into the model.

In [12]:
# splitting the data 
x = df.iloc[:, 0:13]
y = df.iloc[:, -1]

In [13]:
X.head(2)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050


In [14]:
y.head(2)

0    0
1    0
Name: target, dtype: int64

In [15]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)

In [16]:
print('x_train_shape: ', x_train.shape, '\nx_test_shape: ', x_test.shape)

x_train_shape:  (142, 13) 
x_test_shape:  (36, 13)


* k_features = is the number of features we want to select for the model .
* forward = True  , if it be equal to False then it will perform the Backward elimination.
* verbose = 2  will give us how the model is fitting and will give us the results while it's fitting to the model.
* scoring = accuracy   cuz we wanna check the accuracy matrix , so it can be different for different model which you wanna fit.
* CV  = Cross validation and here we wanna do 5 times cross validation 

In [17]:
rfc = RandomForestClassifier()
forward_feature_selection = sfs(rfc,k_features= 6, forward=True, verbose=2, scoring= "accuracy", cv= 5)

In [19]:
forward_feature_selection.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:   24.2s finished

[2022-11-18 06:02:19] Features: 1/6 -- score: 0.7672413793103449[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   20.1s finished

[2022-11-18 06:02:39] Features: 2/6 -- score: 0.9091133004926109[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   18.4s finished

[2022-11-18 06:02:58] Features: 3/6 -- score: 0.9719211822660098[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [20]:
# We simply create a pandas data frame to understand better what was written in the each iteration which we have done above.

pd.DataFrame.from_dict(forward_feature_selection.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(6,)","[0.7586206896551724, 0.8275862068965517, 0.75,...",0.767241,"(flavanoids,)",0.048633,0.037838,0.018919
2,"(6, 9)","[0.8620689655172413, 0.8620689655172413, 0.928...",0.909113,"(flavanoids, color_intensity)",0.052138,0.040565,0.020282
3,"(0, 6, 9)","[0.9655172413793104, 0.9655172413793104, 0.964...",0.971921,"(alcohol, flavanoids, color_intensity)",0.018059,0.01405,0.007025
4,"(0, 4, 6, 9)","[0.9310344827586207, 1.0, 0.9285714285714286, ...",0.971921,"(alcohol, magnesium, flavanoids, color_intensity)",0.044212,0.034398,0.017199
5,"(0, 4, 5, 6, 9)","[0.9655172413793104, 1.0, 0.9285714285714286, ...",0.978818,"(alcohol, magnesium, total_phenols, flavanoids...",0.036569,0.028452,0.014226
6,"(0, 4, 5, 6, 9, 10)","[1.0, 1.0, 0.9642857142857143, 1.0, 1.0]",0.992857,"(alcohol, magnesium, total_phenols, flavanoids...",0.018361,0.014286,0.007143


In [21]:
# The significant features that can influence the model

forward_feature_selection.k_feature_idx_

(0, 4, 5, 6, 9, 10)

In [22]:
forward_feature_selection.k_feature_names_

('alcohol',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'color_intensity',
 'hue')

In [23]:
# checking the accuracy of the model

forward_feature_selection.k_score_

0.9928571428571429

### Now we let the Sequential Feature Selector itself decides which columns and how many columns to select.


In [29]:
forward_feature_selection = sfs(rfc,k_features= (1, 13), forward=True, verbose=2, scoring= "accuracy", cv= 5)

In [30]:
forward_feature_selection.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    7.8s finished

[2022-11-18 06:15:13] Features: 1/13 -- score: 0.7672413793103449[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    7.1s finished

[2022-11-18 06:15:20] Features: 2/13 -- score: 0.9231527093596059[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    6.5s finished

[2022-11-18 06:15:26] Features: 3/13 -- score: 0.9650246305418719[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

In [31]:
pd.DataFrame.from_dict(forward_feature_selection.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(6,)","[0.7586206896551724, 0.8275862068965517, 0.75,...",0.767241,"(flavanoids,)",0.048633,0.037838,0.018919
2,"(6, 9)","[0.896551724137931, 0.8620689655172413, 0.9642...",0.923153,"(flavanoids, color_intensity)",0.050934,0.039628,0.019814
3,"(4, 6, 9)","[0.9310344827586207, 0.9655172413793104, 0.964...",0.965025,"(magnesium, flavanoids, color_intensity)",0.028041,0.021817,0.010909
4,"(0, 4, 6, 9)","[1.0, 1.0, 0.9285714285714286, 1.0, 1.0]",0.985714,"(alcohol, magnesium, flavanoids, color_intensity)",0.036723,0.028571,0.014286
5,"(0, 4, 6, 7, 9)","[0.9655172413793104, 0.9655172413793104, 0.964...",0.979064,"(alcohol, magnesium, flavanoids, nonflavanoid_...",0.021979,0.0171,0.00855
6,"(0, 4, 6, 7, 9, 10)","[1.0, 1.0, 0.9285714285714286, 1.0, 1.0]",0.985714,"(alcohol, magnesium, flavanoids, nonflavanoid_...",0.036723,0.028571,0.014286
7,"(0, 4, 6, 7, 8, 9, 10)","[0.9655172413793104, 1.0, 0.9642857142857143, ...",0.985961,"(alcohol, magnesium, flavanoids, nonflavanoid_...",0.022106,0.017199,0.0086
8,"(0, 1, 4, 6, 7, 8, 9, 10)","[1.0, 1.0, 0.9642857142857143, 0.9642857142857...",0.985714,"(alcohol, malic_acid, magnesium, flavanoids, n...",0.022488,0.017496,0.008748
9,"(0, 1, 2, 4, 6, 7, 8, 9, 10)","[0.9310344827586207, 1.0, 1.0, 0.9642857142857...",0.979064,"(alcohol, malic_acid, ash, magnesium, flavanoi...",0.03562,0.027713,0.013857
10,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 10)","[1.0, 1.0, 0.9642857142857143, 0.9642857142857...",0.978571,"(alcohol, malic_acid, ash, magnesium, total_ph...",0.022488,0.017496,0.008748


In [26]:
forward_feature_selection.k_feature_idx_

(0, 5, 6, 7, 9, 10)

In [27]:
forward_feature_selection.k_feature_names_

('alcohol',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'color_intensity',
 'hue')

In [28]:
forward_feature_selection.k_score_

0.9928571428571429