### Backward elimination –
 
* This method is also an iterative approach where we initially start with all features and after each iteration, we remove the least significant feature. 
* The stopping criterion is till no improvement in the performance of the model is observed after the feature is removed.



In [2]:
# installing mlxtend
!pip install mlxtend



In [61]:
# importing libraries

import pandas as pd

from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [62]:
df = df = pd.read_csv("C:\\Users\KIIT\OneDrive - kiit.ac.in\Desktop\Books\datasets\wine.csv")
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,0
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,0
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,0
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,0
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,0


In [63]:
df.shape

(178, 14)

In [64]:
df.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
target                          0
dtype: int64

In [65]:
df[df.isna().any(axis = 1)]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target


### Note
    * if you write y = df[['target']] you will get an error while fitting it into the Sequential Feature Selector cuz 
    It will be Converted into 2D array so we need that it should be in 1D array so that we can fit it into the model.

In [66]:
x = df.drop('target', axis = 1)
y = df['target']

In [67]:
x.head(2)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050


In [68]:
y.head(2)

0    0
1    0
Name: target, dtype: int64

In [69]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)

In [70]:
print('x_train_shape: ', x_train.shape, '\nx_test_shape: ', x_test.shape)

x_train_shape:  (142, 13) 
x_test_shape:  (36, 13)


In [91]:
rfc = RandomForestClassifier()
backward_feature_eliminator = sfs(rfc, k_features = 9 , forward = False,
                                                        verbose = 2,
                                                        scoring = 'accuracy' )

In [92]:
backward_feature_eliminator.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:   21.8s finished

[2022-11-18 05:50:53] Features: 12/9 -- score: 0.9857142857142858[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   19.3s finished

[2022-11-18 05:51:12] Features: 11/9 -- score: 0.9859605911330049[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   18.7s finished

[2022-11-18 05:51:31] Features: 10/9 -- score: 0.9788177339901478[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

In [93]:
pd.DataFrame.from_dict(backward_feature_eliminator.get_metric_dict() ).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
13,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[1.0, 0.9655172413793104, 0.9285714285714286, ...",0.964532,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.041062,0.031948,0.015974
12,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12)","[1.0, 1.0, 0.9642857142857143, 0.9642857142857...",0.985714,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.022488,0.017496,0.008748
11,"(0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12)","[1.0, 0.9655172413793104, 0.9642857142857143, ...",0.985961,"(alcohol, ash, alcalinity_of_ash, magnesium, t...",0.022106,0.017199,0.0086
10,"(0, 2, 3, 4, 6, 7, 8, 9, 10, 12)","[1.0, 0.9655172413793104, 0.9642857142857143, ...",0.978818,"(alcohol, ash, alcalinity_of_ash, magnesium, f...",0.022237,0.017301,0.008651
9,"(0, 2, 3, 4, 6, 7, 9, 10, 12)","[1.0, 0.9655172413793104, 0.9642857142857143, ...",0.978818,"(alcohol, ash, alcalinity_of_ash, magnesium, f...",0.022237,0.017301,0.008651


In [76]:
backward_feature_eliminator.k_feature_idx_

(0, 1, 2, 4, 6, 7, 8, 9, 12)

In [77]:
backward_feature_eliminator.k_feature_names_

('alcohol',
 'malic_acid',
 'ash',
 'magnesium',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'proline')

In [80]:
backward_feature_eliminator.k_score_

0.9857142857142858

### Now we let the Sequential Feature Selector itself decides which columns and how many columns to remove.


In [82]:
backward_feature_eliminator = sfs(rfc, k_features =( 1,13), forward = False, floating = False ,scoring = 'accuracy'
                                 , verbose = 2)


In [83]:
backward_feature_eliminator.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    7.7s finished

[2022-11-18 05:45:49] Features: 12/1 -- score: 0.9857142857142858[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    7.2s finished

[2022-11-18 05:45:56] Features: 11/1 -- score: 0.9785714285714286[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    6.7s finished

[2022-11-18 05:46:03] Features: 10/1 -- score: 0.9788177339901478[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

In [89]:
pd.DataFrame.from_dict(backward_feature_eliminator.get_metric_dict() ).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
13,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[1.0, 0.9655172413793104, 0.9285714285714286, ...",0.971675,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.034272,0.026665,0.013332
12,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[1.0, 1.0, 0.9642857142857143, 0.9642857142857...",0.985714,"(alcohol, malic_acid, ash, magnesium, total_ph...",0.022488,0.017496,0.008748
11,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11)","[1.0, 1.0, 0.9642857142857143, 0.9285714285714...",0.978571,"(alcohol, malic_acid, ash, magnesium, total_ph...",0.036723,0.028571,0.014286
10,"(0, 1, 2, 4, 5, 6, 8, 9, 10, 11)","[0.9655172413793104, 1.0, 0.9642857142857143, ...",0.978818,"(alcohol, malic_acid, ash, magnesium, total_ph...",0.022237,0.017301,0.008651
9,"(0, 1, 2, 4, 5, 6, 8, 9, 10)","[0.9655172413793104, 1.0, 0.9642857142857143, ...",0.978818,"(alcohol, malic_acid, ash, magnesium, total_ph...",0.022237,0.017301,0.008651
8,"(0, 1, 2, 4, 5, 6, 9, 10)","[1.0, 1.0, 1.0, 0.9642857142857143, 0.96428571...",0.985714,"(alcohol, malic_acid, ash, magnesium, total_ph...",0.022488,0.017496,0.008748
7,"(0, 1, 4, 5, 6, 9, 10)","[1.0, 1.0, 1.0, 0.9642857142857143, 1.0]",0.992857,"(alcohol, malic_acid, magnesium, total_phenols...",0.018361,0.014286,0.007143
6,"(0, 1, 4, 6, 9, 10)","[0.9655172413793104, 1.0, 1.0, 0.9642857142857...",0.985961,"(alcohol, malic_acid, magnesium, flavanoids, c...",0.022106,0.017199,0.0086
5,"(0, 1, 4, 6, 10)","[1.0, 0.9310344827586207, 0.9642857142857143, ...",0.979064,"(alcohol, malic_acid, magnesium, flavanoids, hue)",0.03562,0.027713,0.013857
4,"(0, 4, 6, 10)","[0.9655172413793104, 1.0, 0.9642857142857143, ...",0.964532,"(alcohol, magnesium, flavanoids, hue)",0.029039,0.022593,0.011297


In [84]:
backward_feature_eliminator.k_feature_idx_

(0, 1, 4, 5, 6, 9, 10)

In [85]:
backward_feature_eliminator.k_feature_names_

('alcohol',
 'malic_acid',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'color_intensity',
 'hue')

In [86]:
backward_feature_eliminator.k_score_

0.9928571428571429