In [48]:
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sklearn.model_selection import GridSearchCV, train_test_split
import pandas as pd, seaborn as sns, matplotlib.pyplot as plt, numpy as np
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, RandomForestRegressor
from sklearn import svm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS, ExhaustiveFeatureSelector as EFS

In [49]:
data = load_wine()

In [50]:
print(data.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [51]:
X = pd.DataFrame(data.data)
y = data.target

In [52]:
X.columns = data.feature_names
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

Step forward feature selection

In [54]:
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1,), k_features=8,
          forward= True,
          floating=False,
          verbose=2,
          scoring='accuracy',
          cv = 4,
          n_jobs= -1
          ).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   22.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   22.8s finished

[2022-04-27 20:12:42] Features: 1/8 -- score: 0.7674603174603174[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    8.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    8.3s finished

[2022-04-27 20:12:50] Features: 2/8 -- score: 0.9718253968253968[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    8.9s finished

[2022-04-27 20:12:59] Features: 3/8 -- score: 0.9859126984126985[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.4s finished

[2022-04-27 20:13:09] Features: 4/8 -- score: 0.978968

In [55]:
sfs.k_feature_names_

('alcohol',
 'ash',
 'magnesium',
 'flavanoids',
 'proanthocyanins',
 'color_intensity',
 'od280/od315_of_diluted_wines',
 'proline')

In [56]:
sfs.k_feature_names_

('alcohol',
 'ash',
 'magnesium',
 'flavanoids',
 'proanthocyanins',
 'color_intensity',
 'od280/od315_of_diluted_wines',
 'proline')

In [57]:
sfs.k_score_

0.9791666666666666

In [58]:
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(6,)","[0.7222222222222222, 0.8333333333333334, 0.742...",0.76746,"(flavanoids,)",0.06709,0.041853,0.024164
2,"(6, 9)","[0.9444444444444444, 1.0, 0.9714285714285714, ...",0.971825,"(flavanoids, color_intensity)",0.031492,0.019646,0.011343
3,"(4, 6, 9)","[0.9722222222222222, 1.0, 0.9714285714285714, ...",0.985913,"(magnesium, flavanoids, color_intensity)",0.022586,0.01409,0.008135
4,"(4, 6, 9, 12)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.978968,"(magnesium, flavanoids, color_intensity, proline)",0.019471,0.012147,0.007013
5,"(2, 4, 6, 9, 12)","[0.9444444444444444, 0.9722222222222222, 0.971...",0.972024,"(ash, magnesium, flavanoids, color_intensity, ...",0.03149,0.019645,0.011342
6,"(2, 4, 6, 8, 9, 12)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.978968,"(ash, magnesium, flavanoids, proanthocyanins, ...",0.019471,0.012147,0.007013
7,"(0, 2, 4, 6, 8, 9, 12)","[0.9444444444444444, 0.9722222222222222, 1.0, ...",0.979167,"(alcohol, ash, magnesium, flavanoids, proantho...",0.03692,0.023032,0.013298
8,"(0, 2, 4, 6, 8, 9, 11, 12)","[0.9444444444444444, 0.9722222222222222, 1.0, ...",0.979167,"(alcohol, ash, magnesium, flavanoids, proantho...",0.03692,0.023032,0.013298


In [59]:
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1,), k_features=(1,13),
          forward= True,
          floating=False,
          verbose=2,
          scoring='accuracy',
          cv = 4,
          n_jobs= -1
          ).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   19.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   19.4s finished

[2022-04-27 20:14:24] Features: 1/13 -- score: 0.7674603174603174[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   14.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   14.7s finished

[2022-04-27 20:14:39] Features: 2/13 -- score: 0.9718253968253968[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:   13.8s finished

[2022-04-27 20:14:53] Features: 3/13 -- score: 0.9859126984126985[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.6s finished

[2022-04-27 20:15:04] Features: 4/13 -- score: 0.97

In [60]:
sfs.k_score_

0.9861111111111112

In [61]:
sfs.k_feature_names_

('alcohol',
 'ash',
 'magnesium',
 'flavanoids',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline')

In [62]:
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(6,)","[0.7222222222222222, 0.8333333333333334, 0.742...",0.76746,"(flavanoids,)",0.06709,0.041853,0.024164
2,"(6, 9)","[0.9444444444444444, 1.0, 0.9714285714285714, ...",0.971825,"(flavanoids, color_intensity)",0.031492,0.019646,0.011343
3,"(4, 6, 9)","[0.9722222222222222, 1.0, 0.9714285714285714, ...",0.985913,"(magnesium, flavanoids, color_intensity)",0.022586,0.01409,0.008135
4,"(4, 6, 9, 12)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.978968,"(magnesium, flavanoids, color_intensity, proline)",0.019471,0.012147,0.007013
5,"(2, 4, 6, 9, 12)","[0.9444444444444444, 0.9722222222222222, 0.971...",0.972024,"(ash, magnesium, flavanoids, color_intensity, ...",0.03149,0.019645,0.011342
6,"(2, 4, 6, 8, 9, 12)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.978968,"(ash, magnesium, flavanoids, proanthocyanins, ...",0.019471,0.012147,0.007013
7,"(0, 2, 4, 6, 8, 9, 12)","[0.9444444444444444, 0.9722222222222222, 1.0, ...",0.979167,"(alcohol, ash, magnesium, flavanoids, proantho...",0.03692,0.023032,0.013298
8,"(0, 2, 4, 6, 8, 9, 11, 12)","[0.9444444444444444, 0.9722222222222222, 1.0, ...",0.979167,"(alcohol, ash, magnesium, flavanoids, proantho...",0.03692,0.023032,0.013298
9,"(0, 2, 4, 6, 8, 9, 10, 11, 12)","[0.9722222222222222, 0.9722222222222222, 1.0, ...",0.986111,"(alcohol, ash, magnesium, flavanoids, proantho...",0.022264,0.013889,0.008019
10,"(0, 1, 2, 4, 6, 8, 9, 10, 11, 12)","[0.9722222222222222, 0.9722222222222222, 1.0, ...",0.986111,"(alcohol, malic_acid, ash, magnesium, flavanoi...",0.022264,0.013889,0.008019


Step backward selection

In [63]:
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1,), k_features=(1,13),
          forward= False,
          floating=False,
          verbose=2,
          scoring='accuracy',
          cv = 4,
          n_jobs= -1
          ).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   10.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   10.6s finished

[2022-04-27 20:16:22] Features: 12/1 -- score: 0.9861111111111112[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   11.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   11.3s finished

[2022-04-27 20:16:33] Features: 11/1 -- score: 0.9861111111111112[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:   13.8s finished

[2022-04-27 20:16:47] Features: 10/1 -- score: 0.9791666666666666[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.2s finished

[2022-04-27 20:16:59] Features: 9/1 -- score: 0.986

In [64]:
sfs.k_score_

0.9861111111111112

In [65]:
sfs.k_feature_names_

('alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'proline')

In [66]:
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
13,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[0.9444444444444444, 0.9722222222222222, 1.0, ...",0.972024,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.03149,0.019645,0.011342
12,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12)","[0.9722222222222222, 0.9722222222222222, 1.0, ...",0.986111,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.022264,0.013889,0.008019
11,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12)","[0.9722222222222222, 0.9722222222222222, 1.0, ...",0.986111,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.022264,0.013889,0.008019
10,"(0, 1, 2, 3, 4, 5, 6, 7, 9, 12)","[0.9444444444444444, 0.9722222222222222, 1.0, ...",0.979167,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.03692,0.023032,0.013298
9,"(0, 1, 2, 3, 4, 5, 6, 7, 9)","[0.9722222222222222, 0.9722222222222222, 1.0, ...",0.986111,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.022264,0.013889,0.008019
8,"(0, 1, 2, 3, 4, 6, 7, 9)","[0.9722222222222222, 1.0, 1.0, 0.9714285714285...",0.985913,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.022586,0.01409,0.008135
7,"(0, 1, 2, 3, 4, 6, 9)","[0.9722222222222222, 0.9722222222222222, 1.0, ...",0.978968,"(alcohol, malic_acid, ash, alcalinity_of_ash, ...",0.019471,0.012147,0.007013
6,"(0, 2, 3, 4, 6, 9)","[0.9722222222222222, 1.0, 0.9714285714285714, ...",0.985913,"(alcohol, ash, alcalinity_of_ash, magnesium, f...",0.022586,0.01409,0.008135
5,"(0, 2, 4, 6, 9)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.978968,"(alcohol, ash, magnesium, flavanoids, color_in...",0.019471,0.012147,0.007013
4,"(0, 2, 6, 9)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.971825,"(alcohol, ash, flavanoids, color_intensity)",0.000636,0.000397,0.000229


Exhaustie Feature selection

In [67]:
# efs = EFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1,), 
#           min_features= 2,
#           max_features=10,
#           scoring='accuracy',
#           cv = 4,
#           n_jobs= -1
#           ).fit(X_train, y_train)

In [87]:
dataset = pd.read_csv('../../dataset/heart_2020_cleaned.csv')
dataset.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [88]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [89]:
le = LabelEncoder()
dataset['HeartDisease'] = le.fit_transform(dataset['HeartDisease'])
dataset.Smoking = le.fit_transform(dataset.Smoking)
dataset.AlcoholDrinking = le.fit_transform(dataset.AlcoholDrinking)
dataset.Stroke = le.fit_transform(dataset.Stroke)
dataset.DiffWalking = le.fit_transform(dataset.DiffWalking)
dataset.Sex = le.fit_transform(dataset.Sex)
dataset.Race = le.fit_transform(dataset.Race)
dataset.PhysicalActivity = le.fit_transform(dataset.PhysicalActivity)
dataset.GenHealth = le.fit_transform(dataset.GenHealth)
dataset.Asthma = le.fit_transform(dataset.Diabetic)
dataset.Diabetic = le.fit_transform(dataset.Diabetic)
dataset.KidneyDisease = le.fit_transform(dataset.KidneyDisease)
dataset.SkinCancer = le.fit_transform(dataset.SkinCancer)
dataset.AgeCategory = le.fit_transform(dataset.AgeCategory)

In [90]:
X2 = dataset.drop(['SkinCancer'], axis= 1)
y2 = dataset.SkinCancer

In [94]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size= 0.3, random_state= 30)

In [95]:
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1,), k_features=(1,16),
          forward= True,
          floating=False,
          verbose=2,
          scoring='accuracy',
          cv = 4,
          n_jobs= -1
          ).fit(X2_train, y2_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:  8.8min finished

[2022-04-27 21:12:09] Features: 1/16 -- score: 0.9071456650704024[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  8.3min finished

[2022-04-27 21:20:30] Features: 2/16 -- score: 0.9071456650704024[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  7.8min finished

[2022-04-27 21:28:20] Features: 3/16 -- score: 0.9071456650704024[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:  8.2min finished

[2022-04-27 21:36:30] Features: 4/16 -- score: 0.9071456650704024[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:  8.7min remaining: