## Feature Selection for diabetes multi-classifier

In [53]:
import boto3
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
pd.set_option('display.max_columns', 50)

In [54]:
## Reading the training and testing csv files from S3 into pandas data-frames

## Connecting to the S3 bucket that holds both files
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Keys (file names inside the bucket) of the two data files
file_key, file_key2 = 'diabetes_train.csv', 'diabetes_test.csv'

## Handles to the two S3 objects
bucket_object, bucket_object2 = bucket.Object(file_key), bucket.Object(file_key2)

## Requesting the objects (training file first, then testing file)
file_object, file_object2 = bucket_object.get(), bucket_object2.get()

## Extracting the 'Body' entry of each response, which is what pandas reads from
file_content_stream, file_content_stream2 = file_object.get('Body'), file_object2.get('Body')

## Parsing the csv content
train = pd.read_csv(file_content_stream)
test = pd.read_csv(file_content_stream2)

## Showing the first rows of the training data
train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_012
0,0.0,1.0,1.0,30.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,30.0,5.0,0.0,0.0,6.0,6.0,8.0,2.0
1,1.0,1.0,1.0,30.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,11.0,4.0,7.0,0.0
2,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,6.0,7.0,0.0
3,1.0,1.0,1.0,24.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,4.0,0.0,1.0,7.0,5.0,7.0,0.0
4,0.0,0.0,1.0,35.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,1.0,7.0,4.0,1.0,0.0


In [55]:
## Defining the input and target variables
## 'Unnamed: 0' is the row index that was saved in the csv file (see the output of
## train.head()); it is not a predictor, so it is dropped to keep it out of the
## feature selection. errors = 'ignore' keeps this cell working if the column is absent
X = train.drop(columns = ['Diabetes_012', 'Unnamed: 0'], errors = 'ignore')
Y = train['Diabetes_012']

## Splitting the data into training (80%) and validation (20%) sets, stratified on the
## target; random_state is fixed so the same split is obtained on every run
X_training, X_validation, Y_training, Y_validation = train_test_split(X, Y, test_size = 0.2, stratify = Y, 
                                                                      random_state = 42)

In [57]:
## RFECV with DecisionTreeClassifier

## Defining the empty list to store results. It is created once, before the
## repetitions start, so that the support masks of every run are kept
## (re-creating it inside the loop would discard all but the last repetition)
variable_support = []

for i in range(0, 5):
    
    ## Fitting one RFECV model per class (one-vs-rest)
    for target_class in [0, 1, 2]:
        
        ## Defining the binary Y data for the current class
        Y_training_training = np.where(Y_training == target_class, 1, 0)
        
        ## Building the RFECV model
        tree_rfecv = RFECV(estimator = DecisionTreeClassifier(), step = 1, min_features_to_select = 2, 
                           cv = 3, scoring = 'f1').fit(X_training, Y_training_training)
        
        ## Appending results to list
        variable_support.append(tree_rfecv.support_)

In [59]:
## Percentage of the RFECV fits stored in variable_support (one row per fit)
## in which each variable was selected
support = pd.DataFrame(variable_support, columns = X_training.columns)
support = 100 * support.apply(np.sum, axis = 0) / support.shape[0]

## Displaying the results
support

In [None]:
## RFECV with RandomForestClassifier

## Defining the empty list to store results. It is created once, before the
## repetitions start, so that the support masks of every run are kept
## (re-creating it inside the loop would discard all but the last repetition)
variable_support = []

for i in range(0, 3):
    
    ## Fitting one RFECV model per class (one-vs-rest)
    for target_class in [0, 1, 2]:
        
        ## Defining the binary Y data for the current class
        Y_training_training = np.where(Y_training == target_class, 1, 0)
        
        ## Building the RFECV model
        rf_rfecv = RFECV(estimator = RandomForestClassifier(), step = 1, min_features_to_select = 2, 
                         cv = 3, scoring = 'f1').fit(X_training, Y_training_training)
        
        ## Appending results to list
        variable_support.append(rf_rfecv.support_)

In [None]:
## Percentage of the RFECV fits stored in variable_support (one row per fit)
## in which each variable was selected
support = pd.DataFrame(variable_support, columns = X_training.columns)
support = 100 * support.apply(np.sum, axis = 0) / support.shape[0]

## Displaying the results (the name was previously misspelled, which raised a NameError)
support

In [None]:
## RFECV with AdaBoostClassifier

## Defining the empty list to store results. It is created once, before the
## repetitions start, so that the support masks of every run are kept
## (re-creating it inside the loop would discard all but the last repetition)
variable_support = []

for i in range(0, 3):
    
    ## Fitting one RFECV model per class (one-vs-rest)
    for target_class in [0, 1, 2]:
        
        ## Defining the binary Y data for the current class
        Y_training_training = np.where(Y_training == target_class, 1, 0)
        
        ## Building the RFECV model. The same default AdaBoostClassifier is used for
        ## every class so the three selections are comparable; the class 0 model
        ## previously passed base_estimator = DecisionTreeClassifier(), a keyword
        ## that newer scikit-learn releases no longer accept
        ada_rfecv = RFECV(estimator = AdaBoostClassifier(), step = 1, min_features_to_select = 2, 
                          cv = 3, scoring = 'f1').fit(X_training, Y_training_training)
        
        ## Appending results to list
        variable_support.append(ada_rfecv.support_)

In [None]:
## Percentage of the RFECV fits stored in variable_support (one row per fit)
## in which each variable was selected
support = pd.DataFrame(variable_support, columns = X_training.columns)
support = 100 * support.apply(np.sum, axis = 0) / support.shape[0]

## Displaying the results (the name was previously misspelled, which raised a NameError)
support