## Feature Selection for diabetes multi-classifier

In [1]:
import boto3
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
## Letting pandas display up to 100 columns when printing a data-frame
pd.options.display.max_columns = 100

In [2]:
## Reading the training and testing csv files from S3 into pandas data-frames

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Keys of the two csv files stored in the bucket
file_key = 'diabetes_train.csv'
file_key2 = 'diabetes_test.csv'

## For each key: get the object and pass the 'Body' entry of the response to pandas
train = pd.read_csv(bucket.Object(file_key).get().get('Body'))
test = pd.read_csv(bucket.Object(file_key2).get().get('Body'))

## Printing the first five observations of the training data
train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_012
0,0.0,1.0,1.0,30.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,30.0,5.0,0.0,0.0,6.0,6.0,8.0,2.0
1,1.0,1.0,1.0,30.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,11.0,4.0,7.0,0.0
2,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,6.0,7.0,0.0
3,1.0,1.0,1.0,24.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,4.0,0.0,1.0,7.0,5.0,7.0,0.0
4,0.0,0.0,1.0,35.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,1.0,7.0,4.0,1.0,0.0


In [3]:
## BMI Categoricals (0/1 flags with cut-points at 18.5, 25 and 30)
bmi = train['BMI']
train['BMI_Underweight'] = (bmi < 18.5).astype(int)
train['BMI_Healthy'] = ((bmi >= 18.5) & (bmi < 25)).astype(int)
train['BMI_Overweight'] = ((bmi >= 25) & (bmi < 30)).astype(int)
train['BMI_Obese'] = (bmi >= 30).astype(int)

## Log(BMI)
train['Log_BMI'] = np.log(bmi)

## Creating dummy variables for Education, and Income
## (each source column is replaced by its dummies, then the numeric dummy labels are renamed)
education_labels = {1: 'Never_Attended', 2: 'Grades_1_8', 3: 'Grades_9_11', 4: 'GED', 5: 'College_1_3', 
                    6: 'College_4+'}
income_labels = {1: '<10,000', 2: '<15,000', 3: '<20,000', 4: '<25,000', 5: '<35,000', 
                 6: '<50,000',  7: '<75,000',  8: '75,000+'}

for source_column, labels in [('Education', education_labels), ('Income', income_labels)]:
    dummies = pd.get_dummies(train[source_column])
    train = pd.concat([train.drop(columns = [source_column]), dummies], axis = 1).rename(columns = labels)

## Other
## MentHlth / PhysHlth are coded 0 (<= 10), 1 (> 10 and <= 20) or 2 (everything else)
for source_column in ['MentHlth', 'PhysHlth']:
    values = train[source_column]
    train[source_column + '_cat'] = np.select([values <= 10, (values > 10) & (values <= 20)], [0, 1], default = 2)

## GenHlth is coded 0 (<= 2), 1 (> 3 and <= 5) or 2 (everything else)
## NOTE(review): values in (2, 3] match neither condition and therefore receive code 2, so the coding
## is not monotone in GenHlth -- confirm this is intended before changing it
gen_hlth = train['GenHlth']
train['GenHlth_cat'] = np.select([gen_hlth <= 2, (gen_hlth > 3) & (gen_hlth <= 5)], [0, 1], default = 2)

## Creating interactions of top variables (pairwise products, numbered in the order listed)
interaction_pairs = [('HighBP', 'GenHlth'), ('HighBP', 'GenHlth_cat'), ('HighBP', 'HighChol'), 
                     ('GenHlth', 'GenHlth_cat'), ('GenHlth', 'HighChol'), ('GenHlth_cat', 'HighChol')]

for number, (left, right) in enumerate(interaction_pairs, start = 1):
    train['Interaction_' + str(number)] = train[left] * train[right]

## Creating tree interactions (0/1 flags, one per combination of three threshold conditions)
i2_low = train['Interaction_2'] <= 0.5
i2_high = train['Interaction_2'] > 0.5
i5 = train['Interaction_5']
log_bmi = train['Log_BMI']

tree_rules = [i2_low & (i5 <= 1.5) & (train['Age'] <= 8.5), 
              i2_low & (i5 <= 1.5) & (train['Age'] > 8.5), 
              i2_low & (i5 > 1.5) & (log_bmi <= 3.384), 
              i2_low & (i5 > 1.5) & (log_bmi > 3.384), 
              i2_high & (i5 <= 3.5) & (train['BMI'] <= 30.5), 
              i2_high & (i5 <= 3.5) & (train['BMI'] > 30.5), 
              i2_high & (i5 > 3.5) & (log_bmi <= 3.481), 
              i2_high & (i5 > 3.5) & (log_bmi > 3.481)]

for number, rule in enumerate(tree_rules, start = 1):
    train['Tree_' + str(number)] = rule.astype(int)

## Printing the first five observations
train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Diabetes_012,BMI_Underweight,BMI_Healthy,BMI_Overweight,BMI_Obese,Log_BMI,Never_Attended,Grades_1_8,Grades_9_11,GED,College_1_3,College_4+,"<10,000","<15,000","<20,000","<25,000","<35,000","<50,000","<75,000","75,000+",MentHlth_cat,PhysHlth_cat,GenHlth_cat,Interaction_1,Interaction_2,Interaction_3,Interaction_4,Interaction_5,Interaction_6,Tree_1,Tree_2,Tree_3,Tree_4,Tree_5,Tree_6,Tree_7,Tree_8
0,0.0,1.0,1.0,30.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,30.0,5.0,0.0,0.0,6.0,2.0,0,0,0,1,3.401197,0,0,0,0,0,1,0,0,0,0,0,0,0,1,2,0,0,0.0,0.0,0.0,0.0,2.0,0.0,0,0,0,1,0,0,0,0
1,1.0,1.0,1.0,30.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,11.0,0.0,0,0,0,1,3.401197,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,2.0,0.0,1.0,0.0,2.0,0.0,0,0,0,1,0,0,0,0
2,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,0.0,0,0,1,0,3.258097,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0
3,1.0,1.0,1.0,24.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,4.0,0.0,1.0,7.0,0.0,0,1,0,0,3.178054,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,2,3.0,2.0,1.0,6.0,3.0,2.0,0,0,0,0,1,0,0,0
4,0.0,0.0,1.0,35.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,1.0,7.0,0.0,0,0,0,1,3.555348,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,4.0,0.0,0.0,1,0,0,0,0,0,0,0


In [None]:
## Defining the input and target variables
## ('Unnamed: 0' is the unnamed first column of the csv file -- a running row number in the rows 
## displayed above -- so it is left out of the inputs instead of being offered as a feature)
X = train.drop(columns = ['Diabetes_012']).drop(columns = ['Unnamed: 0'], errors = 'ignore')
Y = train['Diabetes_012']

## Splitting the data into training (80%) and validation (20%) sets, stratified on the target;
## random_state is fixed so that re-running the notebook gives the same split
X_training, X_validation, Y_training, Y_validation = train_test_split(X, Y, test_size = 0.2, stratify = Y, 
                                                                      random_state = 42)

In [None]:
## RFECV with DecisionTreeClassifier
## (one-vs-rest: for every repetition, one RFECV run per class of Diabetes_012)

## Defining empty list to store results
## (defined before the loop so that the results of every repetition are kept, not only the last one)
variable_support = []

for i in tqdm(range(0, 2)):
    
    for target_class in [0, 1, 2]:
        
        ## Defining the binary Y data for the current class (1 = current class, 0 = any other class)
        Y_training_training = np.where(Y_training == target_class, 1, 0)
        
        ## Building the RFECV model (step of 1, 3-fold cross-validation, scored with F1)
        tree_rfecv = RFECV(estimator = DecisionTreeClassifier(), step = 1, min_features_to_select = 2, 
                           cv = 3, scoring = 'f1', n_jobs = -1).fit(X_training, Y_training_training)
        
        ## Appending results to list (boolean mask of the selected variables)
        variable_support.append(tree_rfecv.support_)

In [None]:
## Percentage of the stored RFECV runs in which each variable was selected
support = pd.DataFrame(variable_support, columns = X_training.columns)
support2 = 100 * support.sum(axis = 0) / len(support)

## Two-column table: variable name and its selection percentage
support3 = support2.rename_axis('Variable').reset_index(name = 'Score')
support3

In [None]:
## RFECV with RandomForestClassifier
## (one-vs-rest: for every repetition, one RFECV run per class of Diabetes_012)

## Defining empty list to store results
## (defined before the loop so that the results of every repetition are kept, not only the last one)
variable_support = []

for i in tqdm(range(0, 10)):
    
    for target_class in [0, 1, 2]:
        
        ## Defining the binary Y data for the current class (1 = current class, 0 = any other class)
        Y_training_training = np.where(Y_training == target_class, 1, 0)
        
        ## Building the RFECV model (step of 1, 3-fold cross-validation, scored with F1)
        rf_rfecv = RFECV(estimator = RandomForestClassifier(), step = 1, min_features_to_select = 2, 
                         cv = 3, scoring = 'f1', n_jobs = -1).fit(X_training, Y_training_training)
        
        ## Appending results to list (boolean mask of the selected variables)
        variable_support.append(rf_rfecv.support_)

In [None]:
## RFECV with AdaBoostClassifier
## (one-vs-rest: for every repetition, one RFECV run per class of Diabetes_012)

## Defining empty list to store results
## (defined before the loop so that raising the number of repetitions keeps every repetition's results)
variable_support = []

for i in tqdm(range(0, 1)):
    
    for target_class in [0, 1, 2]:
        
        ## Defining the binary Y data for the current class (1 = current class, 0 = any other class)
        Y_training_training = np.where(Y_training == target_class, 1, 0)
        
        ## Building the RFECV model (step of 1, 3-fold cross-validation, scored with F1)
        ada_rfecv = RFECV(estimator = AdaBoostClassifier(), step = 1, min_features_to_select = 2, 
                          cv = 3, scoring = 'f1', n_jobs = -1).fit(X_training, Y_training_training)
        
        ## Appending results to list (boolean mask of the selected variables)
        variable_support.append(ada_rfecv.support_)

In [None]:
## RFECV with GradientBoostingClassifier
## (one-vs-rest: for every repetition, one RFECV run per class of Diabetes_012)

## Defining empty list to store results
## (defined before the loop so that raising the number of repetitions keeps every repetition's results)
variable_support = []

for i in tqdm(range(0, 1)):
    
    for target_class in [0, 1, 2]:
        
        ## Defining the binary Y data for the current class (1 = current class, 0 = any other class)
        Y_training_training = np.where(Y_training == target_class, 1, 0)
        
        ## Building the RFECV model (step of 1, 3-fold cross-validation, scored with F1)
        gb_rfecv = RFECV(estimator = GradientBoostingClassifier(), step = 1, min_features_to_select = 2, 
                         cv = 3, scoring = 'f1', n_jobs = -1).fit(X_training, Y_training_training)
        
        ## Appending results to list (boolean mask of the selected variables)
        variable_support.append(gb_rfecv.support_)

In [None]:
## RFECV with SVC
## (one-vs-rest: for every repetition, one RFECV run per class of Diabetes_012)

## Scaling the inputs with MinMaxScaler; every SVC run below is fitted on the scaled data
scaler = MinMaxScaler()

X_training_svm = scaler.fit_transform(X_training)

## Defining empty list to store results
## (defined before the loop so that raising the number of repetitions keeps every repetition's results)
variable_support = []

for i in tqdm(range(0, 1)):
    
    ## All three classes use the linear SVC on the scaled inputs, so that the summary of 
    ## variable_support describes a single estimator.
    ## NOTE(review): a linear SVC inside RFECV may take long on the full training set; shorten the 
    ## class list below if a partial run is wanted
    for target_class in [0, 1, 2]:
        
        ## Defining the binary Y data for the current class (1 = current class, 0 = any other class)
        Y_training_training = np.where(Y_training == target_class, 1, 0)
        
        ## Building the RFECV model (step of 1, 3-fold cross-validation, scored with F1)
        svc_rfecv = RFECV(estimator = SVC(kernel = 'linear'), step = 1, min_features_to_select = 2, 
                          cv = 3, scoring = 'f1', n_jobs = -1).fit(X_training_svm, Y_training_training)
        
        ## Appending results to list (boolean mask of the selected variables)
        variable_support.append(svc_rfecv.support_)

In [None]:
## Percentage of the stored RFECV runs in which each variable was selected
support = pd.DataFrame(variable_support, columns = X_training.columns)
selected_counts = support.sum(axis = 0)
support_final = 100 * selected_counts / len(support)
support_final