## Feature Engineering work

In [1]:
## Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', 100)


ImportError: cannot import name plot_tree

In [None]:
## Loading the training data from disk
## NOTE(review): this is an absolute, machine-specific path -- the notebook will only run
## on a machine with this exact folder layout; consider a path relative to the project.
train_path = '/Users/EvanCallaghan/Documents/Courses/Predictive Analytics/DATA-448-Project/Data/diabetes_train.csv'
train = pd.read_csv(train_path)

## Previewing the first five rows
train.head()

### Exploration

In [None]:
## Histogram of BMI (20 bins), labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.hist(train['BMI'], bins = 20)
ax.set(title = 'Distribution of BMI', xlabel = 'BMI', ylabel = 'Count')
plt.show()

In [None]:
## Histogram of GenHlth (5 bins), labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.hist(train['GenHlth'], bins = 5)
ax.set(title = 'Distribution of GenHlth', xlabel = 'GenHlth', ylabel = 'Count')
plt.show()

In [None]:
## Histogram of MentHlth (30 bins), labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.hist(train['MentHlth'], bins = 30)
ax.set(title = 'Distribution of MentHlth', xlabel = 'MentHlth', ylabel = 'Count')
plt.show()

In [None]:
## Histogram of PhysHlth (30 bins), labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.hist(train['PhysHlth'], bins = 30)
ax.set(title = 'Distribution of PhysHlth', xlabel = 'PhysHlth', ylabel = 'Count')
plt.show()

In [None]:
## Histogram of Age (8 bins, outlined bars), labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.hist(train['Age'], bins = 8, edgecolor = 'black')
ax.set(title = 'Distribution of Age', xlabel = 'Age', ylabel = 'Count')
plt.show()

In [None]:
## Histogram of Education (6 bins), labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.hist(train['Education'], bins = 6)
ax.set(title = 'Distribution of Education', xlabel = 'Education', ylabel = 'Count')
plt.show()

In [None]:
## Histogram of Income (8 bins), labelled so the figure can be read on its own
fig, ax = plt.subplots()
ax.hist(train['Income'], bins = 8)
ax.set(title = 'Distribution of Income', xlabel = 'Income', ylabel = 'Count')
plt.show()

### Feature Engineering

In [None]:
## BMI features: one 0/1 indicator per BMI band (cut-offs 18.5, 25, 30) plus log(BMI)

bmi = train['BMI']

## Each entry is (new column name, boolean mask of the rows falling in that band)
bmi_bands = [
    ('BMI_Underweight', bmi < 18.5),
    ('BMI_Healthy', (bmi >= 18.5) & (bmi < 25)),
    ('BMI_Overweight', (bmi >= 25) & (bmi < 30)),
    ('BMI_Obese', bmi >= 30),
]

for band_name, in_band in bmi_bands:
    train[band_name] = np.where(in_band, 1, 0)

## Natural log of BMI
train['Log_BMI'] = np.log(bmi)


## One-hot encoding Sex, Education and Income.
## Each source column is replaced by its dummy columns, which are relabelled from the
## numeric code to a readable name before being appended to the right of the frame.

dummy_specs = [
    ('Sex', {0: 'Female', 1: 'Male'}),
    ('Education', {1: 'Never_Attended', 2: 'Grades_1_8', 3: 'Grades_9_11', 4: 'GED',
                   5: 'College_1_3', 6: 'College_4+'}),
    ('Income', {1: '<10,000', 2: '<15,000', 3: '<20,000', 4: '<25,000', 5: '<35,000',
                6: '<50,000', 7: '<75,000', 8: '75,000+'}),
]

for source_col, code_labels in dummy_specs:
    indicators = pd.get_dummies(train[source_col]).rename(columns = code_labels)
    train = pd.concat([train.drop(columns = [source_col]), indicators], axis = 1)


## Binary flags combining existing 0/1 columns

eats_fruits = train['Fruits'] == 1
eats_veggies = train['Veggies'] == 1
no_fruits = train['Fruits'] == 0
no_veggies = train['Veggies'] == 0
heavy_alcohol = train['HvyAlcoholConsump'] == 1

## 1 when both Fruits and Veggies are 1
train['Fruits+Veggies'] = np.where(eats_fruits & eats_veggies, 1, 0)

## 1 when AnyHealthcare is 0 and NoDocbcCost is 0
## NOTE(review): if NoDocbcCost == 1 means "could not see a doctor because of cost", this
## flag probably wants NoDocbcCost == 1 (or an OR) -- confirm against the data dictionary.
no_healthcare = train['AnyHealthcare'] == 0
no_cost_flag = train['NoDocbcCost'] == 0
train['HealthCareIssues'] = np.where(no_healthcare & no_cost_flag, 1, 0)

## 1 when Fruits is 0, Veggies is 0 and HvyAlcoholConsump is 1
train['PoorDiet'] = np.where(no_fruits & no_veggies & heavy_alcohol, 1, 0)


## Collapsing the health measures into three ordered levels (0 = lowest values, 2 = highest)

## MentHlth: 0 for <= 10, 1 for (10, 20], 2 for > 20
train['MentHlth_cat'] = np.where(train['MentHlth'] <= 10, 0,
                                 np.where(train['MentHlth'] <= 20, 1, 2))

## PhysHlth: same cut-points as MentHlth
train['PhysHlth_cat'] = np.where(train['PhysHlth'] <= 10, 0,
                                 np.where(train['PhysHlth'] <= 20, 1, 2))

## GenHlth: 0 for <= 2, 1 for (2, 3], 2 for > 3.
## The earlier bounds (> 3 and <= 5 -> 1, everything else -> 2) sent GenHlth == 3 to level 2
## and GenHlth 4-5 to level 1, so the category was not ordered consistently with GenHlth.
## Level 0 is unchanged, so any rule that only separates level 0 from the rest is unaffected.
train['GenHlth_cat'] = np.where(train['GenHlth'] <= 2, 0,
                                np.where(train['GenHlth'] <= 3, 1, 2))

In [None]:
## Calculating variable importance scores
## A one-vs-rest random forest is fitted on several stratified train/test splits; the
## per-class feature importances of every fit are averaged into one ranking.

## Defining the input and target variables
X = train.drop(columns = ['Diabetes_012'])
Y = train['Diabetes_012']

## Number of repeated splits to average over
n_repeats = 25

## Defining a list to store one importance vector per fitted per-class forest
importance_rows = []

## Repeating process n_repeats times
for i in tqdm(range(n_repeats)):

    ## Splitting the data (seeded with the repeat index so the ranking is reproducible,
    ## while each repeat still uses a different split)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y,
                                                        random_state = i)

    ## Building the model
    OneVsRest_md = OneVsRestClassifier(estimator = RandomForestClassifier(max_depth = 3,
                                                                          n_estimators = 500,
                                                                          random_state = i)).fit(X_train, Y_train)

    ## Appending the feature importance scores of each per-class model
    importance_rows.extend(md.feature_importances_ for md in OneVsRest_md.estimators_)

## Averaging the scores of every fitted model, feature by feature
mean_importance = pd.DataFrame(importance_rows, columns = X.columns).mean(axis = 0)

## Sorting variables by importance
results = pd.DataFrame({'Feature': mean_importance.index,
                        'Importance': mean_importance.values}).sort_values(by = 'Importance', ascending = False)

## Printing the 10 most important variables
results.head(10)

In [None]:
## Creating interactions: element-wise products of pairs of columns,
## stored as Interaction_1 ... Interaction_6 in the order listed below

interaction_pairs = [
    ('HighBP', 'GenHlth'),
    ('HighBP', 'GenHlth_cat'),
    ('HighBP', 'HighChol'),
    ('GenHlth', 'GenHlth_cat'),
    ('GenHlth', 'HighChol'),
    ('GenHlth_cat', 'HighChol'),
]

for k, (left_col, right_col) in enumerate(interaction_pairs, start = 1):
    train['Interaction_' + str(k)] = train[left_col] * train[right_col]

In [None]:
## Creating a plot tree to engineer more interactions

## Defining the input and target variables
X = train.drop(columns = ['Diabetes_012'])
Y = train['Diabetes_012']

## Splitting the data (seeded: the next cell hard-codes split thresholds read off this
## tree, so the fitted tree has to be the same on every run)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, stratify = Y,
                                                    random_state = 0)

## Building a decision tree model with max depth = 3 on the train data-frame
tree_md = DecisionTreeClassifier(max_depth = 3, random_state = 0).fit(X_train, Y_train)

## Visualizing the decision tree model and identify any interesting interactions/features
fig = plt.figure(figsize = (20, 20))

## feature_names is passed as a plain list because some scikit-learn releases reject a
## pandas Index here; the trailing semicolon hides the list of annotation objects
plot_tree(tree_md, feature_names = list(X.columns), filled = True);

In [None]:
## Creating tree interactions: Tree_1 ... Tree_8 are 0/1 indicators, each the conjunction
## of three threshold conditions.
## NOTE(review): the thresholds look like they were read off the depth-3 tree plotted in
## the previous cell -- confirm they still match if that tree is refitted.

i2 = train['Interaction_2']
i5 = train['Interaction_5']
log_bmi = train['Log_BMI']

## Each entry is (new column name, boolean mask of the rows satisfying the rule)
leaf_rules = [
    ('Tree_1', (i2 <= 0.5) & (i5 <= 1.5) & (train['Age'] <= 8.5)),
    ('Tree_2', (i2 <= 0.5) & (i5 <= 1.5) & (train['Age'] > 8.5)),
    ('Tree_3', (i2 <= 0.5) & (i5 > 1.5) & (log_bmi <= 3.384)),
    ('Tree_4', (i2 <= 0.5) & (i5 > 1.5) & (log_bmi > 3.384)),
    ('Tree_5', (i2 > 0.5) & (i5 <= 3.5) & (train['BMI'] <= 30.5)),
    ('Tree_6', (i2 > 0.5) & (i5 <= 3.5) & (train['BMI'] > 30.5)),
    ('Tree_7', (i2 > 0.5) & (i5 > 3.5) & (log_bmi <= 3.481)),
    ('Tree_8', (i2 > 0.5) & (i5 > 3.5) & (log_bmi > 3.481)),
]

for leaf_name, in_leaf in leaf_rules:
    train[leaf_name] = np.where(in_leaf, 1, 0)

In [None]:
## Previewing the first five rows of train, including the engineered columns added above
train.head()

In [None]:
## Displaying the (rows, columns) dimensions of train
train.shape