In this notebook the entire project from a-z (preprocessing, model training, prediction) is included. 


## import libraries

In [103]:
import pandas as pd 

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
import time


import warnings
warnings.filterwarnings('ignore')

## load dataset

In [52]:
path_to_dataset = 'data/diabetes_data.csv'
# load csv file to dataframe
dataframe = pd.read_csv(path_to_dataset)
# make a copy of dataset
df = dataframe.copy()

## preprocess dataset

In [53]:
# change the column name of target variable
df.rename(columns = {'Diabetes_012': 'Diabetes'}, inplace=True)

In [54]:
## find most correlated features
df_corr = df.corr()
df_diabetes_corr = df_corr.iloc[0, :]
df_diabetes_corr.sort_values(ascending=False) # sort most correlated features
most_correlated_features = df_diabetes_corr[df_diabetes_corr > 0.2] # select features correlation greater than 0.2
most_correlated_features.sort_values(ascending=False)

Diabetes    1.000000
GenHlth     0.302587
HighBP      0.271596
BMI         0.224379
DiffWalk    0.224239
HighChol    0.209085
Name: Diabetes, dtype: float64

In [56]:
# drop least correlated features 
df = df.drop(
    ['CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 
    'AnyHealthcare', 'NoDocbcCost', 'MentHlth', 'PhysHlth', 'Sex', 
    'Age', 'Education', 'Income'], axis=1)

In [57]:
df.head()

Unnamed: 0,Diabetes,HighBP,HighChol,BMI,GenHlth,DiffWalk
0,0.0,1.0,1.0,40.0,5.0,1.0
1,0.0,0.0,0.0,25.0,3.0,0.0
2,0.0,1.0,1.0,28.0,5.0,1.0
3,0.0,1.0,0.0,27.0,2.0,0.0
4,0.0,1.0,1.0,24.0,2.0,0.0


### change data type

In [58]:
# convert float64 to type int   
df['Diabetes'] = df['Diabetes'].astype('int')
df['HighBP'] = df['HighBP'].astype('int')
df['HighChol'] = df['HighChol'].astype('int')
df['BMI'] = df['BMI'].astype('int')
df['GenHlth'] = df['GenHlth'].astype('int')
df['DiffWalk'] = df['DiffWalk'].astype('int')

### handle class imbalanced

In [59]:
# replace class 2 with class 1
df.Diabetes.replace([2.0], [1.0], inplace=True)

In [60]:
# seperate features from labels
X = df.drop(['Diabetes'], axis=1)
y = df['Diabetes']

In [61]:
# shape & class distribution before resampling
X.shape, y.shape, y.value_counts()

((253680, 5),
 (253680,),
 0    213703
 1     39977
 Name: Diabetes, dtype: int64)

In [62]:
# oversampling the dataset using SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [63]:
# shape & class distribution after resampling
X.shape, y.shape, y.value_counts()

((427406, 5),
 (427406,),
 0    213703
 1    213703
 Name: Diabetes, dtype: int64)

### split dataset to train set, test set

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### feature engineering

In [65]:
# transform continous features
continuous_features = ['BMI']
scaler_1 = StandardScaler()
X_train[continuous_features] = scaler_1.fit_transform(X_train[continuous_features])

X_test[continuous_features] = scaler_1.transform(X_test[continuous_features])

In [68]:
X_test.head()

Unnamed: 0,HighBP,HighChol,BMI,GenHlth,DiffWalk
361136,0,0,1.017289,2,1
129670,0,0,-0.670829,1,0
152550,1,1,-0.670829,2,0
296195,1,0,0.032554,2,1
359410,1,1,0.17323,1,1


In [71]:
# transform categorical features
encoder = OneHotEncoder()
cat_cols  = ['GenHlth']
cont_cols = ['HighBP', 'HighChol', 'BMI', 'GenHlth']

transformer = ColumnTransformer([
('onehot', encoder, cat_cols ), # apply one-hot encoding to cat_cols
('passthrough', 'passthrough', cont_cols) ]) # do nothing to cont_cols 

transformer.fit(X_train)
X_train_transformed = transformer.transform(X_train)
X_test_transformed = transformer.transform(X_test)

## train the model

In [84]:
## Random forest model ##
# set hyperparameters of the model
N_ESTIMATOR_rf = 10
MAX_FEATURES_rf = 'log2'
model_rf = RandomForestClassifier(
    n_estimators=N_ESTIMATOR_rf, max_features=MAX_FEATURES_rf, random_state=7)

In [86]:
# train the model
model_rf.fit(X_train_transformed, y_train) 

In [88]:
y_pred = model_rf.predict(X_test_transformed) # predict true labels
accuracy = accuracy_score(y_test, y_pred) # calculate accuracy of the model
print(accuracy) # print accuracy

0.7312161719517711


## predict on user data

In [91]:
# read users input data
user_df = pd.read_csv('data/user_input_data.csv')


### ################### ###
### transform user data ###
### ################### ###

# convert float64 to type int   
user_df['HighBP'] = user_df['HighBP'].astype('int')
user_df['HighChol'] = user_df['HighChol'].astype('int')
user_df['BMI'] = user_df['BMI'].astype('int')
user_df['GenHlth'] = user_df['GenHlth'].astype('int')
user_df['DiffWalk'] = user_df['DiffWalk'].astype('int')

# transform continuous feature
user_df[continuous_features] = scaler_1.transform(user_df[continuous_features])

user_df_transformed = transformer.transform(user_df)

In [106]:
# predict user results
start = time.time() + 1  # start time of prediction
user_preds = model_rf.predict(user_df_transformed)
end = time.time() + 1 # end time of prediction
duration = end - start # duration of prediction
print('time to predic 3 samples:', duration)

time to predic 3 samples: 0.0029926300048828125


In [108]:
user_true_data = pd.read_csv('data/user_output_data.csv') # read user true labels
display('True users results: ', user_true_data.head()) 
print('predicted users results: ', user_preds)


'True users results: '

Unnamed: 0,1,0,1.1


predicted users results:  [1 0 0]


The model accuracy is about 71%. (However when the less correlated features were not eliminated (using random oversampling) this score was about 91%) 

Testing the model for 3 real people obtains 2/3 ~ about 66% accuracy which is close to model's accuracy. In order to have a have a more accurate saying on model's performance, the model must be tested on a larger population to obtain more accurate results.  