In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import imblearn
from imblearn.under_sampling import NearMiss
from sklearn.linear_model import LogisticRegression, SGDClassifier, LassoCV, SGDClassifier, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, f_regression, f_classif, chi2, SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from xgboost import XGBClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', 25)






In [310]:
pip install --user --upgrade pip

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 19.1 MB/s eta 0:00:00
Installing collected packages: pip
Successfully installed pip-23.3.1
Note: you may need to restart the kernel to use updated packages.




# Creating Cleaned and Balanced Dataset

In [2]:
##Recreating the cleaned Dataset
df = pd.read_csv('diabetes.csv')
undersample = NearMiss(version=1)
X = df.loc[:, df.columns != 'Diabetes_binary']
y = df.loc[:, df.columns == 'Diabetes_binary']
X, y = undersample.fit_resample(X, y)

print(X.info())
print(y.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   HighBP                70692 non-null  float64
 1   HighChol              70692 non-null  float64
 2   CholCheck             70692 non-null  float64
 3   BMI                   70692 non-null  float64
 4   Smoker                70692 non-null  float64
 5   Stroke                70692 non-null  float64
 6   HeartDiseaseorAttack  70692 non-null  float64
 7   PhysActivity          70692 non-null  float64
 8   Fruits                70692 non-null  float64
 9   Veggies               70692 non-null  float64
 10  HvyAlcoholConsump     70692 non-null  float64
 11  AnyHealthcare         70692 non-null  float64
 12  NoDocbcCost           70692 non-null  float64
 13  GenHlth               70692 non-null  float64
 14  MentHlth              70692 non-null  float64
 15  PhysHlth           

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
df_undersampled_train = pd.DataFrame(X_train_scaled, columns = X.columns)
df_undersampled_train['Diabetes_binary'] = y_train
df_undersampled_train

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,-1.212894,-1.140353,0.074482,-1.221972,1.158253,-0.225623,-0.384172,0.514775,0.669234,0.418265,-0.139079,0.147309,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,-1.036097,-0.514634,0.864409,0.356032,0.0
1,0.824475,0.876922,0.074482,0.697964,1.158253,-0.225623,2.603001,-1.942597,0.669234,-2.390830,-0.139079,0.147309,-0.237966,1.255723,-0.332699,-0.446002,2.060543,-1.036097,0.367769,-1.162526,-2.571665,0.0
2,-1.212894,-1.140353,0.074482,-1.061978,1.158253,-0.225623,-0.384172,0.514775,-1.494245,0.418265,-0.139079,0.147309,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,0.965161,-0.073432,-0.149059,0.843982,0.0
3,0.824475,-1.140353,0.074482,-0.581994,-0.863369,-0.225623,-0.384172,0.514775,0.669234,0.418265,-0.139079,0.147309,-0.237966,-1.587783,-0.332699,-0.446002,-0.485309,-1.036097,0.808970,-0.149059,0.843982,0.0
4,0.824475,-1.140353,0.074482,1.177948,1.158253,-0.225623,-0.384172,-1.942597,0.669234,-2.390830,-0.139079,0.147309,-0.237966,-0.639948,2.664163,0.116531,-0.485309,-1.036097,-0.955835,-1.162526,-0.131917,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49479,0.824475,-1.140353,0.074482,-0.262004,1.158253,-0.225623,-0.384172,-1.942597,0.669234,0.418265,-0.139079,0.147309,-0.237966,0.307887,-0.332699,-0.446002,-0.485309,0.965161,0.808970,-1.162526,-0.619867,1.0
49480,-1.212894,0.876922,0.074482,-0.262004,-0.863369,-0.225623,-0.384172,0.514775,-1.494245,0.418265,-0.139079,0.147309,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,0.965161,-0.955835,0.864409,0.843982,1.0
49481,0.824475,0.876922,0.074482,-1.061978,-0.863369,4.432171,-0.384172,0.514775,0.669234,0.418265,-0.139079,0.147309,-0.237966,-0.639948,-0.332699,-0.220989,-0.485309,-1.036097,0.808970,0.864409,-1.107816,1.0
49482,-1.212894,-1.140353,0.074482,0.057985,-0.863369,-0.225623,-0.384172,0.514775,0.669234,0.418265,-0.139079,0.147309,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,0.965161,-0.073432,0.864409,0.843982,1.0


In [5]:
df_undersampled_test = pd.DataFrame(X_test_scaled, columns = X.columns)
df_undersampled_test['Diabetes_binary'] = y_test
df_undersampled_test

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,-1.212894,0.876922,0.074482,-1.061978,1.158253,-0.225623,-0.384172,0.514775,-1.494245,0.418265,-0.139079,0.147309,-0.237966,1.255723,-0.332699,-0.446002,-0.485309,-1.036097,0.367769,-1.162526,-0.619867,
1,-1.212894,-1.140353,0.074482,0.377975,1.158253,-0.225623,-0.384172,0.514775,0.669234,0.418265,-0.139079,0.147309,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,0.965161,-2.279439,0.864409,0.843982,
2,0.824475,0.876922,0.074482,1.017954,-0.863369,-0.225623,-0.384172,0.514775,0.669234,0.418265,-0.139079,0.147309,-0.237966,0.307887,-0.332699,-0.220989,-0.485309,0.965161,-0.073432,0.864409,0.843982,
3,-1.212894,0.876922,0.074482,0.377975,-0.863369,-0.225623,-0.384172,0.514775,0.669234,0.418265,-0.139079,0.147309,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,-1.036097,-0.514634,0.864409,0.356032,
4,0.824475,-1.140353,0.074482,2.777896,-0.863369,-0.225623,-0.384172,0.514775,0.669234,0.418265,-0.139079,0.147309,-0.237966,1.255723,4.162593,0.679064,2.060543,0.965161,-1.397036,0.864409,-1.595766,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21203,0.824475,0.876922,0.074482,-0.262004,1.158253,-0.225623,-0.384172,0.514775,0.669234,0.418265,-0.139079,0.147309,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,0.965161,0.367769,0.864409,0.843982,
21204,-1.212894,-1.140353,0.074482,4.857827,-0.863369,-0.225623,-0.384172,-1.942597,0.669234,0.418265,-0.139079,0.147309,-0.237966,1.255723,-0.332699,-0.446002,2.060543,-1.036097,0.367769,0.864409,-1.107816,0.0
21205,-1.212894,-1.140353,0.074482,-1.221972,-0.863369,-0.225623,-0.384172,0.514775,-1.494245,0.418265,-0.139079,0.147309,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,-1.036097,-0.073432,0.864409,0.843982,
21206,-1.212894,0.876922,0.074482,-0.581994,1.158253,-0.225623,-0.384172,0.514775,0.669234,0.418265,7.190175,0.147309,-0.237966,2.203558,4.162593,0.454051,2.060543,0.965161,-1.838238,-0.149059,-2.571665,


In [6]:
lasso = LassoCV(cv=5, random_state=0).fit(X_train_scaled, y_train)

coef = lasso.coef_
col = X.columns
for index in range(len(coef)):
    if coef[index] > 0.01:
        print(f'{col[index]}: {np.round(coef[index], 3)}')

  y = column_or_1d(y, warn=True)


HighBP: 0.031
HighChol: 0.013
BMI: 0.065
Smoker: 0.016
Stroke: 0.015
HeartDiseaseorAttack: 0.029
HvyAlcoholConsump: 0.017
NoDocbcCost: 0.015
GenHlth: 0.11
MentHlth: 0.016
PhysHlth: 0.024
DiffWalk: 0.03


In [7]:
corr = df_undersampled_train.corr()
corr

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
HighBP,1.0,0.290281,0.01919,0.252404,0.132512,0.119859,0.192153,-0.185341,-0.103985,-0.123597,0.022629,-0.032929,0.077367,0.322079,0.124688,0.188203,0.225602,-0.011667,0.275838,-0.227568,-0.28253,-0.012546
HighChol,0.290281,1.0,0.012837,0.12913,0.12524,0.081647,0.160192,-0.123335,-0.089436,-0.084961,0.02874,-0.023433,0.065678,0.223396,0.117957,0.139995,0.147227,-0.00769,0.163192,-0.13482,-0.163382,-0.01103
CholCheck,0.01919,0.012837,1.0,-0.004101,-0.00519,-0.000986,-0.003985,0.019316,0.015673,0.010999,-0.005638,0.079857,-0.059807,-0.020087,-0.027875,-0.014818,-0.009001,-0.010735,0.022321,0.006402,0.031126,-0.002274
BMI,0.252404,0.12913,-0.004101,1.0,0.063508,0.057133,0.098065,-0.255187,-0.158303,-0.123689,0.000244,-0.064963,0.129655,0.345232,0.201497,0.255436,0.316306,-0.04402,-0.112717,-0.201685,-0.242094,-0.004097
Smoker,0.132512,0.12524,-0.00519,0.063508,1.0,0.072942,0.144389,-0.102905,-0.102695,-0.060063,0.066169,-0.028095,0.042161,0.181521,0.108281,0.140699,0.140045,0.115278,0.145365,-0.17196,-0.152527,-0.006953
Stroke,0.119859,0.081647,-0.000986,0.057133,0.072942,1.0,0.233298,-0.126957,-0.046131,-0.084637,-0.008613,-0.028131,0.077127,0.223366,0.141691,0.210303,0.236295,-0.019964,0.083517,-0.120345,-0.194099,-0.002408
HeartDiseaseorAttack,0.192153,0.160192,-0.003985,0.098065,0.144389,0.233298,1.0,-0.147135,-0.063526,-0.085198,-0.008301,-0.021283,0.083611,0.311562,0.140186,0.246269,0.271997,0.073961,0.173327,-0.153377,-0.216515,-0.004789
PhysActivity,-0.185341,-0.123335,0.019316,-0.255187,-0.102905,-0.126957,-0.147135,1.0,0.165934,0.204442,-0.010663,0.072273,-0.120715,-0.372282,-0.224465,-0.333044,-0.363598,0.081863,-0.080338,0.273416,0.307727,0.002891
Fruits,-0.103985,-0.089436,0.015673,-0.158303,-0.102695,-0.046131,-0.063526,0.165934,1.0,0.245612,-0.027089,0.046059,-0.07079,-0.189348,-0.10377,-0.113491,-0.116434,-0.09609,0.042817,0.145128,0.140828,0.006887
Veggies,-0.123597,-0.084961,0.010999,-0.123689,-0.060063,-0.084637,-0.085198,0.204442,0.245612,1.0,0.002835,0.057291,-0.088002,-0.214869,-0.119102,-0.151267,-0.165544,-0.027881,-0.029046,0.214979,0.238881,-0.001889


In [8]:
corr_target = abs(corr["Diabetes_binary"])
relevant_features = corr_target[corr_target>0.006]
relevant_features

HighBP             0.012546
HighChol           0.011030
Smoker             0.006953
Fruits             0.006887
AnyHealthcare      0.007229
MentHlth           0.011795
DiffWalk           0.012193
Sex                0.012702
Income             0.008735
Diabetes_binary    1.000000
Name: Diabetes_binary, dtype: float64

# Creating new DF with selected variables

In [9]:
X_selected_train = df_undersampled_train.loc[:, ['Sex', 'HighBP', 'DiffWalk', 'MentHlth', 'HighChol', 'AnyHealthcare', 'Smoker',
                       'Fruits','Income']]
print(X_selected_train.info())

X_selected_test = df_undersampled_test.loc[:, ['Sex', 'HighBP', 'DiffWalk', 'MentHlth', 'HighChol', 'AnyHealthcare', 'Smoker',
                       'Fruits','Income']]
print(X_selected_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49484 entries, 0 to 49483
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sex            49484 non-null  float64
 1   HighBP         49484 non-null  float64
 2   DiffWalk       49484 non-null  float64
 3   MentHlth       49484 non-null  float64
 4   HighChol       49484 non-null  float64
 5   AnyHealthcare  49484 non-null  float64
 6   Smoker         49484 non-null  float64
 7   Fruits         49484 non-null  float64
 8   Income         49484 non-null  float64
dtypes: float64(9)
memory usage: 3.4 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21208 entries, 0 to 21207
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sex            21208 non-null  float64
 1   HighBP         21208 non-null  float64
 2   DiffWalk       21208 non-null  float64
 3   MentHlth       21208 non-null 

# Perceptron

In [223]:
gs_linear = GridSearchCV(estimator = Perceptron(),
                       param_grid = {'penalty': ['l2', 'l1', 'elasticnet', 'None'],
                                     'alpha': [0.0001, 0.001, 0.01, 1, 2],
                                    'early_stopping': [True, False],
                                    'random_state': [42]},
                       cv = 5,
                       scoring='accuracy',
                        verbose = 3,
                        n_jobs = -1)
gs_linear.fit(X_selected_train, y_train.values.ravel())

Fitting 5 folds for each of 40 candidates, totalling 200 fits


GridSearchCV(cv=5, estimator=Perceptron(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 1, 2],
                         'early_stopping': [True, False],
                         'penalty': ['l2', 'l1', 'elasticnet', 'None'],
                         'random_state': [42]},
             scoring='accuracy', verbose=3)

In [224]:
print(gs_linear.best_params_)
print(gs_linear.best_score_)

{'alpha': 0.0001, 'early_stopping': False, 'penalty': 'None', 'random_state': 42}
0.7810608891632318


# KNN

In [150]:
gs = GridSearchCV(estimator=KNeighborsClassifier(),
                 param_grid = {'n_neighbors': range(3, 10, 1),
                              'weights': ['uniform', 'distance'],
                              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                              'p': [1, 2],
                              },
                 cv=5,
                 scoring='accuracy',
                 verbose = 3,
                 n_jobs = -1)
gs.fit(X_selected_train, y_train.values.ravel())

Fitting 5 folds for each of 112 candidates, totalling 560 fits


  return self._fit(X, y)


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': range(3, 10), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy', verbose=3)

In [151]:
print(gs.best_params_)
print(gs.best_score_)

{'algorithm': 'ball_tree', 'n_neighbors': 6, 'p': 2, 'weights': 'distance'}
0.8514267754149062


# Random Forest

In [232]:
random_gs = GridSearchCV(estimator=RandomForestClassifier(),
                 param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
                              'min_samples_split': range(2, 22, 5),
                              'min_samples_leaf': range(2, 22, 5),
                              'max_features': ['sqrt', 'log2', 'None'],
                              'random_state': [42],
                              'max_depth': range(5, 30, 5)},
                 cv=5,
                 scoring='accuracy',
                 verbose = 3,
                 n_jobs = -1)
random_gs.fit(X_selected_train, y_train.values.ravel())

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


2000 fits failed out of a total of 3600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1200 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Felipe\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Felipe\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\Felipe\anaconda3\lib\site-packages\joblib\parallel.py", line 1863, in __call__
    return output if self.return_generator else list(output)
  File "C:\Users\Felipe\anaconda3\lib\site-packages\joblib\parallel.py", line 1792, in _get_sequential_output
    res = func(*args,

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': range(5, 30, 5),
                         'max_features': ['sqrt', 'log2', 'None'],
                         'min_samples_leaf': range(2, 22, 5),
                         'min_samples_split': range(2, 22, 5),
                         'random_state': [42]},
             scoring='accuracy', verbose=3)

rf = RandomForestClassifier()
random_gs = GridSearchCV(estimator = rf,
                 param_grid= {'criterion': ['gini', 'entropy', 'log_loss'],
                             'max_depth': range(5, 30, 5),
                             'min_samples_split': range(2, 20, 2),
                             'random_state': [42]},
                 cv = 5,
                 scoring='accuracy',
                        verbose = 3,
                        n_jobs = -1)
random_gs.fit(X_selected_train, y_train.values.ravel())

In [234]:
print(random_gs.best_params_)
print(random_gs.best_score_)

{'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 17, 'random_state': 42}
0.813717613161726


# SVM

In [235]:
svc_gs = GridSearchCV(estimator = SVC(),
                 param_grid = {'C': [0.1, 1, 5, 10, 15, 20],
                              'kernel': ['linear', 'poly', 'rbf'],
                              'degree': [3, 6, 9],
                              'gamma': ['scale', 'auto']},
                 cv = 5,
                 scoring = 'accuracy',
                     verbose = 3,
                     n_jobs = -1)
svc_gs.fit(X_selected_train, y_train.values.ravel())

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 15, 20], 'degree': [3, 6, 9],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'poly', 'rbf']},
             scoring='accuracy', verbose=3)

In [236]:
print(svc_gs.best_params_)
print(svc_gs.best_score_)

{'C': 5, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
0.8129496648952277


# Logistic Regression

In [228]:
log_gs = GridSearchCV(estimator = LogisticRegression(),
                     param_grid = {'penalty': ['l1', 'l2', 'elasticnet', 'None'],
                                  'C': range(1, 10, 1),
                                  'random_state': [42],
                                  'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']},
                     cv=5,
                     scoring = 'accuracy',
                     verbose = 3,
                     n_jobs = -1)
log_gs.fit(X_selected_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


765 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Felipe\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Felipe\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Felipe\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

----------------------

GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': range(1, 10),
                         'penalty': ['l1', 'l2', 'elasticnet', 'None'],
                         'random_state': [42],
                         'solver': ['lbfgs', 'liblinear', 'newton-cg',
                                    'newton-cholesky', 'sag', 'saga']},
             scoring='accuracy', verbose=3)

In [229]:
print(log_gs.best_params_)
print(log_gs.best_score_)

{'C': 3, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
0.812222230934976


# Testing For Best Model

In [296]:
knn = KNeighborsClassifier(algorithm = 'ball_tree', n_neighbors = 9, p = 2, weights = 'uniform')
knn.fit(X_selected_train, y_train.values.ravel())
knn_train_pred = knn.predict(X_selected_train)
knn_train_score = accuracy_score(y_train, knn_train_pred)
knn_test_pred = knn.predict(X_selected_test)
knn_test_score = accuracy_score(y_test, knn_test_pred)

print(f'KNN Train Accuracy: {np.round(knn_train_score, 3)} & Test Accuracy: {np.round(knn_test_score, 3)}')

rf = RandomForestClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 18, random_state=42)
rf.fit(X_selected_train, y_train.values.ravel())
rf_train_pred = rf.predict(X_selected_train)
rf_train_score = accuracy_score(y_train, rf_train_pred)
rf_test_pred = rf.predict(X_selected_test)
rf_test_score = accuracy_score(y_test, rf_test_pred)

print(f'Random Forest Tree Train Accuracy: {np.round(rf_train_score, 3)} & Test Accuracy: {np.round(rf_test_score, 3)}')

line = Perceptron(alpha = 0.0001, early_stopping = False, penalty = None, random_state=42)
line.fit(X_selected_train, y_train.values.ravel())
line_train_pred = line.predict(X_selected_train)
line_train_score = accuracy_score(y_train, line_train_pred)
line_test_pred = line.predict(X_selected_test)
line_test_score = accuracy_score(y_test, line_test_pred)

print(f'Linear Classifier/Perceptron Train Accuracy: {np.round(line_train_score, 3)} & Test Accuracy: {np.round(line_test_score, 3)}')

svm = SVC(C = 5, degree = 3, gamma = 'scale', kernel = 'rbf')
svm.fit(X_selected_train, y_train.values.ravel())
svm_train_pred = svm.predict(X_selected_train)
svm_train_score = accuracy_score(y_train, svm_train_pred)
svm_test_pred = svm.predict(X_selected_test)
svm_test_score = accuracy_score(y_test, svm_test_pred)

print(f'SVM Train Accuracy: {np.round(svm_train_score, 3)} & Test Accuracy: {np.round(svm_test_score, 3)}')

log = LogisticRegression(C = 1, penalty = 'l1', random_state=42, solver = 'saga')
log.fit(X_selected_train, y_train.values.ravel())
log_train_pred = log.predict(X_selected_train)
log_train_score = accuracy_score(y_train, log_train_pred)
log_test_pred = log.predict(X_selected_test)
log_test_score = accuracy_score(y_test, log_test_pred)

print(f'Logistic Regression Train Accuracy: {np.round(log_train_score, 3)} & Test Accuracy: {np.round(log_test_score, 3)}')

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


KNN Train Accuracy: 0.793 & Test Accuracy: 0.79
Random Forest Tree Train Accuracy: 0.816 & Test Accuracy: 0.816
Linear Classifier/Perceptron Train Accuracy: 0.695 & Test Accuracy: 0.697
SVM Train Accuracy: 0.814 & Test Accuracy: 0.816
Logistic Regression Train Accuracy: 0.812 & Test Accuracy: 0.813


In [242]:
knn_conf_train = confusion_matrix(y_train, knn_train_pred)
knn_conf_test = confusion_matrix(y_test, knn_test_pred)

log_conf_train = confusion_matrix(y_train, log_train_pred)
log_conf_test = confusion_matrix(y_test, log_test_pred)

svm_conf_train = confusion_matrix(y_train, svm_train_pred)
svm_conf_test = confusion_matrix(y_test, svm_test_pred)

line_conf_train = confusion_matrix(y_train, line_train_pred)
line_conf_test = confusion_matrix(y_test, line_test_pred)

rf_conf_train = confusion_matrix(y_train, rf_train_pred)
rf_conf_test = confusion_matrix(y_test, rf_test_pred)

# KNN Accuracy and Metrics

In [243]:
print(f'KNN Train Accuracy: {np.round(knn_train_score, 3)} & Test Accuracy: {np.round(knn_test_score, 3)}\n')

print(f'Train Confusion Matrix:\n{knn_conf_train}\n\nTest Confusion Matrix:\n{knn_conf_test}\n')
print(f'Train Classification Report:\n{classification_report(y_train, knn_train_pred)}\nTest Classification Report:\n{classification_report(y_test, knn_test_pred)}')

KNN Train Accuracy: 0.793 & Test Accuracy: 0.79

Train Confusion Matrix:
[[20488  4257]
 [ 5980 18759]]

Test Confusion Matrix:
[[8692 1909]
 [2548 8059]]

Train Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.83      0.80     24745
         1.0       0.82      0.76      0.79     24739

    accuracy                           0.79     49484
   macro avg       0.79      0.79      0.79     49484
weighted avg       0.79      0.79      0.79     49484

Test Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.82      0.80     10601
         1.0       0.81      0.76      0.78     10607

    accuracy                           0.79     21208
   macro avg       0.79      0.79      0.79     21208
weighted avg       0.79      0.79      0.79     21208



# Logistic Regression Accuracy and Metrics

In [244]:
print(f'Logisitc Regression Train Accuracy: {np.round(log_train_score, 3)} & Test Accuracy: {np.round(log_test_score, 3)}\n')

print(f'Train Confusion Matrix:\n{log_conf_train}\n\nTest Confusion Matrix:\n{log_conf_test}\n')
print(f'Train Classification Report:\n{classification_report(y_train, log_train_pred)}\nTest Classification Report:\n{classification_report(y_test, log_test_pred)}')

Logisitc Regression Train Accuracy: 0.812 & Test Accuracy: 0.813

Train Confusion Matrix:
[[21878  2867]
 [ 6428 18311]]

Test Confusion Matrix:
[[9359 1242]
 [2729 7878]]

Train Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.88      0.82     24745
         1.0       0.86      0.74      0.80     24739

    accuracy                           0.81     49484
   macro avg       0.82      0.81      0.81     49484
weighted avg       0.82      0.81      0.81     49484

Test Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.88      0.82     10601
         1.0       0.86      0.74      0.80     10607

    accuracy                           0.81     21208
   macro avg       0.82      0.81      0.81     21208
weighted avg       0.82      0.81      0.81     21208



# SVM Accuracy and Metrics

In [245]:
print(f'SVM Train Accuracy: {np.round(svm_train_score, 3)} & Test Accuracy: {np.round(svm_test_score, 3)}\n')

print(f'Train Confusion Matrix:\n{svm_conf_train}\n\nTest Confusion Matrix:\n{svm_conf_test}\n')
print(f'Train Classification Report:\n{classification_report(y_train, svm_train_pred)}\nTest Classification Report:\n{classification_report(y_test, svm_test_pred)}')

SVM Train Accuracy: 0.814 & Test Accuracy: 0.815

Train Confusion Matrix:
[[22382  2363]
 [ 6839 17900]]

Test Confusion Matrix:
[[9592 1009]
 [2917 7690]]

Train Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.90      0.83     24745
         1.0       0.88      0.72      0.80     24739

    accuracy                           0.81     49484
   macro avg       0.82      0.81      0.81     49484
weighted avg       0.82      0.81      0.81     49484

Test Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.90      0.83     10601
         1.0       0.88      0.72      0.80     10607

    accuracy                           0.81     21208
   macro avg       0.83      0.81      0.81     21208
weighted avg       0.83      0.81      0.81     21208



# Perceptron Accuracy and Metrics

In [246]:
print(f'Perceptron Train Accuracy: {np.round(line_train_score, 3)} & Test Accuracy: {np.round(line_test_score, 3)}\n')

print(f'Train Confusion Matrix:\n{line_conf_train}\n\nTest Confusion Matrix:\n{line_conf_test}\n')
print(f'Train Classification Report:\n{classification_report(y_train, line_train_pred)}\nTest Classification Report:\n{classification_report(y_test, line_test_pred)}')

Perceptron Train Accuracy: 0.695 & Test Accuracy: 0.697

Train Confusion Matrix:
[[16250  8495]
 [ 6616 18123]]

Test Confusion Matrix:
[[6961 3640]
 [2777 7830]]

Train Classification Report:
              precision    recall  f1-score   support

         0.0       0.71      0.66      0.68     24745
         1.0       0.68      0.73      0.71     24739

    accuracy                           0.69     49484
   macro avg       0.70      0.69      0.69     49484
weighted avg       0.70      0.69      0.69     49484

Test Classification Report:
              precision    recall  f1-score   support

         0.0       0.71      0.66      0.68     10601
         1.0       0.68      0.74      0.71     10607

    accuracy                           0.70     21208
   macro avg       0.70      0.70      0.70     21208
weighted avg       0.70      0.70      0.70     21208



# Random Forest Tree Accuracy and Metrics

In [247]:
print(f'Random Forest Train Accuracy: {np.round(rf_train_score, 3)} & Test Accuracy: {np.round(rf_test_score, 3)}\n')

print(f'Train Confusion Matrix:\n{rf_conf_train}\n\nTest Confusion Matrix:\n{rf_conf_test}\n')
print(f'Train Classification Report:\n{classification_report(y_train, rf_train_pred)}\nTest Classification Report:\n{classification_report(y_test, rf_test_pred)}')

Random Forest Train Accuracy: 0.816 & Test Accuracy: 0.816

Train Confusion Matrix:
[[22210  2535]
 [ 6556 18183]]

Test Confusion Matrix:
[[9493 1108]
 [2791 7816]]

Train Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.90      0.83     24745
         1.0       0.88      0.73      0.80     24739

    accuracy                           0.82     49484
   macro avg       0.82      0.82      0.82     49484
weighted avg       0.82      0.82      0.82     49484

Test Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.90      0.83     10601
         1.0       0.88      0.74      0.80     10607

    accuracy                           0.82     21208
   macro avg       0.82      0.82      0.81     21208
weighted avg       0.82      0.82      0.81     21208



# The Best Performing Model

    * The Best Performing Model is Random Forest
    
    * The 3 best performing models are Random Forest, SVM, and Logistic

In [18]:
sfs_forward = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=3), n_features_to_select=10, n_jobs=-1)
sfs_forward.fit(X_train_scaled, y_train.values.ravel())


sfs_backward = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=3), n_features_to_select=10,direction = 'backward', n_jobs=-1)
sfs_backward.fit(X_train_scaled, y_train.values.ravel())

SequentialFeatureSelector(direction='backward',
                          estimator=KNeighborsClassifier(n_neighbors=3),
                          n_features_to_select=10, n_jobs=-1)

In [21]:
cols_idxs_forward = sfs_forward.get_support(indices=True)
cols_idxs_forward

forward_selection_train = df_undersampled_train.iloc[:, cols_idxs_forward]
print(forward_selection_train)
print(cols_idxs_forward)

cols_idxs_backward = sfs_backward.get_support(indices=True)
cols_idxs_backward

backward_selection_train = df_undersampled_train.iloc[:, cols_idxs_backward]
print(backward_selection_train)
print(cols_idxs_backward)

forward_selection_test = df_undersampled_test.iloc[:, cols_idxs_forward]
backward_selection_test = df_undersampled_test.iloc[:, cols_idxs_backward]

       HeartDiseaseorAttack  PhysActivity    Fruits  HvyAlcoholConsump  \
0                 -0.384172      0.514775  0.669234          -0.139079   
1                  2.603001     -1.942597  0.669234          -0.139079   
2                 -0.384172      0.514775 -1.494245          -0.139079   
3                 -0.384172      0.514775  0.669234          -0.139079   
4                 -0.384172     -1.942597  0.669234          -0.139079   
...                     ...           ...       ...                ...   
49479             -0.384172     -1.942597  0.669234          -0.139079   
49480             -0.384172      0.514775 -1.494245          -0.139079   
49481             -0.384172      0.514775  0.669234          -0.139079   
49482             -0.384172      0.514775  0.669234          -0.139079   
49483             -0.384172      0.514775  0.669234          -0.139079   

       AnyHealthcare   GenHlth  MentHlth  PhysHlth  DiffWalk       Sex  
0           0.147309 -0.639948 -0.3326

In [39]:
sfs_forward_rf = SequentialFeatureSelector(RandomForestClassifier(), n_features_to_select=10, n_jobs=-1)
sfs_forward_rf.fit(X_train_scaled, y_train.values.ravel())


sfs_backward_rf = SequentialFeatureSelector(RandomForestClassifier(), n_features_to_select=10,direction = 'backward', n_jobs=-1)
sfs_backward_rf.fit(X_train_scaled, y_train.values.ravel())

cols_idxs_forward_rf = sfs_forward_rf.get_support(indices=True)
cols_idxs_forward_rf

forward_selection_train_rf = df_undersampled_train.iloc[:, cols_idxs_forward_rf]
print(forward_selection_train_rf)
print(cols_idxs_forward_rf)

cols_idxs_backward_rf = sfs_backward_rf.get_support(indices=True)
cols_idxs_backward_rf

backward_selection_train_rf = df_undersampled_train.iloc[:, cols_idxs_backward_rf]
print(backward_selection_train_rf)
print(cols_idxs_backward_rf)

forward_selection_test_rf = df_undersampled_test.iloc[:, cols_idxs_forward_rf]
backward_selection_test_rf = df_undersampled_test.iloc[:, cols_idxs_backward_rf]

            BMI    Stroke  PhysActivity   Veggies  AnyHealthcare   GenHlth  \
0     -1.221972 -0.225623      0.514775  0.418265       0.147309 -0.639948   
1      0.697964 -0.225623     -1.942597 -2.390830       0.147309  1.255723   
2     -1.061978 -0.225623      0.514775  0.418265       0.147309 -0.639948   
3     -0.581994 -0.225623      0.514775  0.418265       0.147309 -1.587783   
4      1.177948 -0.225623     -1.942597 -2.390830       0.147309 -0.639948   
...         ...       ...           ...       ...            ...       ...   
49479 -0.262004 -0.225623     -1.942597  0.418265       0.147309  0.307887   
49480 -0.262004 -0.225623      0.514775  0.418265       0.147309 -0.639948   
49481 -1.061978  4.432171      0.514775  0.418265       0.147309 -0.639948   
49482  0.057985 -0.225623      0.514775  0.418265       0.147309 -0.639948   
49483  0.857959 -0.225623      0.514775  0.418265       0.147309 -0.639948   

       MentHlth  PhysHlth  DiffWalk    Income  
0     -0.332699

In [287]:
rf_wrapper = RandomForestClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 18, random_state=42)
rf_wrapper.fit(forward_selection_train, y_train.values.ravel())

rfw_train_pred = rf_wrapper.predict(forward_selection_train)
rfw_train_score = accuracy_score(y_train, rfw_train_pred)
rfw_test_pred = rf_wrapper.predict(forward_selection_test)
rfw_test_score = accuracy_score(y_test, rfw_test_pred)

rfw_conf_train = confusion_matrix(y_train.values.ravel(), rfw_train_pred)
rfw_conf_test = confusion_matrix(y_test.values.ravel(), rfw_test_pred)

In [288]:
print(f'Random Forest Train Accuracy: {np.round(rfw_train_score, 3)} & Test Accuracy: {np.round(rfw_test_score, 3)}\n')

print(f'Train Confusion Matrix:\n{rfw_conf_train}\n\nTest Confusion Matrix:\n{rfw_conf_test}\n')
print(f'Train Classification Report:\n{classification_report(y_train, rfw_train_pred)}\nTest Classification Report:\n{classification_report(y_test, rfw_test_pred)}')

Random Forest Train Accuracy: 0.857 & Test Accuracy: 0.855

Train Confusion Matrix:
[[23319  1426]
 [ 5634 19105]]

Test Confusion Matrix:
[[9946  655]
 [2413 8194]]

Train Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.94      0.87     24745
         1.0       0.93      0.77      0.84     24739

    accuracy                           0.86     49484
   macro avg       0.87      0.86      0.86     49484
weighted avg       0.87      0.86      0.86     49484

Test Classification Report:
              precision    recall  f1-score   support

         0.0       0.80      0.94      0.87     10601
         1.0       0.93      0.77      0.84     10607

    accuracy                           0.86     21208
   macro avg       0.87      0.86      0.85     21208
weighted avg       0.87      0.86      0.85     21208



In [293]:
rf_wrapper = RandomForestClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 18, random_state=42)
rf_wrapper.fit(backward_selection_train, y_train.values.ravel())

rfw_train_pred = rf_wrapper.predict(backward_selection_train)
rfw_train_score = accuracy_score(y_train, rfw_train_pred)
rfw_test_pred = rf_wrapper.predict(backward_selection_test)
rfw_test_score = accuracy_score(y_test, rfw_test_pred)

rfw_conf_train = confusion_matrix(y_train.values.ravel(), rfw_train_pred)
rfw_conf_test = confusion_matrix(y_test.values.ravel(), rfw_test_pred)

print(f'Random Forest Train Accuracy: {np.round(rfw_train_score, 3)} & Test Accuracy: {np.round(rfw_test_score, 3)}\n')

print(f'Train Confusion Matrix:\n{rfw_conf_train}\n\nTest Confusion Matrix:\n{rfw_conf_test}\n')
print(f'Train Classification Report:\n{classification_report(y_train, rfw_train_pred)}\nTest Classification Report:\n{classification_report(y_test, rfw_test_pred)}')

Random Forest Train Accuracy: 0.88 & Test Accuracy: 0.874

Train Confusion Matrix:
[[23386  1359]
 [ 4597 20142]]

Test Confusion Matrix:
[[9950  651]
 [2011 8596]]

Train Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.95      0.89     24745
         1.0       0.94      0.81      0.87     24739

    accuracy                           0.88     49484
   macro avg       0.89      0.88      0.88     49484
weighted avg       0.89      0.88      0.88     49484

Test Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.94      0.88     10601
         1.0       0.93      0.81      0.87     10607

    accuracy                           0.87     21208
   macro avg       0.88      0.87      0.87     21208
weighted avg       0.88      0.87      0.87     21208



# XGBoost Model

In [268]:
xgb_gs = GridSearchCV(estimator = XGBClassifier(),
                     param_grid={'objective': ['binary:logistic', 'binary:logitraw', 'binary:hinge'],
                                'learning_rate':[0.3, 0.6, 0.9],
                                'max_depth': [6, 12, 18],
                                'min_child_weight': [1, 5, 10],
                                'subsamples': [0.5, 0.75, 1],
                                'colsample_bytree': [0.5, 0.7, 1],
                                'seed':[42]},
                     cv = 5,
                     verbose = 2,
                     n_jobs = -1)
xgb_gs.fit(X_selected_train, y_train.values.ravel())

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


Parameters: { "subsamples" } are not used.



GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     missing=nan, monotone_constraints=None,
                                     multi_strategy=None, n_estimators=None,
                                     n_jobs=None, num_parallel_tree=None,
                                     random_state=None, 

In [270]:
print(xgb_gs.best_params_)
print(xgb_gs.best_score_)

{'colsample_bytree': 0.5, 'learning_rate': 0.3, 'max_depth': 6, 'min_child_weight': 1, 'objective': 'binary:hinge', 'seed': 42, 'subsamples': 0.5}
0.8142430494072782


In [278]:
xgb = XGBClassifier(colsample_bytree = 0.5, learning_rate = 0.3, max_depth=6, min_child_weight=1,
                    objective='binary:hinge', seed=1, subsamples=0.5)
xgb.fit(X_selected_train, y_train.values.ravel())

xbg_train_pred = xgb.predict(X_selected_train)
xbg_train_score = accuracy_score(y_train.values.ravel(), xbg_train_pred)

xbg_test_pred = xgb.predict(X_selected_test)
xbg_test_score = accuracy_score(y_test.values.ravel(), xbg_test_pred)

print(f'Train Accuracy: {xbg_train_score}')
print(f'Classification Report for Train: \n{classification_report(y_train.values.ravel(), xbg_train_pred)}\n')

print(f'Test Accuracy: {xbg_test_score}')
print(f'Classification Report for Test: \n{classification_report(y_test.values.ravel(), xbg_test_pred)}\n')

Train Accuracy: 0.8146673672298117
Classification Report for Train: 
              precision    recall  f1-score   support

         0.0       0.77      0.91      0.83     24745
         1.0       0.88      0.72      0.80     24739

    accuracy                           0.81     49484
   macro avg       0.83      0.81      0.81     49484
weighted avg       0.83      0.81      0.81     49484


Test Accuracy: 0.8159656733308186
Classification Report for Test: 
              precision    recall  f1-score   support

         0.0       0.77      0.91      0.83     10601
         1.0       0.88      0.73      0.80     10607

    accuracy                           0.82     21208
   macro avg       0.83      0.82      0.81     21208
weighted avg       0.83      0.82      0.81     21208




Parameters: { "subsamples" } are not used.



# Extreme Machine Learning Model

In [34]:
input_length = X_selected_train.shape[1]
hidden_units = 1000

win = np.random.normal(size = [input_length, hidden_units])

def input_to_hidden(x):
    a = np.dot(x, win)
    a = np.maximum(a, 0, a)
    return a

x_h_v = input_to_hidden(X_selected_train)
x_h_t = np.transpose(x_h_v)
w_out = np.dot(np.linalg.inv(np.dot(x_h_t, x_h_v)), np.dot(x_h_t, y_train))

def predict(x):
    x = input_to_hidden(x)
    y = np.dot(x, w_out)
    return y

extreme_pred = predict(X_selected_test)
num_correct = 0
total = extreme_pred.shape[0]
for i in range(total):
    predicted = np.argmax(extreme_pred[i])
    test = np.argmax(y_test.values.ravel()[i])
    num_correct = num_correct + (1 if predicted == test else 0)
    
print('Accuracy: {:f}'.format(num_correct/total))

extreme_pred = predict(X_selected_train)
num_correct = 0
total = extreme_pred.shape[0]
for i in range(total):
    predicted = np.argmax(extreme_pred[i])
    train = np.argmax(y_train.values.ravel()[i])
    num_correct = num_correct + (1 if predicted == train else 0)
    
print('Accuracy: {:f}'.format(num_correct/total))

Accuracy: 1.000000
Accuracy: 1.000000


In [35]:
input_length = X_selected_train.shape[1]
hidden_units = 1000

win = np.random.normal(size = [input_length, hidden_units])

def input_to_hidden(x):
    a = np.dot(x, win)
    a = np.maximum(a, 0, a)
    return a

x_h_v = input_to_hidden(X_selected_train)
x_h_t = np.transpose(x_h_v)
w_out = np.dot(np.linalg.inv(np.dot(x_h_t, x_h_v)), np.dot(x_h_t, y_train))

def predict(x):
    x = input_to_hidden(x)
    y = np.dot(x, w_out)
    return y

extreme_pred = predict(X_selected_test)
num_correct = 0
total = extreme_pred.shape[0]
for i in range(total):
    predicted = np.argmax(extreme_pred[i])
    test = np.argmax(y_test.values.ravel()[i])
    num_correct = num_correct + (1 if predicted == test else 0)
    
print('Accuracy of test set: {:f}'.format(num_correct/total))

extreme_pred = predict(X_selected_train)
num_correct = 0
total = extreme_pred.shape[0]
for i in range(total):
    predicted = np.argmax(extreme_pred[i])
    train = np.argmax(y_train.values.ravel()[i])
    num_correct = num_correct + (1 if predicted == train else 0)
    
print('Accuracy of train set: {:f}'.format(num_correct/total))

Accuracy: 1.000000
Accuracy: 1.000000


# Basic Deep Learning Model

In [16]:
nn = Sequential()
nn.add(Dense(20, input_shape=(9,), activation='relu'))
nn.add(Dense(10, activation = 'relu'))
nn.add(Dense(1, activation='sigmoid'))

nn.compile(loss = 'binary_crossentropy', optimizer='adam', metrics='accuracy')
nn.fit(X_selected_train, y_train.values.ravel(), epochs=50, batch_size=10)

_, train_accuracy = nn.evaluate(X_selected_train, y_train)
_2, test_accuracy = nn.evaluate(X_selected_test, y_test)

print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Train Accuracy: 0.8152534365653992
Test Accuracy: 0.8144568204879761


In [40]:
nn = Sequential()
nn.add(Dense(20, input_shape=(9,), activation='relu'))
nn.add(Dense(10, activation = 'relu'))
nn.add(Dense(1, activation='sigmoid'))

nn.compile(loss = 'binary_crossentropy', optimizer='adam', metrics='accuracy')
nn.fit(backward_selection_train, y_train.values.ravel(), epochs=50, batch_size=10)

_, train_accuracy = nn.evaluate(backward_selection_train, y_train)
_2, test_accuracy = nn.evaluate(backward_selection_test, y_test)

print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Train Accuracy: 0.8801228404045105
Test Accuracy: 0.8755186796188354


# Ensemble Model

In [297]:
model_1 = RandomForestClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 18, random_state=42)
model_2 = SVC(C = 5, degree = 3, gamma = 'scale', kernel = 'rbf')
model_3 = LogisticRegression(C = 1, penalty = 'l1', random_state=42, solver = 'saga')

ensemble_model = VotingClassifier(estimators = [('rf', model_1), ('svm', model_2), ('lg', model_3)],
                                 voting = 'hard',
                                 n_jobs = -1,
                                 verbose = True)
ensemble_model.fit(X_selected_train, y_train.values.ravel())
ensemble_pred_train = ensemble_model.predict(X_selected_train)
ensemble_pred_test = ensemble_model.predict(X_selected_test)

In [299]:
ensemble_train_score = accuracy_score(y_train.values.ravel(), ensemble_pred_train)
ensemble_test_score = accuracy_score(y_test.values.ravel(), ensemble_pred_test)

print(f'Train Accuracy: {ensemble_train_score}')
print(f'Classification Report for Train: \n{classification_report(y_train.values.ravel(), ensemble_pred_train)}\n')

print(f'Test Accuracy: {ensemble_test_score}')
print(f'Classification Report for Test: \n{classification_report(y_test.values.ravel(), ensemble_pred_test)}\n')

Train Accuracy: 0.8156777948427775
Classification Report for Train: 
              precision    recall  f1-score   support

         0.0       0.77      0.90      0.83     24745
         1.0       0.88      0.73      0.80     24739

    accuracy                           0.82     49484
   macro avg       0.82      0.82      0.81     49484
weighted avg       0.82      0.82      0.81     49484


Test Accuracy: 0.8156827612221803
Classification Report for Test: 
              precision    recall  f1-score   support

         0.0       0.77      0.90      0.83     10601
         1.0       0.88      0.73      0.80     10607

    accuracy                           0.82     21208
   macro avg       0.82      0.82      0.81     21208
weighted avg       0.82      0.82      0.81     21208




In [26]:
model_1 = RandomForestClassifier(criterion = 'gini', max_depth = 10, min_samples_split = 18, random_state=42)
model_2 = SVC(C = 5, degree = 3, gamma = 'scale', kernel = 'rbf')
model_3 = LogisticRegression(C = 1, penalty = 'l1', random_state=42, solver = 'saga')

ensemble_model = VotingClassifier(estimators = [('rf', model_1), ('svm', model_2), ('lg', model_3)],
                                 voting = 'hard',
                                 n_jobs = -1,
                                 verbose = True)
ensemble_model.fit(backward_selection_train, y_train.values.ravel())
ensemble_pred_train = ensemble_model.predict(backward_selection_train)
ensemble_pred_test = ensemble_model.predict(backward_selection_test)

ensemble_train_score = accuracy_score(y_train.values.ravel(), ensemble_pred_train)
ensemble_test_score = accuracy_score(y_test.values.ravel(), ensemble_pred_test)

print(f'Train Accuracy: {ensemble_train_score}')
print(f'Classification Report for Train: \n{classification_report(y_train.values.ravel(), ensemble_pred_train)}\n')

print(f'Test Accuracy: {ensemble_test_score}')
print(f'Classification Report for Test: \n{classification_report(y_test.values.ravel(), ensemble_pred_test)}\n')

Train Accuracy: 0.877394713442729
Classification Report for Train: 
              precision    recall  f1-score   support

         0.0       0.83      0.94      0.89     24745
         1.0       0.94      0.81      0.87     24739

    accuracy                           0.88     49484
   macro avg       0.88      0.88      0.88     49484
weighted avg       0.88      0.88      0.88     49484


Test Accuracy: 0.872736703130894
Classification Report for Test: 
              precision    recall  f1-score   support

         0.0       0.83      0.94      0.88     10601
         1.0       0.93      0.81      0.86     10607

    accuracy                           0.87     21208
   macro avg       0.88      0.87      0.87     21208
weighted avg       0.88      0.87      0.87     21208


