# Model training

In [84]:
import pandas as pd
import numpy as np

In [171]:
from sklearn.model_selection import train_test_split

# Load the preprocessed heart-disease table from disk
heart_dataset = pd.read_csv('heart_dataset_preprocessed.csv')

# Split dataset into the label column and the remaining feature columns
heart_dataset_target = heart_dataset['HeartDisease']
heart_dataset_data = heart_dataset.drop(columns='HeartDisease')

# Make dataset smaller for testing: keep a stratified 5% subsample.
# train_test_split returns [X_train, X_test, y_train, y_test]; the [::2] slice
# keeps only the two "train" pieces. The discarded 95% is deliberately not
# bound to `_`: assigning `_` in IPython/Jupyter shadows the kernel's
# last-output variable, which then stops being updated.
heart_dataset_data, heart_dataset_target = train_test_split(
    heart_dataset_data, heart_dataset_target,
    test_size=0.95, random_state=42, stratify=heart_dataset_target)[::2]

heart_dataset_data

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,...,SleepTime,Asthma,KidneyDisease,SkinCancer,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
124888,0.127731,0,0,0,0.066667,0.100000,1,1,0.594203,0.0,...,0.217391,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
221972,0.135458,0,0,0,0.000000,0.000000,0,1,0.739130,1.0,...,0.391304,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
254188,0.321140,0,0,0,0.000000,0.000000,0,1,0.376812,0.0,...,0.260870,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
56215,0.205602,1,1,0,0.000000,0.000000,1,1,0.739130,0.0,...,0.347826,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
256278,0.284317,0,0,0,0.000000,0.000000,0,0,0.231884,0.0,...,0.347826,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180856,0.222262,0,0,0,0.000000,0.000000,0,0,0.159420,0.0,...,0.347826,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
8631,0.157431,1,0,0,0.000000,0.000000,0,1,1.000000,0.0,...,0.260870,0,0,1,0.0,0.0,0.0,0.0,0.0,1.0
113561,0.244597,0,0,0,0.166667,0.233333,1,1,0.521739,0.0,...,0.304348,0,0,1,0.0,0.0,0.0,0.0,0.0,1.0
279178,0.189424,0,0,0,0.000000,0.000000,0,0,0.811594,0.0,...,0.304348,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


In [161]:
from sklearn.metrics import fbeta_score, make_scorer

# Shared evaluation metric for every model in this notebook:
# F-beta with beta=2 (a built-in scorer name such as 'accuracy' can be
# substituted here instead).
scorer = make_scorer(score_func=fbeta_score, beta=2)

In [162]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

def model_eval(estimator, parameters, X, y, scoring=scorer, verbose=0):
    """Score an estimator with nested, stratified cross-validation.

    An inner 4-fold grid search tunes the hyper-parameters, and an outer
    4-fold loop evaluates the tuned search on held-out folds.

    Parameters
    ----------
    estimator : estimator object
        Unfitted estimator handed to ``GridSearchCV``.
    parameters : dict
        Parameter grid forwarded as ``param_grid`` to ``GridSearchCV``.
    X, y
        Feature data and target labels forwarded to ``cross_validate``.
    scoring : str or callable, optional
        Metric used both for selecting hyper-parameters in the inner loop
        and for scoring the outer folds. Defaults to the module-level
        ``scorer``.
    verbose : int, optional
        Verbosity level passed to both ``GridSearchCV`` and
        ``cross_validate``.

    Returns
    -------
    dict
        The result of ``cross_validate``; its ``'test_score'`` entry holds
        one score per outer fold.
    """
    # Both splitters shuffle with a fixed seed so repeated runs use the same folds.
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    # Inner loop: exhaustive search over `parameters`.
    grid_search_estimator = GridSearchCV(estimator=estimator,
                                         param_grid=parameters,
                                         scoring=scoring,
                                         cv=inner_cv,
                                         return_train_score=False,
                                         verbose=verbose
                                        )

    # Outer loop: score with the caller-supplied `scoring` so model selection
    # and evaluation use the same metric (previously the module-level
    # `scorer` was always used here, ignoring the argument).
    nested_cv_score = cross_validate(grid_search_estimator,
                                     X=X, y=y,
                                     verbose=verbose,
                                     cv=outer_cv, scoring=scoring)

    return nested_cv_score

## Baseline

## K-Nearest Neighbors (KNN)

In [163]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with scikit-learn's default settings
knn_estimator = KNeighborsClassifier()

# neighbourhood sizes explored by the inner grid search
parameters = dict(n_neighbors=[2, 3, 5])

# nested cross-validation on the subsampled heart data
knn_result = model_eval(
    knn_estimator,
    parameters,
    heart_dataset_data,
    heart_dataset_target,
)

In [164]:
# Report the average score first, then one line per outer fold
knn_fold_scores = knn_result['test_score']
print("Mean", knn_fold_scores.mean())
for fold_score in knn_fold_scores:
    print(fold_score)

Mean 0.16398140810083794
0.12616201859229748
0.17739816031537448
0.1762402088772846
0.17612524461839527


## Random Forest

In [165]:
from sklearn.ensemble import RandomForestClassifier

# create an estimator; the fixed seed makes the forest's random choices
# repeatable, so re-running the notebook reproduces the reported scores
forest_estimator = RandomForestClassifier(random_state=42)

# specify the parameter grid (number of trees tried by the inner grid search)
parameters = {
    'n_estimators': [2, 3, 4, 5]
}

# nested cross-validation on the subsampled heart data
forest_result = model_eval(estimator=forest_estimator, parameters=parameters, X=heart_dataset_data, y=heart_dataset_target)

In [166]:
# Report the average score first, then one line per outer fold
forest_fold_scores = forest_result['test_score']
print("Mean", forest_fold_scores.mean())
for fold_score in forest_fold_scores:
    print(fold_score)

Mean 0.1806964458049986
0.19626168224299068
0.15842839036755388
0.18873762376237624
0.1793580868470736


## Support Vector Machine

In [167]:
from sklearn.svm import SVC

# support vector classifier, seeded with a fixed random_state
svc_estimator = SVC(random_state=0)

# kernels compared by the inner grid search
parameters = dict(kernel=['linear', 'poly'])

# nested cross-validation on the subsampled heart data
svc_result = model_eval(
    svc_estimator,
    parameters,
    heart_dataset_data,
    heart_dataset_target,
)

In [168]:
# Report the average score first, then one line per outer fold
svc_fold_scores = svc_result['test_score']
print("Mean", svc_fold_scores.mean())
for fold_score in svc_fold_scores:
    print(fold_score)

Mean 0.07955710385257753
0.07368421052631578
0.07127583749109051
0.07446808510638298
0.0988002822865208


## Neural Net

In [169]:
from sklearn.neural_network import MLPClassifier

# multi-layer perceptron, seeded and capped at 300 training iterations
nn_estimator = MLPClassifier(max_iter=300, random_state=1)

# activation functions compared by the inner grid search
parameters = dict(activation=['identity', 'logistic', 'tanh', 'relu'])

# nested cross-validation on the subsampled heart data
nn_result = model_eval(
    nn_estimator,
    parameters,
    heart_dataset_data,
    heart_dataset_target,
)



In [170]:
# Report the average score first, then one line per outer fold
nn_fold_scores = nn_result['test_score']
print("Mean", nn_fold_scores.mean())
for fold_score in nn_fold_scores:
    print(fold_score)

Mean 0.15287409227221466
0.13494809688581316
0.17749497655726723
0.12732278045423262
0.17173051519154559
