# Model training

## Prepare data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
heart_dataset = pd.read_csv('heart_dataset_preprocessed.csv')

# Split dataset
heart_dataset_target = heart_dataset['HeartDisease']
heart_dataset_data = heart_dataset.drop(columns='HeartDisease')

# Make dataset smaller for testing
heart_dataset_data, _, heart_dataset_target, _ = train_test_split(
    heart_dataset_data, heart_dataset_target, test_size=0.95, random_state=42, stratify=heart_dataset_target)

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
124888,0.127731,No,No,No,0.066667,0.100000,Yes,Female,60-64,White,No,No,Excellent,0.217391,No,No,No
221972,0.135458,No,No,No,0.000000,0.000000,No,Female,70-74,Black,Yes,Yes,Very good,0.391304,No,No,No
254188,0.321140,No,No,No,0.000000,0.000000,No,Female,45-49,White,No,No,Very good,0.260870,No,No,No
56215,0.205602,Yes,Yes,No,0.000000,0.000000,Yes,Female,70-74,White,No,Yes,Very good,0.347826,No,No,No
256278,0.284317,No,No,No,0.000000,0.000000,No,Male,35-39,White,No,Yes,Good,0.347826,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180856,0.222262,No,No,No,0.000000,0.000000,No,Male,30-34,White,No,Yes,Fair,0.347826,No,No,No
8631,0.157431,Yes,No,No,0.000000,0.000000,No,Female,80 or older,White,No,Yes,Excellent,0.260870,No,No,Yes
113561,0.244597,No,No,No,0.166667,0.233333,Yes,Female,55-59,White,No,Yes,Good,0.304348,No,No,Yes
279178,0.189424,No,No,No,0.000000,0.000000,No,Male,75-79,White,No,Yes,Very good,0.304348,No,No,No


## Baseline

We define a baseline model as a reference to the actual model with DummyClassifier from Scikit-Learn. Because our data is very imbalanced, we are using the stratified strategy.

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# X_train: Includes all independent variables, these will be used to train the model; because test_size=0.3, only 70% of data will be used for training
# X_test: Remaining 30% of the independent variables which will not be used in the training phase and will be used to make predictions to evaluate the model
# y_train: Dependent variable which needs to be predicted by the model
# y_test: This data has category labels for your test data, these labels will be used for evaluation
X_train, X_test, y_train, y_test = train_test_split(heart_dataset_data, heart_dataset_target, test_size = 0.3, random_state = 42)

dummy_classifier = DummyClassifier(strategy='stratified')
dummy_classifier.fit(X_train, y_train)
y_predicted = dummy_classifier.predict(X_test)
results_dict = {'accuracy': accuracy_score(y_test, y_predicted),
                'recall': recall_score(y_test, y_predicted),
                'precision': precision_score(y_test, y_predicted),
                'f1_score': f1_score(y_test, y_predicted)}
display(results_dict)

{'accuracy': 0.8369184592296148,
 'recall': 0.09686609686609686,
 'precision': 0.0921409214092141,
 'f1_score': 0.09444444444444444}

**Conclusion**

We achieve an 0.84 accuracy, a 0.09 recall, a 0.09 precision and a 0.09 F1-Score.


## Generic modelling and evaluation method

Function that generate different models and evaluate them

In [10]:
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

scorer = make_scorer(fbeta_score, beta=2)

def model_eval(estimator, parameters, X, y, scoring=scorer, verbose=0):
    # specify the cross validation
    inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    # create the grid search instance
    grid_search_estimator = GridSearchCV(estimator=estimator,
                                         param_grid=parameters,
                                         scoring=scoring,
                                         cv=inner_cv,
                                         return_train_score=False,
                                         verbose=verbose
                                        )


    nested_cv_score = cross_validate(grid_search_estimator,
                                     X=X, y=y,
                                     verbose=verbose,
                                     cv=outer_cv, scoring=scorer)

#     display(nested_cv_score.mean())
    return nested_cv_score

## Knn

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# create an estimator
knn_estimator = KNeighborsClassifier()

# specify the parameter grid
parameters = {
    'n_neighbors': [2, 3, 5]
}

knn_result = model_eval(estimator=knn_estimator, parameters=parameters, X=heart_dataset_data, y=heart_dataset_target)

12 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/benediktluth/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/benediktluth/opt/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py", line 198, in fit
    return self._fit(X, y)
  File "/Users/benediktluth/opt/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 400, in _fit
    X, y = self._validate_data(X, y, accept_sparse="csr", multi_output=True)
  File "/Users/benediktluth/opt/anaconda3/lib/pyth

NotFittedError: All estimators failed to fit

In [164]:
print("Mean", knn_result['test_score'].mean())
for score in knn_result['test_score']:
    print(score)

Mean 0.16398140810083794
0.12616201859229748
0.17739816031537448
0.1762402088772846
0.17612524461839527


## Random Forest

In [165]:
from sklearn.ensemble import RandomForestClassifier

# create an estimator
forest_estimator = RandomForestClassifier()

# specify the parameter grid
parameters = {
    'n_estimators': [2, 3, 4, 5]
}

forest_result = model_eval(estimator=forest_estimator, parameters=parameters, X=heart_dataset_data, y=heart_dataset_target)

In [166]:
print("Mean", forest_result['test_score'].mean())
for score in forest_result['test_score']:
    print(score)

Mean 0.1806964458049986
0.19626168224299068
0.15842839036755388
0.18873762376237624
0.1793580868470736


## Support Vector Machine

In [167]:
from sklearn.svm import SVC

# create an estimator
svc_estimator = SVC(random_state=0)

# specify the parameter grid
parameters = {
    'kernel': ['linear', 'poly']
}

svc_result = model_eval(estimator=svc_estimator, parameters=parameters, X=heart_dataset_data, y=heart_dataset_target)

In [168]:
print("Mean", svc_result['test_score'].mean())
for score in svc_result['test_score']:
    print(score)

Mean 0.07955710385257753
0.07368421052631578
0.07127583749109051
0.07446808510638298
0.0988002822865208


## Neural Net

In [169]:
from sklearn.neural_network import MLPClassifier

# create an estimator
nn_estimator = MLPClassifier(random_state=1, max_iter=300)

# specify the parameter grid
parameters = {
    'activation': ['identity', 'logistic', 'tanh', 'relu']
}

nn_result = model_eval(estimator=nn_estimator, parameters=parameters, X=heart_dataset_data, y=heart_dataset_target)



In [170]:
print("Mean", nn_result['test_score'].mean())
for score in nn_result['test_score']:
    print(score)

Mean 0.15287409227221466
0.13494809688581316
0.17749497655726723
0.12732278045423262
0.17173051519154559
