In [17]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import balanced_accuracy_score
from imblearn.under_sampling import RandomUnderSampler

## Choosing Dataset

In [42]:
# Available cleaned 2015 dataset variants, keyed by how missing values were
# treated (rows dropped vs. values imputed, per the file names).
DATASETS = {
    'droppedNaN': r'../../data/2015_cleaned_droppedNaN.csv',
    'imputedNaN': r'../../data/2015_cleaned_imputedNaN.csv',
}

# Change this single key to re-run the notebook on the other variant instead
# of commenting path assignments in and out.
DATASET = 'droppedNaN'

# `path` is what the loading cell reads.
path = DATASETS[DATASET]

## Preparing Dataset for MLP Classification

In [43]:
# Load the selected CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,BMI,Smoker,Diabetes,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,SexIsMale,AgeGroup
0,0.0,1.0,1.0,0.320255,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.666667
1,0.0,0.0,0.0,0.148641,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5
2,0.0,1.0,1.0,0.183896,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.666667
3,0.0,1.0,0.0,0.164904,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.833333
4,0.0,1.0,1.0,0.134994,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.833333


In [44]:
# Separate the target column (y) from the remaining feature columns (X).
# NOTE(review): the rendered df.head() shows an 'Unnamed: 0' header. If that is
# a real column (e.g. a saved index) rather than just the displayed row index,
# it ends up in X as a feature; confirm against the CSV.
y = df['HeartDiseaseorAttack']
X = df.drop(columns=['HeartDiseaseorAttack'])

**Under Sampling**

As seen in 03_check_target_balancing, the dataset is heavily imbalanced towards patients with no heart disease or attack.

Since we have a lot of data (> 400k rows), we choose undersampling to account for that imbalance.

In [45]:
# Randomly undersample X and y; the fixed seed makes the selected subset
# reproducible across runs.
rus = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = rus.fit_resample(X=X, y=y)

In [46]:
# Hold out 20% of the undersampled rows as the test set (seeded so the split
# is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=1,
)

## MLP Classification

We run the neural network once with guessed hyperparameters to check that everything works as expected.

In [47]:
# Smoke-test configuration: two hidden layers of 20 units each, logistic
# activation, Adam solver; seeded so the run is repeatable.
classifier = MLPClassifier(
    hidden_layer_sizes=(20, 20),
    activation="logistic",
    solver="adam",
    random_state=1,
)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Kept as the cell's last expression so the notebook displays the score.
balanced_accuracy_score(y_test, y_pred)

0.739210694891214

## Results

**Balanced accuracy**
* Dropped NaN Values: 0.739210694891214
* Imputed NaN Values: 0.7504035572664426

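There seems to be a slight increase in performance (about 0.011 in balanced accuracy) when the rows with imputed values are included as well. Note that each score comes from a single run with one random seed.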

## Results from Hyper Parameter Tuning

*The following hyperparameters delivered the best results:*