In [1]:
# imports
import numpy as np
import pandas as pd

import tensorflow as tf
import matplotlib.pyplot as plt
import keras.backend as K

from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

In [2]:
def get_f1(y_true, y_pred): #taken from old keras source code
    """Return the F1 score of `y_pred` against `y_true` as a backend tensor.

    Both inputs are clipped to [0, 1] and rounded before counting, so the
    predictions are binarised by rounding and the module-level `conf`
    threshold is not consulted here.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        2 * precision * recall / (precision + recall), with `K.epsilon()`
        added to every denominator to avoid division by zero.

    NOTE(review): when passed to `model.compile` this is presumably evaluated
    per batch and averaged, which is not the same as F1 over the full
    dataset -- confirm before quoting the number.
    """
    eps = K.epsilon()
    # counts of (true & predicted) positives, actual positives, predicted positives
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = hits / (flagged_pos + eps)
    recall = hits / (actual_pos + eps)
    return 2*(precision*recall)/(precision+recall+eps)

In [3]:
# loading data
# Location of the BRFSS 2015 heart-disease indicators CSV, relative to the
# notebook's working directory.
file_path = '../input/heart-disease-health-indicators-dataset/heart_disease_health_indicators_BRFSS2015.csv'
# Read the whole file into a DataFrame; later cells use its
# 'HeartDiseaseorAttack' column as the label.
df = pd.read_csv(file_path)

In [4]:
# setting parameters
n_epochs=150  # number of epochs passed to model.fit
batch_size=256  # mini-batch size passed to model.fit
lr=0.01  # initial learning rate handed to the learning-rate schedule
conf=0.5  # decision threshold passed to the BinaryAccuracy metric

In [5]:
#removing duplicates
# With no arguments, drop_duplicates() removes rows that are identical in every
# column and keeps the first occurrence. `df` is rebound to the result, so the
# raw frame is only recoverable by re-running the loading cell.
df = df.drop_duplicates()

In [6]:
# checking skewness of classes: number of rows per label value
df['HeartDiseaseorAttack'].value_counts()

In [7]:
# partition the rows by label: positives (label == 1) and negatives (label == 0)
df_pos = df.loc[df['HeartDiseaseorAttack'].eq(1)]
df_neg = df.loc[df['HeartDiseaseorAttack'].eq(0)]

In [8]:
# (rows, columns) of the negative class before undersampling
df_neg.shape

In [9]:
#Undersampling to remove skewness of Dataset
# Keep a random 10% of the negative rows, seeded for reproducibility.
# Sample WITHOUT replacement: the frame was de-duplicated earlier, and drawing
# with replacement would re-introduce repeated rows that can then end up on
# both sides of the train/val/test split.
# NOTE(review): this rebinds df_neg, so re-running the cell shrinks it again.
df_neg=df_neg.sample(frac=0.1, replace=False, random_state=1)

In [10]:
# (rows, columns) of the negative class after undersampling
df_neg.shape

In [11]:
# Stack all positive rows on top of the undersampled negative rows.
# ignore_index=True discards the original row labels and renumbers the rows
# from 0. `df` now refers to the rebalanced frame, not the loaded one.
df=pd.concat([df_pos, df_neg],ignore_index=True)

In [12]:
# shuffling
# frac=1 returns every row in random order. The shuffle is seeded so the row
# order -- and with it the membership of the train/val/test splits, which are
# drawn from this order with a fixed random_state -- is the same on every run.
df = df.sample(frac = 1, random_state=1)

In [13]:
# separating the X and Y parts of dataset:
# features are every column except the label; the label is the target series
x_df = df.drop(columns=['HeartDiseaseorAttack'])
y_df = df["HeartDiseaseorAttack"]
x_df.head()

In [14]:
# columns that need to be normalised
# Only these feature columns are rescaled by the normalising cell; all other
# columns are left untouched.
# NOTE(review): presumably the remaining columns are already 0/1 flags -- verify.
cols=['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']

In [15]:
# normalising: min-max scale the selected columns to [0, 1]
col_min = x_df[cols].min()
col_range = x_df[cols].max() - col_min
# A constant column has a zero range; dividing by it would fill the column with
# NaN. Scale such columns by 1 instead, which maps them to 0.
col_range = col_range.replace(0, 1)
# NOTE(review): min/max are taken over the full frame, before the
# train/val/test split, so validation and test rows influence the scaling.
# Consider computing the statistics on the training split only.
x_df[cols] = (x_df[cols] - col_min) / col_range

In [16]:
# first rows of the feature frame after scaling, as a visual sanity check
x_df.head()

In [17]:
# converting to numpy arrays
# Despite the names, at this point x_train / y_train hold the FULL feature
# matrix and label vector; they are only narrowed to the training split by the
# train-val-test split cell.
x_train = x_df.to_numpy()
y_train = y_df.to_numpy()

In [18]:
# train-val-test split
# Start from the full arrays every time: this cell rebinds x_train / y_train,
# so reading them as input would split an already-split array when the cell is
# re-run. On a fresh run the result is unchanged.
x_all, y_all = x_df.to_numpy(), y_df.to_numpy()
# hold out 15% of all rows as the test set ...
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.15, random_state=42)
# ... then 20% of the remaining rows (17% of the total) as the validation set
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.20, random_state=42)

In [19]:
# turn each label vector into a single-column array of shape (n, 1), the same
# rank as the model's one-unit output
y_train, y_test, y_val = (
    labels.reshape(-1, 1) for labels in (y_train, y_test, y_val)
)

In [20]:
# shapes of the train, test and validation features/labels, in that order
for split_array in (x_train, y_train, x_test, y_test, x_val, y_val):
    print(split_array.shape)

In [21]:
# building the model
# Width of the first hidden layer. This is a fixed hyper-parameter: it must not
# be tied to x_train.shape[0], which is the number of training rows and would
# make the layer as wide as the dataset is long.
first_layer_units = 64
model=Sequential()
# first layer; input_dim is the number of feature columns
model.add(Dense(first_layer_units,input_dim=x_train.shape[1],activation='relu'))
model.add(Dense(128,activation='relu')) # hidden layers
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(1,activation='sigmoid')) # sigmoid output: one probability per row

In [22]:
# setting up the learning scheduler
initial_learning_rate =lr
# ExponentialDecay counts optimizer steps (one per mini-batch), not epochs, so
# decay_steps has to be expressed in batches. One epoch is
# ceil(rows / batch_size) steps because the final partial batch is still a step.
steps_per_epoch = int(np.ceil(x_train.shape[0] / batch_size))
# With staircase=True the rate is multiplied by 0.96 once per completed epoch.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=steps_per_epoch,
    decay_rate=0.96,
    staircase=True)

In [23]:
# compiling the model
# Metric names are set explicitly. Keras otherwise auto-generates them and
# appends a suffix (e.g. 'recall_1') when a metric is instantiated again, so
# re-running this cell would change the keys of H.history and break the
# plotting cell that reads 'recall' / 'binary_accuracy'.
# Recall uses the same decision threshold (`conf`) as BinaryAccuracy.
# get_f1 binarises by rounding, so it always thresholds at 0.5.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='binary_accuracy', threshold=conf),
        tf.keras.metrics.Recall(name='recall', thresholds=conf),
        get_f1,
    ],
)

In [24]:
# print the layer-by-layer architecture and parameter counts of the model
model.summary()

In [25]:
# Train on the training split and evaluate on the validation split after each
# epoch. H.history keeps the per-epoch value of the loss and every metric.
H = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=n_epochs,
    validation_data=(x_val, y_val),
)

In [26]:
# Evaluation on the held-out test split.
# evaluate() returns the loss followed by the metrics in the order they were
# given to model.compile.
loss, accuracy, recall, f1_score = model.evaluate(x_test, y_test)
report_lines = (
    ('Loss: %.4f', loss),
    ('Accuracy: %.2f', accuracy*100),  # shown as a percentage
    ('Recall: %.4f', recall),
    ('F1 Score: %.4f', f1_score),
)
for template, value in report_lines:
    print(template % value)

In [27]:
# evaluating and visual analysis of performance
print("Evaluating network...")
predictions = model.predict(x=x_test, batch_size=128)


def plot_history(history, metric, title, ylabel, train_label, val_label):
    """Plot the per-epoch training and validation curves of one metric.

    Args:
        history: mapping of metric name to its list of per-epoch values
            (the `history` attribute of the object returned by `model.fit`).
        metric: key of the training curve; the validation curve is read from
            "val_" + metric.
        title: figure title.
        ylabel: label of the y axis.
        train_label: legend entry for the training curve.
        val_label: legend entry for the validation curve.

    Raises:
        KeyError: if `metric` or "val_" + metric is not in `history`.
    """
    train_curve = history[metric]
    val_curve = history["val_" + metric]
    # Take the epoch axis from the recorded values rather than from n_epochs,
    # so the plot still works if training stopped early or n_epochs was edited
    # after the model was fitted.
    epochs = np.arange(0, len(train_curve))
    plt.figure()
    plt.title(title)
    plt.plot(epochs, train_curve, label=train_label)
    plt.plot(epochs, val_curve, label=val_label)
    plt.xlabel("Epoch #")
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()


# plot the training loss and accuracy
# epoch indices actually recorded during training
N = np.arange(0, len(H.history["loss"]))
plt.style.use("ggplot")

# (history key, title, y-axis label, train legend, validation legend)
curves = (
    ("recall", "Training/Validation Recall on Dataset", "Recall",
     "train_rec", "val_rec"),
    ("get_f1", "Training/Validation F1 Score on Dataset", "F1 Score",
     "train_f1", "val_f1"),
    ("binary_accuracy", "Training/Validation Accuracy on Dataset", "Accuracy/100",
     "train_acc", "val_acc"),
    ("loss", "Training/Validation Loss on Dataset", "Loss",
     "train_loss", "val_loss"),
)
for metric, title, ylabel, train_label, val_label in curves:
    plot_history(H.history, metric, title, ylabel, train_label, val_label)