In [None]:
# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Importing the dataset

dataset = pd.read_csv('../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')

In [None]:
# Lets look at the top 5 rows
dataset.head()

In [None]:
# Checking for null values
dataset.isnull().sum()

In [None]:
# Feature Selection
# Rank every candidate column by the importance an extra-trees ensemble gives it.
plt.rcParams['figure.figsize'] = 15, 6
sns.set_style("darkgrid")

x = dataset.iloc[:, :-1]  # every column except the last one
y = dataset.iloc[:, -1]   # the last column, used as the target

from sklearn.ensemble import ExtraTreesClassifier

# The ensemble is randomised; a fixed seed keeps the ranking (and therefore the
# features picked from it further down) the same on every run.
model = ExtraTreesClassifier(random_state=0)
model.fit(x, y)
print(model.feature_importances_)

# Label each importance with its column name and plot the largest ones.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(12).plot(kind='barh')
plt.show()

In [None]:
# We will select only 4 features : age, time, ejection_fraction, serum_creatinine

-----

Finding outliers

In [None]:
# Boxplot for ejection_fraction
sns.boxplot(x=dataset.ejection_fraction)
plt.show()

In [None]:
# We can see there are two outliers. Lets remove them

In [None]:
dataset[dataset['ejection_fraction']>=70]

In [None]:
dataset = dataset[dataset['ejection_fraction']<70]

In [None]:
sns.boxplot(x=dataset.time)
plt.show()

In [None]:
# No outliers in time.

LET'S GET SOME INSIGHTS INTO THE DATASET

In [None]:
# Distribution of Age

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(
    x = dataset['age'],
    xbins=dict( # bins used for histogram
        start=40,
        end=95,
        size=2
    ),
    marker_color='#e8ab60',
    opacity=1
))

fig.update_layout(
    title_text='Age Distribution',
    xaxis_title_text='Age',
    yaxis_title_text='Count', 
    bargap=0.05, # gap between bars of adjacent location coordinates
    plot_bgcolor='#000000',
    xaxis =  {'showgrid': False },
    yaxis = {'showgrid': False }
)

fig.show()

In [None]:
# Hover over the graph to get the count of people of different age groups 

In [None]:
# Now lets categorize the above histogram by DEATH_EVENT

import plotly.express as px
fig = px.histogram(dataset, x="age", color="DEATH_EVENT", marginal="violin", hover_data=dataset.columns)
fig.show()

In [None]:
# Wider sections of the violin plot represent a higher probability of observations taking a given value, the thinner sections
# correspond to a lower probability and the value of probability is given by kde value for given x

In [None]:
# Similarly lets get insights of other features as well

In [None]:
# Distribution of creatinine_phosphokinase

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(
    x = dataset['creatinine_phosphokinase'],
    xbins=dict( # bins used for histogram
        start=23,
        end=582,
        size=15
    ),
    marker_color='#FE6F5E',
    opacity=1
))

fig.update_layout(
    title_text='Creatinine Phosphokinase Distribution',
    xaxis_title_text='Creatinine Phosphokinase',
    yaxis_title_text='Count', 
    bargap=0.05, # gap between bars of adjacent location coordinates
    plot_bgcolor='#000000',
    xaxis =  {'showgrid': False },
    yaxis = {'showgrid': False }
)

fig.show()

In [None]:
# Hover over the graph to get the count of people having creatinine phosphokinase levels at same range

In [None]:
# Now lets categorize the above histogram by DEATH_EVENT

import plotly.express as px
fig = px.histogram(dataset, x="creatinine_phosphokinase", color="DEATH_EVENT", marginal="violin", hover_data=dataset.columns)
fig.show()

In [None]:
# Distribution of ejection_fraction

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(
    x = dataset['ejection_fraction'],
    xbins=dict( # bins used for histogram
        start=14,
        end=80,
        size=2
    ),
    marker_color='#A7F432',
    opacity=1
))

fig.update_layout(
    title_text='Ejection Fraction Distribution',
    xaxis_title_text='Ejection Fraction',
    yaxis_title_text='Count', 
    bargap=0.05, # gap between bars of adjacent location coordinates
    plot_bgcolor='#000000',
    xaxis =  {'showgrid': False },
    yaxis = {'showgrid': False }
)

fig.show()

In [None]:
# Now lets categorize the above histogram by DEATH_EVENT

import plotly.express as px
fig = px.histogram(dataset, x="ejection_fraction", color="DEATH_EVENT", marginal="violin", hover_data=dataset.columns)
fig.show()

In [None]:
# Distribution of platelets

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(
    x = dataset['platelets'],
    xbins=dict( # bins used for histogram
        start=25000,
        end=300000,
        size=5000
    ),
    marker_color='#50BFE6',
    opacity=1
))

fig.update_layout(
    title_text='Platelets Distribution',
    xaxis_title_text='Platelets',
    yaxis_title_text='Count', 
    bargap=0.05, # gap between bars of adjacent location coordinates
    plot_bgcolor='#000000',
    xaxis =  {'showgrid': False },
    yaxis = {'showgrid': False }
)

fig.show()

In [None]:
# Now lets categorize the above histogram by DEATH_EVENT

import plotly.express as px
fig = px.histogram(dataset, x="platelets", color="DEATH_EVENT", marginal="violin", hover_data=dataset.columns)
fig.show()

In [None]:
# Distribution of serum_creatinine

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(
    x = dataset['serum_creatinine'],
    xbins=dict( # bins used for histogram
        start=0.5,
        end=9.4,
        size=0.2
    ),
    marker_color='#E77200',
    opacity=1
))

fig.update_layout(
    title_text='Serum Creatinine Distribution',
    xaxis_title_text='Serum Creatinine',
    yaxis_title_text='Count', 
    bargap=0.05, # gap between bars of adjacent location coordinates
    plot_bgcolor='#000000',
    xaxis =  {'showgrid': False },
    yaxis = {'showgrid': False }
)

fig.show()

In [None]:
# Now lets categorize the above histogram by DEATH_EVENT

import plotly.express as px
fig = px.histogram(dataset, x="serum_creatinine", color="DEATH_EVENT", marginal="violin", hover_data=dataset.columns)
fig.show()

In [None]:
# Distribution of serum_sodium

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Histogram(
    x = dataset['serum_sodium'],
    xbins=dict( # bins used for histogram
        start=113,
        end=148,
        size=1
    ),
    marker_color='#AAF0D1',
    opacity=1
))

fig.update_layout(
    title_text='Serum Sodium Distribution',
    xaxis_title_text='Serum Sodium',
    yaxis_title_text='Count', 
    bargap=0.05, # gap between bars of adjacent location coordinates
    plot_bgcolor='#000000',
    xaxis =  {'showgrid': False },
    yaxis = {'showgrid': False }
)

fig.show()

In [None]:
# Now lets categorize the above histogram by DEATH_EVENT

import plotly.express as px
fig = px.histogram(dataset, x="serum_sodium", color="DEATH_EVENT", marginal="violin",hover_data=dataset.columns)
fig.show()

In [None]:
# The following pie chart shows that out of 125 people having diabetes only 32% die while 68% survive.

import plotly.express as px
fig = px.pie(dataset, values='diabetes',names='DEATH_EVENT', title='DIABETES',width=600, height=400)
fig.show()

In [None]:
# The following pie chart shows that out of 95 people who die because of heart failure 42.1% have diabetes and 57.9% do not have diabetes

fig = px.pie(dataset, values='DEATH_EVENT',names='diabetes', title='DIABETES', width=600, height=400)
fig.show()

In [None]:
# The following pie chart shows that out of 128 people having Anaemia, 64.8% survive while 35.2% die

fig = px.pie(dataset, values='anaemia',names='DEATH_EVENT', title='ANAEMIA', width=600, height=400)
fig.show()

In [None]:
# The following pie chart shows that out of 95 people who die due to heart faiure 47.4% are anaemic while 52.6% are non-anaemic

fig = px.pie(dataset, values='DEATH_EVENT',names='anaemia', title='ANAEMIA',
      width=600, height=400)
fig.show()

In [None]:
# The following pie chart shows that out of 104 people who have high BP, only 36.5% die while rest 63.5% survive

fig = px.pie(dataset, values='high_blood_pressure',names='DEATH_EVENT', title='HIGH BlOOD PRESSURE',width=600, height=400)
fig.show()

In [None]:
# Out of 95 people who die because of heart failure, 40% have high BP while 60% do not have high BP

fig = px.pie(dataset, values='DEATH_EVENT',names='high_blood_pressure', title='HIGH BLOOD PRESSURE', width=600, height=400)
fig.show()

In [None]:
# The following pie chart shows that out of 95 people who die due to Heart Failure, 34.7% are women and 65.3% are men.

fig = px.pie(dataset, values='DEATH_EVENT',names='sex', title='GENDER',
      width=600, height=400)
fig.show()

In [None]:
# The following pie chart shows that 96 people who smoke, 31.3% die while 68.8% survive

fig = px.pie(dataset, values='smoking',names='DEATH_EVENT', title='SMOKING',
      width=600, height=400)
fig.show()

In [None]:
# The following pie chart shows that out of 95 people who die due to Heart Failure, 31.6% smoke while 68.4% do not smoke

fig = px.pie(dataset, values='DEATH_EVENT',names='smoking', title='SMOKING',
      width=600, height=400)
fig.show()

In [None]:
# From the above Feature Selection Model we select the following features

Features = ['time','ejection_fraction','serum_creatinine','age']

In [None]:
x = dataset.iloc[:, [0,4,7,11]].values
y = dataset.iloc[:,-1].values

In [None]:
print(x)

In [None]:
print(y)

In [None]:
# Splitting the dataset into training set and test set

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state =0)

In [None]:
print(x_train)

In [None]:
print(y_test)

----

In [None]:
# Feature Scaling
# The scaler is fitted on the training split only and then applied to both splits.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

----

1. LOGISTIC REGRESSION

In [None]:
# Applying logistic regression on the training set

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=10000)
classifier.fit(x_train, y_train)

In [None]:
# Predicting the test set

y_pred = classifier.predict(x_test)

In [None]:
# Making Confusion Matrix and calculating accuracy score

mylist = []
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
mylist.append(ac)
print(cm)
print(ac)

----

2. K NEAREST NEIGHBOR

In [None]:
# Finding the optimum number of neighbors 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
list1 = []
for neighbors in range(3,10):
    classifier = KNeighborsClassifier(n_neighbors=neighbors, metric='minkowski')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test,y_pred))
plt.plot(list(range(3,10)), list1)
plt.show()

In [None]:
# Training the K Nearest Neighbor Classifier on the Training set

classifier = KNeighborsClassifier(n_neighbors=6)
classifier.fit(x_train, y_train)

In [None]:
# Predicting the Test set results

y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Making the confusion matrix and calculating accuracy score

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
mylist.append(ac)
print(cm)
print(ac)

----

3. SUPPORT VECTOR CLASSIFIER

In [None]:
# Training the Support Vector Classifier on the Training set

from sklearn.svm import SVC
classifier = SVC(random_state=0, kernel = 'rbf')
classifier.fit(x_train, y_train)

In [None]:
# Predicting the test set results

y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Making the confusion matrix and calculating accuracy score

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm)
print(ac)
mylist.append(ac)

-----

4. DECISION TREE CLASSIFIER

In [None]:
# Finding the optimum number of max_leaf_nodes

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
list1 = []
for leaves in range(2,10):
    classifier = DecisionTreeClassifier(max_leaf_nodes = leaves, random_state=0, criterion='entropy')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test,y_pred))
#print(mylist)
plt.plot(list(range(2,10)), list1)
plt.show()

In [None]:
# Training the Decision Tree Classifier on the Training set

classifier = DecisionTreeClassifier(max_leaf_nodes = 3, random_state=0, criterion='entropy')
classifier.fit(x_train, y_train)

In [None]:
# Predicting the test set results

y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Making the confusion matrix and calculating accuracy score

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm)
print(ac)
mylist.append(ac)

-----

5. RANDOM FOREST CLASSIFICATION

In [None]:
#Finding the optimum number of n_estimators

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
list1 = []
for estimators in range(10,30):
    classifier = RandomForestClassifier(n_estimators = estimators, random_state=0, criterion='gini')
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test,y_pred))
#print(mylist)
plt.plot(list(range(10,30)), list1)
plt.show()

In [None]:
# Training the RandomForest Classifier on the Training set

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 17, criterion='gini', random_state=0)
classifier.fit(x_train,y_train)

In [None]:
# Predicting the test set results

y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Making the confusion matrix and calculating the accuracy score

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
mylist.append(ac)
print(cm)
print(ac)

----

6. ANN

In [None]:
np.random.seed(0)
import tensorflow as tf

# Initialising the ANN

ann = tf.keras.models.Sequential()

In [None]:
# Adding the input layer and the first hidden layer

ann.add(tf.keras.layers.Dense(units = 7, activation = 'relu'))

In [None]:
# Adding the second hidden layer

ann.add(tf.keras.layers.Dense(units = 7, activation = 'relu'))

In [None]:
# Adding the third hidden layer

ann.add(tf.keras.layers.Dense(units = 7, activation = 'relu'))

In [None]:
# Adding the fourth hidden layer

ann.add(tf.keras.layers.Dense(units = 7, activation = 'relu'))

In [None]:
# Adding the output layer

ann.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

In [None]:
# Compiling the ANN

ann.compile(optimizer = 'adam', loss = 'binary_crossentropy' , metrics = ['accuracy'] )

In [None]:
# Training the ANN on the training set

ann.fit(x_train, y_train, batch_size = 32, epochs = 100)

In [None]:
# Predicting the test set results

y_pred = ann.predict(x_test)
y_pred = (y_pred > 0.5)
np.set_printoptions()
print(np.concatenate( (y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)) 

In [None]:
# Making the confusion matrix, calculating accuracy_score 

from sklearn.metrics import confusion_matrix, accuracy_score

# confusion matrix
cm = confusion_matrix(y_test,y_pred)
print("Confusion Matrix")
print(cm)
print()

# accuracy
ac = accuracy_score(y_test,y_pred)
print("Accuracy")
print(ac)
mylist.append(ac)

-----

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
list1 = []
for estimators in range(10,30,1):
    classifier = XGBClassifier(n_estimators = estimators, max_depth=12, subsample=0.7)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test,y_pred))
#print(mylist)
plt.plot(list(range(10,30,1)), list1)
plt.show()

In [None]:
from xgboost import XGBClassifier

classifier = XGBClassifier(n_estimators=14, max_depth=12, subsample=0.7)

classifier.fit(x_train,y_train)

In [None]:
y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Making the confusion matrix and calculating the accuracy score

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
mylist.append(ac)
print(cm)
print(ac)

In [None]:
# Plotting accuracy score of different models
mylist

In [None]:
mylist2 = ["Logistic Regression", "KNearestNeighbours","SupportVector","DecisionTree","RandomForest","ANN","XGBoost"]

In [None]:
plt.rcParams['figure.figsize']=15,6 
sns.set_style("darkgrid")
ax = sns.barplot(x=mylist2, y=mylist, palette = "rocket", saturation =1.5)
plt.xlabel("Classifier Models", fontsize = 20 )
plt.ylabel("% of Accuracy", fontsize = 20)
plt.title("Accuracy of different Classifier Models", fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy() 
    ax.annotate(f'{height:.2%}', (x + width/2, y + height*1.02), ha='center', fontsize = 'x-large')
plt.show()