Model training code

In [73]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import *
from scipy.stats import zscore

In [74]:
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')
print(df.head())

    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  DEATH_EVENT  
0        0     4            1  
1        0     6            1  
2       

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [76]:
df.shape

(299, 13)

Checking for NULLs

In [77]:
df.isnull().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [78]:
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


PCA?

In [79]:
df.corr()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
age,1.0,0.088006,-0.081584,-0.101012,0.060098,0.093289,-0.052354,0.159187,-0.045966,0.06543,0.018668,-0.224068,0.253729
anaemia,0.088006,1.0,-0.190741,-0.012729,0.031557,0.038182,-0.043786,0.052174,0.041882,-0.094769,-0.10729,-0.141414,0.06627
creatinine_phosphokinase,-0.081584,-0.190741,1.0,-0.009639,-0.04408,-0.07059,0.024463,-0.016408,0.05955,0.079791,0.002421,-0.009346,0.062728
diabetes,-0.101012,-0.012729,-0.009639,1.0,-0.00485,-0.012732,0.092193,-0.046975,-0.089551,-0.15773,-0.147173,0.033726,-0.001943
ejection_fraction,0.060098,0.031557,-0.04408,-0.00485,1.0,0.024445,0.072177,-0.011302,0.175902,-0.148386,-0.067315,0.041729,-0.268603
high_blood_pressure,0.093289,0.038182,-0.07059,-0.012732,0.024445,1.0,0.049963,-0.004935,0.037109,-0.104615,-0.055711,-0.196439,0.079351
platelets,-0.052354,-0.043786,0.024463,0.092193,0.072177,0.049963,1.0,-0.041198,0.062125,-0.12512,0.028234,0.010514,-0.049139
serum_creatinine,0.159187,0.052174,-0.016408,-0.046975,-0.011302,-0.004935,-0.041198,1.0,-0.189095,0.00697,-0.027414,-0.149315,0.294278
serum_sodium,-0.045966,0.041882,0.05955,-0.089551,0.175902,0.037109,0.062125,-0.189095,1.0,-0.027566,0.004813,0.08764,-0.195204
sex,0.06543,-0.094769,0.079791,-0.15773,-0.148386,-0.104615,-0.12512,0.00697,-0.027566,1.0,0.445892,-0.015608,-0.004316


Potential models: <br>Logistic regression <br> SVM <br> Decision tree <br> KNN

I will provide the optimal conditions for each model, and then pick the one with the most accurate predictions

In [99]:
x = df.drop(columns="DEATH_EVENT")
y = df["DEATH_EVENT"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Decision Tree model

In [100]:
ADB_DT = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1, random_state=42), random_state=42)
ADB_DT.fit(x_train,y_train)
ADB_DT_pred = ADB_DT.predict(x_test)
print("Decision tree model (with adaBoost) accuracy:", accuracy_score(y_test, ADB_DT_pred))

RF_DT = RandomForestClassifier(max_depth=3, random_state=42, n_estimators=100)
RF_DT.fit(x_train, y_train)
RF_DT_pred = RF_DT.predict(x_test)
print("Decision tree model (with random forest) accuracy:", accuracy_score(y_test, RF_DT_pred))

Decision tree model (with adaBoost) accuracy: 0.7666666666666667
Decision tree model (with random forest) accuracy: 0.75


Logistic Regression model

Pre-processing

Outliers removal

In [82]:
print(df.shape)
z_scores = np.abs(zscore(df.select_dtypes(include=[np.number])))
print(f"# rows with an outlying value: {(z_scores > 3).any(axis=1).sum()}")
final_df = df[(z_scores < 3).all(axis=1)].copy()
print(final_df.shape)

(299, 13)
# rows with an outlying value: 19
(280, 13)


In [83]:
x = final_df.drop(columns="DEATH_EVENT")
y = final_df["DEATH_EVENT"]

In [84]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train_SS = StandardScaler().fit_transform(x_train) 
x_test_SS = StandardScaler().fit_transform(x_test)

In [85]:
LOGR = LogisticRegression(random_state=42)
LOGR.fit(x_train_SS, y_train)
LOGR_pred = LOGR.predict(x_test_SS)
print(f'Logistic regression model accuracy: {accuracy_score(y_test, LOGR_pred)}')

Logistic regression model accuracy: 0.8392857142857143


KNN model

In [86]:
KNN = KNeighborsClassifier(n_neighbors=7)
KNN.fit(x_train_SS, y_train)
KNN_pred = KNN.predict(x_test_SS)
print(f'KNN model accuracy: {accuracy_score(y_test, KNN_pred)}')

KNN model accuracy: 0.7321428571428571


SVM model

In [87]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train_MM = MinMaxScaler().fit_transform(x_train)
x_test_MM = MinMaxScaler().fit_transform(x_test)

In [88]:
BG_SVC = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=42)
BG_SVC.fit(x_train_MM, y_train)
BG_SVC_pred = BG_SVC.predict(x_test_MM)
print(f'SVC accuracy (with bagging): {accuracy_score(y_test, BG_SVC_pred)}')

SVC accuracy (with bagging): 0.7857142857142857


Logistic regression wins (for now)

In [89]:
import pickle

with open("model.pkl", "wb") as f:
    pickle.dump(LOGR, f)

Pick 3 random test samples

In [72]:
indices = [5, 14, 55]
samples = x_test.iloc[indices]
sample_labels = y_test.iloc[indices]

for i, (sample, label) in enumerate(zip(samples.iterrows(), sample_labels)):
    print(f"Sample {i+1}:")
    print(sample[1])
    print("Label:", label)
    print()

Sample 1:
age                             50.0
anaemia                          1.0
creatinine_phosphokinase      1051.0
diabetes                         1.0
ejection_fraction               30.0
high_blood_pressure              0.0
platelets                   232000.0
serum_creatinine                 0.7
serum_sodium                   136.0
sex                              0.0
smoking                          0.0
time                           246.0
Name: 278, dtype: float64
Label: 0

Sample 2:
age                             60.0
anaemia                          1.0
creatinine_phosphokinase       607.0
diabetes                         0.0
ejection_fraction               40.0
high_blood_pressure              0.0
platelets                   216000.0
serum_creatinine                 0.6
serum_sodium                   138.0
sex                              1.0
smoking                          1.0
time                            54.0
Name: 57, dtype: float64
Label: 0

Sample 3:
age        