<a href="https://colab.research.google.com/github/AdrieonK/Bioinformatics-Data-Science/blob/main/heart_failure_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Goal: Build a binary classification model to predict patient survival from all or a selected subset of features**

In [None]:
#imports

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

## **Dataset**

In [None]:
#unzip dataset file

!unzip /content/heart+failure+clinical+records.zip

Archive:  /content/heart+failure+clinical+records.zip
 extracting: heart_failure_clinical_records_dataset.csv  


In [None]:
# Load the extracted CSV into a DataFrame and preview its first rows.
heart_failure_data = pd.read_csv(
    '/content/heart_failure_clinical_records_dataset.csv'
)

heart_failure_data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## **Process Data**

In [None]:
# List the column names of the loaded frame.
heart_failure_data.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [None]:
# Print the dtype of every column to check that all of them are numeric.
print(heart_failure_data.dtypes)

age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
heart_failure_data.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [None]:
# Count missing values per column. The per-column totals answer "is anything
# missing?" directly, whereas the full boolean frame is truncated on display
# and cannot be checked by eye.
heart_failure_data.isnull().sum()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,False,False,False,False,False,False,False,False,False,False,False,False,False
295,False,False,False,False,False,False,False,False,False,False,False,False,False
296,False,False,False,False,False,False,False,False,False,False,False,False,False
297,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Standardize the selected continuous columns in place (StandardScaler's
# default: zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row, before the train/validation
# split further down, so validation rows contribute to the scaling statistics.
# Consider fitting on the training split only.
continuous_data_columns = ['age', 'platelets', 'serum_creatinine']

scaler = StandardScaler().fit(heart_failure_data[continuous_data_columns])

heart_failure_data[continuous_data_columns] = scaler.transform(heart_failure_data[continuous_data_columns])


In [None]:
# Preview the frame after scaling; age, platelets and serum_creatinine now hold standardized values.
heart_failure_data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,1.192945,0,582,0,20,1,0.01681648,0.490057,130,1,0,4,1
1,-0.491279,0,7861,0,38,0,7.53566e-09,-0.284552,136,1,0,6,1
2,0.350833,0,146,0,20,0,-1.038073,-0.0909,129,1,1,7,1
3,-0.912335,1,111,0,20,0,-0.5464741,0.490057,137,1,0,7,1
4,0.350833,1,160,1,20,0,0.6517986,1.264666,116,0,0,8,1


## **Train/Test Split**

In [None]:
# Model inputs: every column of the frame except the DEATH_EVENT target.
# NOTE(review): `time` looks like a follow-up duration; if it is only known
# after the outcome it leaks the label. Confirm (a later run in this notebook
# drops it).
heart_failure_data_features = [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]

# Feature matrix and 0/1 target vector.
X = heart_failure_data[heart_failure_data_features]

y = heart_failure_data['DEATH_EVENT']

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Fit a random forest on the training split and score it on the validation split.
classifier = RandomForestClassifier(random_state=1)

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_val)

print("Accuracy Score", accuracy_score(y_val, y_pred))
# Print the report on its own lines: putting the label on the same line as the
# report shifts the header row out of alignment with the table columns.
print("Classification Report")
print(classification_report(y_val, y_pred))

Accuracy Score 0.9333333333333333
Classification Report               precision    recall  f1-score   support

           0       0.94      0.98      0.96        46
           1       0.92      0.79      0.85        14

    accuracy                           0.93        60
   macro avg       0.93      0.88      0.90        60
weighted avg       0.93      0.93      0.93        60



In [None]:
#imports

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

## **Dataset**

In [None]:
#unzip dataset file

!unzip /content/heart+failure+clinical+records.zip

Archive:  /content/heart+failure+clinical+records.zip
 extracting: heart_failure_clinical_records_dataset.csv  


In [None]:
# Re-read the CSV so the columns that the earlier scaling cell overwrote in
# place start from their raw values again, then preview the first rows.
heart_failure_data = pd.read_csv(
    '/content/heart_failure_clinical_records_dataset.csv'
)

heart_failure_data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## **Process Data**

In [None]:
# List the column names of the reloaded frame.
heart_failure_data.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [None]:
# Print the dtype of every column to check that all of them are numeric.
print(heart_failure_data.dtypes)

age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
heart_failure_data.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [None]:
# Count missing values per column. The per-column totals answer "is anything
# missing?" directly, whereas the full boolean frame is truncated on display
# and cannot be checked by eye.
heart_failure_data.isnull().sum()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,False,False,False,False,False,False,False,False,False,False,False,False,False
295,False,False,False,False,False,False,False,False,False,False,False,False,False
296,False,False,False,False,False,False,False,False,False,False,False,False,False
297,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Standardize the selected continuous columns in place (StandardScaler's
# default: zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row, before the train/validation
# split further down, so validation rows contribute to the scaling statistics.
# Consider fitting on the training split only.
continuous_data_columns = ['age', 'platelets', 'serum_creatinine']

scaler = StandardScaler().fit(heart_failure_data[continuous_data_columns])

heart_failure_data[continuous_data_columns] = scaler.transform(heart_failure_data[continuous_data_columns])


In [None]:
# Preview the frame after scaling; age, platelets and serum_creatinine now hold standardized values.
heart_failure_data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,1.192945,0,582,0,20,1,0.01681648,0.490057,130,1,0,4,1
1,-0.491279,0,7861,0,38,0,7.53566e-09,-0.284552,136,1,0,6,1
2,0.350833,0,146,0,20,0,-1.038073,-0.0909,129,1,1,7,1
3,-0.912335,1,111,0,20,0,-0.5464741,0.490057,137,1,0,7,1
4,0.350833,1,160,1,20,0,0.6517986,1.264666,116,0,0,8,1


## **Train/Test Split**

In [None]:
# Model inputs for this run: the same columns as the first run, minus `time`.
# NOTE(review): presumably `time` is dropped because it is a follow-up
# duration that would not be known at prediction time; confirm.
heart_failure_data_features = [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
]

# Feature matrix and 0/1 target vector.
X = heart_failure_data[heart_failure_data_features]

y = heart_failure_data['DEATH_EVENT']

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Fit a random forest on the training split and score it on the validation split.
# class_weight='balanced' reweights classes inversely to their frequency in y_train.
classifier = RandomForestClassifier(random_state=1, class_weight='balanced')

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_val)

print("Accuracy Score", accuracy_score(y_val, y_pred))
# Print the report on its own lines: putting the label on the same line as the
# report shifts the header row out of alignment with the table columns.
print("Classification Report")
print(classification_report(y_val, y_pred))

Accuracy Score 0.8
Classification Report               precision    recall  f1-score   support

           0       0.85      0.89      0.87        46
           1       0.58      0.50      0.54        14

    accuracy                           0.80        60
   macro avg       0.72      0.70      0.71        60
weighted avg       0.79      0.80      0.79        60



In [None]:
# Rank the input columns by the importance the fitted forest assigned to them.
feature_names = X.columns
feature_importances = classifier.feature_importances_

importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)
importance_df = importance_df.sort_values('Importance', ascending=False)
print(importance_df)

                     Feature  Importance
7           serum_creatinine    0.205380
4          ejection_fraction    0.159071
2   creatinine_phosphokinase    0.143876
6                  platelets    0.131564
0                        age    0.128422
8               serum_sodium    0.122773
9                        sex    0.023403
1                    anaemia    0.022811
3                   diabetes    0.022386
5        high_blood_pressure    0.021105
10                   smoking    0.019209


In [None]:
# Restrict the model inputs to two columns: ejection_fraction and serum_creatinine.
heart_failure_data_features = ['ejection_fraction', 'serum_creatinine']

# Feature matrix and 0/1 target vector.
X = heart_failure_data[heart_failure_data_features]

y = heart_failure_data['DEATH_EVENT']

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
# NOTE(review): the earlier runs in this notebook hold out 20%, so these
# metrics are computed on a different validation set than theirs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Fit a random forest on the two-feature training split and score it on the validation split.
classifier = RandomForestClassifier(random_state=1)

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_val)

print("Accuracy Score", accuracy_score(y_val, y_pred))
# Print the report on its own lines: putting the label on the same line as the
# report shifts the header row out of alignment with the table columns.
print("Classification Report")
print(classification_report(y_val, y_pred))

Accuracy Score 0.7666666666666667
Classification Report               precision    recall  f1-score   support

           0       0.83      0.84      0.84        64
           1       0.60      0.58      0.59        26

    accuracy                           0.77        90
   macro avg       0.72      0.71      0.71        90
weighted avg       0.76      0.77      0.77        90



In [None]:
# Rank the two input columns by the importance the fitted forest assigned to them.
feature_names = X.columns
feature_importances = classifier.feature_importances_

importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)
importance_df = importance_df.sort_values('Importance', ascending=False)
print(importance_df)

             Feature  Importance
1   serum_creatinine    0.588897
0  ejection_fraction    0.411103
