In [5]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv("heart_failure_clinical_records_dataset.csv")
# Dividing features into Numerical and Categorical :
df.head(10)
col = list(df.columns)
categorical_features = []
numerical_features = []
for i in col:
    if len(df[i].unique()) > 6:
        numerical_features.append(i)
    else:
        categorical_features.append(i)

print('Categorical Features :',*categorical_features)
print('Numerical Features :',*numerical_features)

# Categorical Features :

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df1 = df.copy(deep = True)

df1['anaemia'] = le.fit_transform(df1['anaemia'])
df1['diabetes'] = le.fit_transform(df1['diabetes'])
df1['high_blood_pressure'] = le.fit_transform(df1['high_blood_pressure'])
df1['sex'] = le.fit_transform(df1['sex'])
df1['smoking'] = le.fit_transform(df1['smoking'])


# Data Scaling:

from sklearn.preprocessing import MinMaxScaler,StandardScaler
mms = MinMaxScaler() # Normalization
ss = StandardScaler() # Standardization

df1['age'] = ss.fit_transform(df1[['age']])
df1['creatinine_phosphokinase'] = ss.fit_transform(df1[['creatinine_phosphokinase']])
df1['ejection_fraction'] = ss.fit_transform(df1[['ejection_fraction']])
df1['platelets'] = ss.fit_transform(df1[['platelets']])
df1['serum_creatinine'] = ss.fit_transform(df1[['serum_creatinine']])
df1['serum_sodium'] = ss.fit_transform(df1[['serum_sodium']])
df1['time'] = ss.fit_transform(df1[['time']])
df1.head()

Categorical Features : anaemia diabetes high_blood_pressure sex smoking DEATH_EVENT
Numerical Features : age creatinine_phosphokinase ejection_fraction platelets serum_creatinine serum_sodium time


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,1.192945,0,0.000166,0,-1.53056,1,0.01681648,0.490057,-1.504036,1,0,-1.629502,1
1,-0.491279,0,7.51464,0,-0.007077,0,7.53566e-09,-0.284552,-0.141976,1,0,-1.603691,1
2,0.350833,0,-0.449939,0,-1.53056,0,-1.038073,-0.0909,-1.731046,1,1,-1.590785,1
3,-0.912335,1,-0.486071,0,-1.53056,0,-0.5464741,0.490057,0.085034,1,0,-1.590785,1
4,0.350833,1,-0.435486,1,-1.53056,0,0.6517986,1.264666,-4.682176,0,0,-1.577879,1


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_curve
features = df1[df1.columns.drop(['DEATH_EVENT','smoking','sex','diabetes','platelets','creatinine_phosphokinase'])].values
target = df1['DEATH_EVENT'].values
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state = 2)
df1 = df.copy(deep = True)
def model(classifier):
    
    classifier.fit(x_train,y_train)
    prediction = classifier.predict(x_test)
    cv = RepeatedStratifiedKFold(n_splits = 10,n_repeats = 3,random_state = 1)
    print("Accuracy : ",'{0:.2%}'.format(accuracy_score(y_test,prediction)))
    print("Cross Validation Score : ",'{0:.2%}'.format(cross_val_score(classifier,x_train,y_train,cv = cv,scoring = 'roc_auc').mean()))
    print("ROC_AUC Score : ",'{0:.2%}'.format(roc_auc_score(y_test,prediction)))
    plt.show()

def model_evaluation(classifier):
    
    # Confusion Matrix
    cm = confusion_matrix(y_test,classifier.predict(x_test))
    names = ['True Neg','False Pos','False Neg','True Pos']
    counts = [value for value in cm.flatten()]
    percentages = ['{0:.2%}'.format(value) for value in cm.flatten()/np.sum(cm)]
    labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
    labels = np.asarray(labels).reshape(2,2)
    sns.heatmap(cm,annot = labels,cmap = colors,fmt ='')
    
    # Classification Report
    print(classification_report(y_test,classifier.predict(x_test)))

#1]Logical Regression

from sklearn.linear_model import LogisticRegression
classifier_lr = LogisticRegression(random_state = 0,C=10,penalty= 'l2')
model(classifier_lr)

#2] Support Vector Classifier 

from sklearn.svm import SVC
classifier_svc = SVC(kernel = 'linear',C = 0.1)
model(classifier_svc)

# 3] Decision Tree Classifier :

from sklearn.tree import DecisionTreeClassifier
classifier_dt = DecisionTreeClassifier(random_state = 1000,max_depth = 4,min_samples_leaf = 1)
model(classifier_dt)

# 4] Random Forest Classifier :

from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(max_depth = 4,random_state = 0)
model(classifier_rf)

# 5] K-nearest Neighbors Classifier :

from sklearn.neighbors import KNeighborsClassifier
classifier_knn = KNeighborsClassifier(leaf_size = 1, n_neighbors = 3,p = 1)
model(classifier_knn)
import joblib
model = joblib.dump(model, 'heart_failure.sav')



Accuracy :  88.33%
Cross Validation Score :  87.25%
ROC_AUC Score :  82.97%
Accuracy :  91.67%
Cross Validation Score :  86.75%
ROC_AUC Score :  87.07%
Accuracy :  88.33%
Cross Validation Score :  84.11%
ROC_AUC Score :  82.97%
Accuracy :  93.33%
Cross Validation Score :  90.00%
ROC_AUC Score :  90.01%
Accuracy :  80.00%
Cross Validation Score :  81.18%
ROC_AUC Score :  71.82%


In [3]:
import joblib

# Train classifiers
classifiers = {
    'logistic_regression': LogisticRegression(random_state=0, C=10, penalty='l2').fit(x_train, y_train),
    'support_vector_classifier': SVC(kernel='linear', C=0.1).fit(x_train, y_train),
    'decision_tree_classifier': DecisionTreeClassifier(random_state=1000, max_depth=4, min_samples_leaf=1).fit(x_train, y_train),
    'random_forest_classifier': RandomForestClassifier(max_depth=4, random_state=0).fit(x_train, y_train),
    'k_nearest_neighbors_classifier': KNeighborsClassifier(leaf_size=1, n_neighbors=3, p=1).fit(x_train, y_train)
}

# Save classifiers to a single file
joblib.dump(classifiers, 'heart_failure_models.sav')


['heart_failure_models.sav']

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import joblib
import pickle

# Load dataset
df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

# Encode categorical features
le = LabelEncoder()
df['anaemia'] = le.fit_transform(df['anaemia'])
df['diabetes'] = le.fit_transform(df['diabetes'])
df['high_blood_pressure'] = le.fit_transform(df['high_blood_pressure'])
df['sex'] = le.fit_transform(df['sex'])
df['smoking'] = le.fit_transform(df['smoking'])

# Scale numerical features
ss = StandardScaler()
df[['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time']] = ss.fit_transform(df[['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time']])

# Split data into features and target
features = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'serum_creatinine', 'serum_sodium', 'time']]
target = df['DEATH_EVENT']

# Split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=2)

# Train classifiers
classifiers = {
    'logistic_regression': LogisticRegression(random_state=0, C=10, penalty='l2'),
    'support_vector_classifier': SVC(kernel='linear', C=0.1),
    'decision_tree_classifier': DecisionTreeClassifier(random_state=1000, max_depth=4, min_samples_leaf=1),
    'random_forest_classifier': RandomForestClassifier(max_depth=4, random_state=0),
    'k_nearest_neighbors_classifier': KNeighborsClassifier(leaf_size=1, n_neighbors=3, p=1)
}

# Train and evaluate models
for name, classifier in classifiers.items():
    classifier.fit(x_train, y_train)
    prediction = classifier.predict(x_test)
    print(f"Classifier: {name}")
    print("Accuracy:", '{0:.2%}'.format(accuracy_score(y_test, prediction)))
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    print("Cross Validation Score:", '{0:.2%}'.format(cross_val_score(classifier, x_train, y_train, cv=cv, scoring='roc_auc').mean()))
    print("ROC_AUC Score:", '{0:.2%}'.format(roc_auc_score(y_test, prediction)))
    print(classification_report(y_test, prediction))
    print()

# Save trained models into a dictionary
models_dict = {name: classifier for name, classifier in classifiers.items()}

# Save the dictionary containing trained models to a .pkl file
with open('heart_failure_models.pkl', 'wb') as file:
    pickle.dump(models_dict, file)

print("Models saved successfully.")


Classifier: logistic_regression
Accuracy: 86.67%
Cross Validation Score: 86.31%
ROC_AUC Score: 81.81%
              precision    recall  f1-score   support

           0       0.89      0.93      0.91        43
           1       0.80      0.71      0.75        17

    accuracy                           0.87        60
   macro avg       0.84      0.82      0.83        60
weighted avg       0.86      0.87      0.86        60


Classifier: support_vector_classifier
Accuracy: 88.33%
Cross Validation Score: 87.22%
ROC_AUC Score: 84.75%
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        43
           1       0.81      0.76      0.79        17

    accuracy                           0.88        60
   macro avg       0.86      0.85      0.85        60
weighted avg       0.88      0.88      0.88        60


Classifier: decision_tree_classifier
Accuracy: 88.33%
Cross Validation Score: 83.46%
ROC_AUC Score: 82.97%
              precision    