In [14]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model  import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score


import pickle



In [15]:
# Read the pre-cleaned dataset, then work on an independent copy so the
# frame that was loaded from disk is never modified by later cells.
data = pd.read_csv("cleaned_data.csv")
df = data.copy(deep=True)

In [16]:
# First-level split: pull out the target column, then keep every other
# column as the feature matrix.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

In [17]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,0,0,140,289,0,0,172,0,0,0
1,49,1,1,160,180,0,0,156,0,1,1
2,37,0,0,130,283,0,1,98,0,0,0
3,48,1,2,138,214,0,0,108,1,1,1
4,54,0,1,150,195,0,0,122,0,0,0


In [18]:
# Second-level split: hold out 20% of the rows for testing, stratified on the
# target, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

# Model Building

### Logistic Regression

In [19]:
# Candidate optimisation solvers. A list (not a set) keeps the evaluation
# order - and therefore tie-breaking between equally good solvers -
# identical on every run.
solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

# Choose the solver by 5-fold cross-validated accuracy on the training split
# only. The held-out test split is not used for model selection, so the score
# printed below is not inflated by having picked the solver on the same data.
solver_search = GridSearchCV(
    LogisticRegression(),
    {'solver': solver},
    scoring='accuracy',
    cv=5,
)
solver_search.fit(X_train, y_train)

best_solver = solver_search.best_params_['solver']
# Mean cross-validated accuracy per candidate, in the same order as `solver`.
test_score = solver_search.cv_results_['mean_test_score']

# best_estimator_ has already been refit on the whole training split with
# the winning solver, so no second fit is needed.
LoRe = solver_search.best_estimator_
ypred = LoRe.predict(X_test)

print(f'Logistic Regression Score : {accuracy_score(y_test, ypred)}')

Logistic Regression Score : 0.8532608695652174


### Support Vector Machine

In [20]:
# Kernel name -> mean cross-validated weighted F1 (filled in below).
kernels = {'linear': 0, 'poly': 0, 'rbf': 0, 'sigmoid': 0}

# Choose the kernel by 5-fold cross-validated weighted F1 on the training
# split only, so the test split stays untouched until the final evaluation.
kernel_search = GridSearchCV(
    SVC(),
    {'kernel': list(kernels)},
    scoring='f1_weighted',
    cv=5,
)
kernel_search.fit(X_train, y_train)

# Record the per-kernel cross-validation score for later inspection.
for candidate, mean_f1 in zip(kernel_search.cv_results_['params'],
                              kernel_search.cv_results_['mean_test_score']):
    kernels[candidate['kernel']] = mean_f1

best_kernel = kernel_search.best_params_['kernel']
best = best_kernel  # original variable name, kept in case other cells read it

# best_estimator_ has already been refit on the whole training split.
svm = kernel_search.best_estimator_
svm_pred = svm.predict(X_test)

# Compute the score outside the f-string: reusing single quotes inside a
# single-quoted f-string is a SyntaxError on Python versions before 3.12.
svm_f1 = f1_score(y_test, svm_pred, average='weighted')
print(f' {best.title()} SVM F1 Score: {svm_f1}')

 Linear SVM F1 Score: 0.8691889632107023


### Decision Tree

In [21]:
# Hyper-parameter candidates explored exhaustively for the tree; note that
# the seed (random_state) is itself one of the searched dimensions.
parameter_grid = dict(
    max_depth=list(range(3, 8)),
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 3, 4],
    random_state=[0, 3],
)
decision = DecisionTreeClassifier(class_weight='balanced')

# Exhaustive 3-fold cross-validated search over the grid above.
grid_search = GridSearchCV(estimator=decision, param_grid=parameter_grid, cv=3)
grid_search.fit(X_train, y_train)

best_decision_tree = grid_search.best_estimator_
decision_pred = best_decision_tree.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the second class label.
proba_col1 = decision_pred[:, 1]
decision_auc = roc_auc_score(y_test, proba_col1)
print(f"Decision Tree's AUC: {decision_auc}")

Decision Tree's AUC: 0.9068627450980392


### Random Forest Classifier

In [22]:
# Seed the forest itself: bootstrap sampling and feature sub-sampling are
# random, so without a fixed random_state the search winner, the report
# below and the pickled model would differ on every run.
random_forest = RandomForestClassifier(random_state=6)
params_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [1, 5, 10, 20],
    'max_leaf_nodes': [2, 5, 10],
}

# Randomised search over the grid; random_state fixes which parameter
# combinations are sampled, n_jobs=-1 uses every available core.
random_search = RandomizedSearchCV(
    random_forest,
    param_distributions=params_grid,
    random_state=6,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# best_estimator_ is refit on the full training split with the best sampled parameters.
best_search = random_search.best_estimator_
rf_pred = best_search.predict(X_test)
print(f'Classification Report on Random Forest: \n{classification_report(y_test, rf_pred)}')

Classification Report on Random Forest: 
              precision    recall  f1-score   support

         0.0       0.90      0.79      0.84        82
         1.0       0.85      0.93      0.89       102

    accuracy                           0.87       184
   macro avg       0.88      0.86      0.87       184
weighted avg       0.87      0.87      0.87       184



In [30]:
# Preview the first five held-out feature rows.
X_test.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
821,60,1,1,102,318,0,0,160,0,0,0
748,64,0,2,120,246,0,2,96,1,2,2
797,41,0,2,110,172,0,2,158,0,0,0
396,62,1,3,140,210,1,0,143,0,0,1
486,55,0,0,110,214,1,1,180,0,0,0


## Save the Models

In [23]:
# Persist the fitted logistic-regression model. The context manager closes
# (and therefore flushes) the file handle even if pickling raises.
with open('logisticregression.pkl', 'wb') as file:
    pickle.dump(LoRe, file)

In [24]:
# Persist the fitted SVM. The context manager closes (and therefore flushes)
# the file handle even if pickling raises.
with open('svm.pkl', 'wb') as file:
    pickle.dump(svm, file)

In [25]:
# Persist the best decision tree found by the grid search. The context
# manager closes (and therefore flushes) the file handle even if pickling raises.
with open('decisiontree.pkl', 'wb') as file:
    pickle.dump(best_decision_tree, file)

In [26]:
# Persist the best random forest found by the randomised search. The context
# manager closes (and therefore flushes) the file handle even if pickling raises.
rf_model = best_search
with open('randomforest.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

In [29]:
# Export the held-out feature rows, without the index column, to a CSV file.
sample_path = 'sample.csv'
X_test.to_csv(sample_path, index=False)