# Heart Disease Prediction Project

+ This project predicts the ten-year risk of coronary heart disease (CHD), using health and socioeconomic data as inputs.

In [1]:
import pandas as pd
import numpy as np

# Cast columns to the correct datatypes while reading the CSV.
# np.int64 is used for "age": the old alias np.int0 was deprecated and is no
# longer available in NumPy 2.0, so using it would break this cell there.
data_types = {
    "male": np.bool_,
    "age": np.int64,
    "currentSmoker": np.bool_,
    "TenYearCHD": np.bool_,
    "prevalentHyp": np.bool_,
}
main_df = pd.read_csv("dataset.csv", dtype=data_types)

main_df

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,True,39,4.0,False,0.0,0.0,0,False,0,195.0,106.0,70.0,26.97,80.0,77.0,False
1,False,46,2.0,False,0.0,0.0,0,False,0,250.0,121.0,81.0,28.73,95.0,76.0,False
2,True,48,1.0,True,20.0,0.0,0,False,0,245.0,127.5,80.0,25.34,75.0,70.0,False
3,False,61,3.0,True,30.0,0.0,0,True,0,225.0,150.0,95.0,28.58,65.0,103.0,True
4,False,46,3.0,True,23.0,0.0,0,False,0,285.0,130.0,84.0,23.10,85.0,85.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,True,50,1.0,True,1.0,0.0,0,True,0,313.0,179.0,92.0,25.97,66.0,86.0,True
4234,True,51,3.0,True,43.0,0.0,0,False,0,207.0,126.5,80.0,19.71,65.0,68.0,False
4235,False,48,2.0,True,20.0,,0,False,0,248.0,131.0,72.0,22.00,84.0,86.0,False
4236,False,44,1.0,True,15.0,0.0,0,False,0,210.0,126.5,87.0,19.16,86.0,,False


---
## EDA and Statistics
+ We will use the pandas-profiling library to generate an exploratory HTML report of the dataset.

In [7]:
!pip install pandas-profiling
from pandas_profiling import ProfileReport

profile = ProfileReport(main_df, title="Profiling Report", explorative=True)
profile.to_file("EDAreport.html")




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

***

## Data Pre-processing

### Missing Data Imputation

In [25]:
from sklearn.impute import KNNImputer

# Separate out the target attribute from the rest of the attributes.
# The guard keeps this cell re-runnable: pop() removes the column from main_df,
# so an unguarded second execution would raise KeyError.
if "TenYearCHD" in main_df.columns:
    ten_year_chd = main_df.pop("TenYearCHD").astype("int")

# Imputation of the attributes that have missing values.
# Glucose imputation - we use the 3 features most correlated with glucose
# and fill the gaps with the kNN method.
gluc_temp_df = main_df[["glucose", "sysBP", "age", "diabetes"]]
imputed_values = KNNImputer(n_neighbors=5, weights="uniform").fit_transform(gluc_temp_df)
main_df["glucose"] = imputed_values[:, 0]  # column 0 of the result is glucose

# For the rest of the missing values, since the percentage missing is
# insignificant, we just run the kNN imputer over the whole feature frame.
# fit_transform returns a bare array, so the labels are restored explicitly.
new_df = pd.DataFrame(
    KNNImputer(n_neighbors=5, weights="uniform").fit_transform(main_df),
    columns=main_df.columns,
    index=main_df.index,
)
main_df = new_df
main_df.index







RangeIndex(start=0, stop=4238, step=1)

In [8]:
# Profile the frame again after pre-processing and save the report as a
# separate HTML file.
profile1 = ProfileReport(
    main_df,
    title="Post Processing",
    explorative=True,
)
profile1.to_file("PostProcessing.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

***
## Modelling (SVM and Logistic)
### 1. Logistic Regression Model

In [46]:
"""We will be using k-fold cross-validation instead of single hold-out(train-test split)"""
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic regression classifier; the seed is fixed so repeated runs match.
logisitic_model = LogisticRegression(
    C=3.0,
    solver='liblinear',
    random_state=0,
)

# One score per fold (15 folds); the cell displays their mean.
logistic_fold_scores = cross_val_score(logisitic_model, main_df, ten_year_chd, cv=15)
logistic_fold_scores.mean()

0.8532324219565782

### 2. Support Vector Machine (SVM) Model

In [45]:
from sklearn import svm

# Support vector classifier with the library's default settings.
# NOTE(review): the features are passed in unscaled; kernel SVMs are usually
# sensitive to feature scale - consider standardizing the inputs.
svm_model = svm.SVC()

# Quick score of model performance: one score per fold (15 folds), then the mean.
svm_fold_scores = cross_val_score(svm_model, main_df, ten_year_chd, cv=15)
svm_fold_scores.mean()


0.8480415006390499

***
## Cross-validation and Evaluation

### 1. Cross-validation confusion matrix generation

In [74]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

# We have to generate the confusion matrix manually for each cross-validation fold, then average the results
def get_confusion_matrix(model, X=None, y=None, n_splits=7, random_state=0):
    """Return the confusion matrix of ``model`` averaged over stratified folds.

    The data is split with a shuffled ``StratifiedKFold``; for every fold the
    model is fitted on the training part and a confusion matrix is computed on
    the held-out part. The per-fold matrices are averaged element-wise.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place, so after the call it holds the fit from the
        last fold.
    X : pandas DataFrame, optional
        Feature frame (indexed with ``.iloc``). Defaults to the notebook-level
        ``main_df``.
    y : pandas Series, optional
        Target labels (indexed with ``.iloc``). Defaults to the notebook-level
        ``ten_year_chd``.
    n_splits : int, default 7
        Number of stratified folds.
    random_state : int, default 0
        Seed for the fold shuffling.

    Returns
    -------
    numpy.ndarray
        Mean confusion matrix, rounded to 4 decimals. Rows are true classes and
        columns are predicted classes, both in sorted label order.
    """
    features = main_df if X is None else X
    target = ten_year_chd if y is None else y

    # Fix the label set up front so every fold yields a matrix of the same
    # shape, even if a class is absent from a fold's truth and predictions.
    class_labels = np.unique(target)

    splitter = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    fold_matrices = []
    for train_index, test_index in splitter.split(features, target):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        model.fit(X_train, y_train)
        fold_matrices.append(
            confusion_matrix(y_test, model.predict(X_test), labels=class_labels)
        )
    return np.round(np.mean(fold_matrices, axis=0), 4)
    
# Averaged cross-validation confusion matrix for each model, keyed by a short
# model name (logistic first, then svm).
confusion_matrix_dict = {
    model_name: get_confusion_matrix(estimator)
    for model_name, estimator in (
        ("logistic", logisitic_model),
        ("svm", svm_model),
    )
}




### 2. Model Evaluation

In [75]:
# Derive summary metrics from each model's averaged confusion matrix, store
# them in evaluation_dict (keyed by model name) and print them.
evaluation_dict = {}
for key, val in confusion_matrix_dict.items():
    # Flattening a 2x2 matrix row by row gives: true negatives, false positives,
    # false negatives, true positives.
    tn, fp, fn, tp = val.flatten()
    accuracy = (tp + tn) / (tp + tn + fn + fp)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    # Keep the numbers so they can be reused after this cell; previously the
    # dictionary was created but never filled.
    evaluation_dict[key] = {
        "precision": precision,
        "accuracy": accuracy,
        "sensitivity": sensitivity,
        "specificity": specificity,
    }
    print(f"{key}: [precision: {precision}] [accuracy: {accuracy}] [sensitivity: {sensitivity}] [specificity: {specificity}]" )



logistic: [precision: 0.6515177226735677] [accuracy: 0.8527608375289835] [sensitivity: 0.06677065217391305] [specificity: 0.993600473366696]
svm: [precision: 0.5714] [accuracy: 0.8482774021577443] [sensitivity: 0.006210869565217392] [specificity: 0.9991652198572499]
