# Heart Disease Prediction Project
This project predicts heart disease (specifically coronary heart disease, CHD) from health and socioeconomic data.

In [None]:
import pandas as pd
import numpy as np

# Upload data

We will upload our data as a CSV file named "dataset.csv".

This dataset is from an ongoing cardiovascular study on residents of the town of Framingham, Massachusetts.

In [None]:
# Colab-only step: prompt for a file upload. Choose "dataset.csv" so that the
# read_csv call in the next section can find it in the working directory.
# `uploaded` is not referenced again in the cells visible here.
from google.colab import files
uploaded = files.upload()

Saving dataset.csv to dataset.csv


# Casting the correct datatypes

In [None]:
# Parse the flag columns as booleans and age as a 64-bit integer while reading.
# np.int64 replaces the np.int0 alias, which NumPy deprecated in 1.24 and
# removed in 2.0 (the old spelling raises AttributeError on current NumPy).
data_types = {
    "male": np.bool_,
    "age": np.int64,
    "currentSmoker": np.bool_,
    "TenYearCHD": np.bool_,
    "prevalentHyp": np.bool_,
}
main_df = pd.read_csv("dataset.csv", dtype=data_types)
main_df.dtypes

male                  bool
age                  int64
education          float64
currentSmoker         bool
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp          bool
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD            bool
dtype: object

In [None]:
# Preview the first rows of the dataset to sanity-check the parsed values
main_df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
0,1.0,39.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,195.0,106.0,70.0,26.97,80.0,77.0
1,0.0,46.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,250.0,121.0,81.0,28.73,95.0,76.0
2,1.0,48.0,1.0,1.0,20.0,0.0,0.0,0.0,0.0,245.0,127.5,80.0,25.34,75.0,70.0
3,0.0,61.0,3.0,1.0,30.0,0.0,0.0,1.0,0.0,225.0,150.0,95.0,28.58,65.0,103.0
4,0.0,46.0,3.0,1.0,23.0,0.0,0.0,0.0,0.0,285.0,130.0,84.0,23.1,85.0,85.0


The dataset contains patient records: 4,238 rows with 15 predictor attributes plus the target column, `TenYearCHD`.

In [None]:
# Summarise the dataset: column names, non-null counts and dtypes
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   float64
 1   age              4238 non-null   float64
 2   education        4238 non-null   float64
 3   currentSmoker    4238 non-null   float64
 4   cigsPerDay       4238 non-null   float64
 5   BPMeds           4238 non-null   float64
 6   prevalentStroke  4238 non-null   float64
 7   prevalentHyp     4238 non-null   float64
 8   diabetes         4238 non-null   float64
 9   totChol          4238 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4238 non-null   float64
 13  heartRate        4238 non-null   float64
 14  glucose          4238 non-null   float64
dtypes: float64(15)
memory usage: 496.8 KB


# EDA and Statistics
+ The pandas-profiling library is used for exploratory data analysis.
+ The generated report provides an overview of the data, interactions between attributes, missing-value information, correlations, and samples of the dataset.

In [None]:
# One-time setup on a fresh runtime (uncomment to install pandas-profiling from GitHub):
# !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
from pandas_profiling import ProfileReport

# Profile the data as loaded (before the imputation step further down) and
# write the report to an HTML file.
profile = ProfileReport(main_df, title="Profiling Report", explorative=True)
profile.to_file("EDAreport.html")

Collecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Downloading https://github.com/pandas-profiling/pandas-profiling/archive/master.zip (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 1.6 MB/s 
Collecting pydantic>=1.8.1
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 8.5 MB/s 
[?25hCollecting PyYAML>=5.0.0
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.0 MB/s 
Collecting visions[type_image_path]==0.7.4
  Downloading visions-0.7.4-py3-none-any.whl (102 kB)
[K     |████████████████████████████████| 102 kB 11.0 MB/s 
Collecting htmlmin>=0.1.12
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
Collecting phik>=0.11.1
  Downloading phik-0.12.0-cp37-cp37m-manylinux2010_x86_64.whl (675 kB)
[K     |████████████████████████████████| 675 kB 

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Data Pre-processing

Handle missing values using the KNN imputer.

In [None]:
from sklearn.impute import KNNImputer

# Separate the target attribute from the rest of the attributes.
# `pop` removes the column from main_df in place, so guard it: when this cell
# is re-run the column is already gone and an unguarded pop raises KeyError.
if "TenYearCHD" in main_df.columns:
    ten_year_chd = main_df.pop("TenYearCHD").astype("int")

# Glucose imputation: fill missing glucose values with the kNN method, using
# sysBP, age and diabetes as the neighbouring features.
gluc_temp_df = main_df[["glucose", "sysBP", "age", "diabetes"]]
imputed_values = KNNImputer(n_neighbors=5, weights="uniform").fit_transform(gluc_temp_df)
# Column 0 is glucose, matching the column order selected above.
main_df["glucose"] = imputed_values[:, 0]

# For the remaining attributes the share of missing values is small, so impute
# them all in one pass with the same kNN settings. The imputer output is wrapped
# back into a DataFrame that carries the original column labels and row index.
new_df = pd.DataFrame(
    KNNImputer(n_neighbors=5, weights="uniform").fit_transform(main_df),
    columns=main_df.columns,
    index=main_df.index,
)
main_df = new_df
main_df.index

RangeIndex(start=0, stop=4238, step=1)

Checking outliers for most correlated variables

In [None]:
def check_Outliers(col, df=None):
  """Print the rows whose value in `col` falls outside the 1.5*IQR fences.

  A value is treated as an outlier when it is below Q1 - 1.5*IQR or above
  Q3 + 1.5*IQR, where Q1/Q3 are the 25th/75th percentiles of `col`.

  Args:
    col: pandas Series to inspect; its index must line up with the frame
      being filtered.
    df: frame whose matching rows are printed. Defaults to the notebook-level
      `main_df` when omitted, which keeps existing one-argument calls working.

  Returns:
    None. The matching rows (or an empty frame) are printed.
  """
  frame = main_df if df is None else df
  Q1 = col.quantile(0.25)
  Q3 = col.quantile(0.75)
  IQR = Q3 - Q1
  lower = Q1 - 1.5 * IQR
  upper = Q3 + 1.5 * IQR
  # Check both fences: testing only `col < lower` hides every high-side outlier.
  print(frame[(col < lower) | (col > upper)])

# Run the outlier check on each column of interest, in the same order as before.
for column_name in ("age", "prevalentHyp", "sysBP"):
    check_Outliers(main_df[column_name])

Empty DataFrame
Columns: [male, age, education, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose]
Index: []
Empty DataFrame
Columns: [male, age, education, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose]
Index: []
Empty DataFrame
Columns: [male, age, education, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose]
Index: []


In [None]:
# Analysis report after data pre-processing: profile main_df again (now imputed
# and without the target column) and write it to a second HTML file.
profile1 = ProfileReport(main_df, title="Post Processing", explorative=True)
profile1.to_file("PostProcessing.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Modelling (SVM and Logistic)

1. Logistic Regression Model

In [None]:
"""Using k-fold cross-validation instead of single hold-out(train-test split)"""
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver; C and the seed are fixed so the score is repeatable.
logisitic_model = LogisticRegression(
    C=3.0,
    solver="liblinear",
    random_state=0,
)

# Mean score across 15 cross-validation folds.
cross_val_score(
    estimator=logisitic_model,
    X=main_df,
    y=ten_year_chd,
    cv=15,
).mean()

0.8532324219565782

2. Support Vector Machine (SVM) Model

In [None]:
from sklearn import svm

# Support vector classifier with the library's default settings.
svm_model = svm.SVC()

# Quick score of model performance: mean score across 15 cross-validation folds.
cross_val_score(
    estimator=svm_model,
    X=main_df,
    y=ten_year_chd,
    cv=15,
).mean()

0.8480415006390499

# Cross-validation and Evaluation


1. Cross-validation confusion matrix generation for Logistic Regression model and Support Vector Machine.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

# Generating confusion matrix manually for cross validation, then average the results
def get_confusion_matrix(model):
    """Average the per-fold confusion matrices of `model` over 7 stratified folds.

    Reads the notebook-level `main_df` (features) and `ten_year_chd` (target).
    `model` is refit on every training fold, so on return it is left fitted on
    the last fold.

    Returns:
        The element-wise mean of the fold confusion matrices, rounded to
        4 decimal places.
    """
    splitter = StratifiedKFold(n_splits=7, shuffle=True, random_state=0)
    fold_matrices = []
    for train_idx, test_idx in splitter.split(main_df, ten_year_chd):
        model.fit(main_df.iloc[train_idx], ten_year_chd.iloc[train_idx])
        predicted = model.predict(main_df.iloc[test_idx])
        fold_matrices.append(confusion_matrix(ten_year_chd.iloc[test_idx], predicted))
    return np.stack(fold_matrices).mean(axis=0).round(4)
    
# Averaged confusion matrix per model, keyed by display name (Logistic first, then SVM).
confusion_matrix_dict = {
    label: get_confusion_matrix(estimator)
    for label, estimator in (("Logistic", logisitic_model), ("SVM", svm_model))
}
confusion_matrix_dict

{'Logistic': array([[510.2857,   3.1429],
        [ 85.5714,   6.4286]]), 'SVM': array([[5.13000e+02, 4.28600e-01],
        [9.14286e+01, 5.71400e-01]])}

2. Model Evaluation

In [None]:
# Derive precision, accuracy, sensitivity and specificity for each model from
# its averaged confusion matrix. The metrics are printed and also stored in
# evaluation_dict (previously created but never filled) so they can be reused.
evaluation_dict = {}
for key, val in confusion_matrix_dict.items():
    # flatten() walks the 2x2 matrix row by row, which this code reads as tn, fp, fn, tp.
    tn, fp, fn, tp = val.flatten()
    accuracy = (tp + tn) / (tp + tn + fn + fp)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    evaluation_dict[key] = {
        "Precision": precision,
        "Accuracy": accuracy,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
    }
    print(f"{key}: [Precision: {precision}] [Accuracy: {accuracy}] [Sensitivity: {sensitivity}] [Specificity: {specificity}]" )

Logistic: [Precision: 0.6716397638823591] [Accuracy: 0.8534686005913825] [Sensitivity: 0.06987608695652174] [Specificity: 0.9938786035682468]
SVM: [Precision: 0.5714] [Accuracy: 0.8482774021577443] [Sensitivity: 0.006210869565217392] [Specificity: 0.9991652198572499]
