# **Cardiovascular Disease Prediction**

By Aloysius Chua, [Bian Si Sheng](https://github.com/emptysetoverhere), [Kenneth Tanudjaja](https://github.com/astrayr), Surekha in March 28, 2023.

The dataset is provided by [Kaggle](https://www.kaggle.com) in this [link](https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset).

In [80]:
import math
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

sb.set()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393.0,2.0,168.0,62.0,110.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
1,20228.0,1.0,156.0,85.0,140.0,90.0,3.0,1.0,0.0,0.0,1.0,1.0
2,18857.0,1.0,165.0,64.0,130.0,70.0,3.0,1.0,0.0,0.0,0.0,1.0
3,17623.0,2.0,169.0,82.0,150.0,100.0,1.0,1.0,0.0,0.0,1.0,1.0
4,17474.0,1.0,156.0,56.0,100.0,60.0,1.0,1.0,0.0,0.0,0.0,0.0


In [None]:
data = pd.read_csv("cardio_train.csv", sep=';')
data = pd.DataFrame(data=data.drop("id", axis=1), index=data["id"])

data.head()

In [81]:
data.infer_objects().dtypes

age            float64
gender         float64
height         float64
weight         float64
ap_hi          float64
ap_lo          float64
cholesterol    float64
gluc           float64
smoke          float64
alco           float64
active         float64
cardio         float64
dtype: object

### Discard N/A values

In [82]:
data = data.dropna(axis=0, how="any")
data["index"] = range(data.shape[0])
data.set_index("index", inplace=True)



### Create BMI Column

In [83]:
calc_bmi = lambda wh : wh[0] / math.pow(wh[1] / 100, 2) # Function to calculate BMI where BMI = height / (weight) * 2 

data["bmi"] = data[["weight", "height"]].apply(calc_bmi, axis=1) # creating the bmi column

def swap_col(col1: str, col2: str, data: pd.DataFrame) -> pd.DataFrame:
    # Swaping datafame columns 
    data[col1], data[col2] = data[col2], data[col1]
    data.rename(columns={col1: col2, col2: col1}, inplace=True)
    return data


data.drop(["height", "weight"], axis=1, inplace=True)
data = swap_col(col1="cardio", col2="bmi", data=data) # swap bmi and cardio, let cardio to be the last col
data.head(n=10)

Unnamed: 0_level_0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi,cardio
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,18393.0,2.0,110.0,80.0,1.0,1.0,0.0,0.0,1.0,21.96712,0.0
1,20228.0,1.0,140.0,90.0,3.0,1.0,0.0,0.0,1.0,34.927679,1.0
2,18857.0,1.0,130.0,70.0,3.0,1.0,0.0,0.0,0.0,23.507805,1.0
3,17623.0,2.0,150.0,100.0,1.0,1.0,0.0,0.0,1.0,28.710479,1.0
4,17474.0,1.0,100.0,60.0,1.0,1.0,0.0,0.0,0.0,23.011177,0.0
5,17668.0,1.0,110.0,70.0,1.0,1.0,0.0,0.0,1.0,28.440955,0.0
6,19834.0,1.0,110.0,60.0,1.0,1.0,0.0,0.0,0.0,25.28257,0.0
7,14791.0,2.0,120.0,80.0,1.0,1.0,0.0,0.0,0.0,22.038567,0.0
8,19809.0,1.0,110.0,70.0,1.0,1.0,0.0,0.0,1.0,31.244993,0.0
9,14532.0,2.0,130.0,90.0,1.0,1.0,1.0,1.0,1.0,28.997894,0.0


### Separating data into matrices of numeric and categorical values

In [84]:
numeric_labels = set(["age", "bmi", "ap_hi", "ap_lo"])
categorical_labels = set(data.columns) - numeric_labels - set(["cardio"])

numeric_vec = data.filter(numeric_labels) # numerical vector
categorical_vec = data.filter(categorical_labels) # categorical vector


In [85]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

std_scaler = StandardScaler() 
one_hot = OneHotEncoder()


### Transform categorical data into numerical data using One-hot Encoder

Rationale: Some machine learning algorithms like KNN and SVM require all features to be numerics. One-hot encoding transform a categorical data matrix to a binary matrix where 0s and 1s are meant to represent the existence of a feature.

In [86]:
# Replacer dictionary for more readable column names after one-hot transformation
replacer = dict(
    cholesterol ={1: 'Normal', 2:'Above Normal', 3:'Well Above Normal'},
    gluc={1:'Normal', 2:'Above Normal', 3:'Well Above Normal'},
    gender={1:'Female', 2:'Male'},
) 

for label in replacer.keys():
    categorical_vec[label].replace(replacer[label], inplace=True)
    
categorical_vec

Unnamed: 0_level_0,gender,alco,gluc,smoke,cholesterol,active
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Male,0.0,Normal,0.0,Normal,1.0
1,Female,0.0,Normal,0.0,Well Above Normal,1.0
2,Female,0.0,Normal,0.0,Well Above Normal,0.0
3,Male,0.0,Normal,0.0,Normal,1.0
4,Female,0.0,Normal,0.0,Normal,0.0
...,...,...,...,...,...,...
49031,Female,0.0,Normal,0.0,Normal,1.0
49032,Female,0.0,Normal,0.0,Normal,1.0
49033,Female,0.0,Above Normal,0.0,Above Normal,1.0
49034,Female,0.0,Above Normal,0.0,Normal,0.0


In [87]:
one_hot.fit(categorical_vec.filter(list(replacer.keys())))
onehot_data = one_hot.transform(categorical_vec.filter(replacer.keys())).toarray()
onehot_data = pd.DataFrame(data=onehot_data, columns=one_hot.get_feature_names_out())

onehot_data.head()



Unnamed: 0,cholesterol_Above Normal,cholesterol_Normal,cholesterol_Well Above Normal,gluc_Above Normal,gluc_Normal,gluc_Well Above Normal,gender_Female,gender_Male
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


### Standardize numerical data using Standard Scaler

This is to ensure that numerical features are about the same scale as the transformed categorical features so as to balance their significance before training. 

In [88]:
std_scaler.fit(numeric_vec)
numeric_data = std_scaler.transform(numeric_vec)
numeric_data = pd.DataFrame(numeric_data, columns=std_scaler.get_feature_names_out())
numeric_data.head()

Unnamed: 0,ap_hi,ap_lo,bmi,age
0,-0.151119,-0.088039,-0.902642,-0.43196
1,0.097746,-0.033406,1.182862,0.30992
2,0.014791,-0.142673,-0.654728,-0.244367
3,0.180701,0.021227,0.182443,-0.743266
4,-0.234074,-0.197306,-0.734641,-0.803506


In [89]:

res = categorical_vec.drop(columns = list(replacer.keys()))

preprocessed = pd.concat([numeric_data, onehot_data, res, data["cardio"]], axis=1)
preprocessed


Unnamed: 0,ap_hi,ap_lo,bmi,age,cholesterol_Above Normal,cholesterol_Normal,cholesterol_Well Above Normal,gluc_Above Normal,gluc_Normal,gluc_Well Above Normal,gender_Female,gender_Male,alco,smoke,active,cardio
0,-0.151119,-0.088039,-0.902642,-0.431960,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.097746,-0.033406,1.182862,0.309920,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
2,0.014791,-0.142673,-0.654728,-0.244367,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.180701,0.021227,0.182443,-0.743266,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,-0.234074,-0.197306,-0.734641,-0.803506,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49031,0.346610,-0.033406,-0.961055,-0.270646,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
49032,0.014791,-0.033406,-0.630006,0.096048,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
49033,0.097746,-0.033406,3.684232,1.269310,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
49034,0.056268,-0.088039,-0.076823,1.200580,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [90]:
preprocessed.to_csv("preprocessed.csv")