# Data Preprocessing

In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

sb.set()

In [2]:
data = pd.read_csv("eda.csv")
data.drop(["Unnamed: 0"], axis=1, inplace=True)
data.head()

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,50.35729,Male,110,80,Normal,Normal,Non-smoker,No alcohol,Active,No Cvd,21.96712
1,55.381246,Female,140,90,Well Above Normal,Normal,Non-smoker,No alcohol,Active,Cvd,34.927679
2,51.627652,Female,130,70,Well Above Normal,Normal,Non-smoker,No alcohol,Not active,Cvd,23.507805
3,48.249144,Male,150,100,Normal,Normal,Non-smoker,No alcohol,Active,Cvd,28.710479
4,47.841205,Female,100,60,Normal,Normal,Non-smoker,No alcohol,Not active,No Cvd,23.011177


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69000 entries, 0 to 68999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          69000 non-null  float64
 1   gender       69000 non-null  object 
 2   ap_hi        69000 non-null  int64  
 3   ap_lo        69000 non-null  int64  
 4   cholesterol  69000 non-null  object 
 5   gluc         69000 non-null  object 
 6   smoke        69000 non-null  object 
 7   alco         69000 non-null  object 
 8   active       69000 non-null  object 
 9   cardio       69000 non-null  object 
 10  bmi          69000 non-null  float64
dtypes: float64(2), int64(2), object(7)
memory usage: 5.8+ MB


### Separating data into matrices of numeric and categorical values

In [4]:
numeric_labels = set(["age", "bmi", "ap_hi", "ap_lo"])
categorical_labels = set(data.columns) - numeric_labels - set(["cardio"])

numeric_vec = data.filter(numeric_labels) # numerical vector
categorical_vec = data.filter(categorical_labels) # categorical vector


In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

std_scaler = StandardScaler() 
one_hot = OneHotEncoder()


### Transform categorical data into numerical data using One-hot Encoder

Rationale: Some machine learning algorithms like KNN and SVM require all features to be numerics. One-hot encoding transform a categorical data matrix to a binary matrix where 0s and 1s are meant to represent the existence of a feature.

In [6]:
# Replacer dictionary for more readable column names after one-hot transformation
replacer = dict(
    cholesterol ={1:'Normal', 2:'Above Normal', 3:'Well Above Normal'},
    gluc={1:'Normal', 2:'Above Normal', 3:'Well Above Normal'},
    gender={1:'Female', 2:'Male'},
    smoke = {'Non-smoker':0, 'Smoker': 1},
    alco={'No alcohol':0, 'Alcohol': 1},
    active= {'Not active':0, 'Active': 1}
) 

for label in replacer.keys():
    categorical_vec[label].replace(replacer[label], inplace=True)
    
categorical_vec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69000 entries, 0 to 68999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   active       69000 non-null  int64 
 1   smoke        69000 non-null  int64 
 2   alco         69000 non-null  int64 
 3   gluc         69000 non-null  object
 4   gender       69000 non-null  object
 5   cholesterol  69000 non-null  object
dtypes: int64(3), object(3)
memory usage: 3.2+ MB


In [7]:
one_hot.fit(categorical_vec.drop(["smoke", "alco", "active"], axis=1))
onehot_data = one_hot.transform(categorical_vec.drop(["smoke", "alco", "active"], axis=1)).toarray()
onehot_data = pd.DataFrame(data=onehot_data, columns=one_hot.get_feature_names_out())

onehot_data.head()



Unnamed: 0,gluc_Above Normal,gluc_Normal,gluc_Well Above Normal,gender_Female,gender_Male,cholesterol_Above Normal,cholesterol_Normal,cholesterol_Well Above Normal
0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


### Standardize numerical data using Standard Scaler

This is to ensure that numerical features are about the same scale as the transformed categorical features so as to balance their significance before training. 

In [8]:
std_scaler.fit(numeric_vec)
numeric_data = std_scaler.transform(numeric_vec)
numeric_data = pd.DataFrame(numeric_data, columns=std_scaler.get_feature_names_out())
numeric_data.head()

Unnamed: 0,age,ap_lo,ap_hi,bmi
0,-0.433834,-0.134931,-0.921194,-0.913925
1,0.309613,0.87746,0.771644,1.217495
2,-0.245845,-1.147322,0.207365,-0.660553
3,-0.745799,1.889851,1.335923,0.19505
4,-0.806166,-2.159713,-1.485474,-0.742225


In [9]:

res = categorical_vec.filter(["smoke", "alco", "active"])

preprocessed = pd.concat([numeric_data, onehot_data, res, data["cardio"].map({'No Cvd': 0, 'Cvd': 1})], axis=1)
preprocessed


Unnamed: 0,age,ap_lo,ap_hi,bmi,gluc_Above Normal,gluc_Normal,gluc_Well Above Normal,gender_Female,gender_Male,cholesterol_Above Normal,cholesterol_Normal,cholesterol_Well Above Normal,smoke,alco,active,cardio
0,-0.433834,-0.134931,-0.921194,-0.913925,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0,1,No Cvd
1,0.309613,0.877460,0.771644,1.217495,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0,0,1,Cvd
2,-0.245845,-1.147322,0.207365,-0.660553,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0,0,0,Cvd
3,-0.745799,1.889851,1.335923,0.195050,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0,1,Cvd
4,-0.806166,-2.159713,-1.485474,-0.742225,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0,0,No Cvd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68995,-0.090674,-0.134931,-0.356915,-0.098179,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0,1,No Cvd
68996,1.271031,0.877460,0.771644,3.773938,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,1,Cvd
68997,-0.161169,0.877460,3.028762,0.629719,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1,0,Cvd
68998,1.202156,-0.134931,0.489504,-0.069924,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0,0,Cvd


In [10]:
preprocessed.index.name = "id"
preprocessed.to_csv("preprocessed.csv")