In [27]:
import pandas as pd
import numpy as np

df = pd.read_csv("../dataset_stroke.csv")
df = df.rename({'avg_glucose_level':'agl'},axis=1)
df['bmi'] = df['bmi'].replace(np.nan,df['bmi'].median()) #Data `bmi` cleaning by media

In [22]:
# Simple Feature Scaling method
# formula: Xnew = Xold / Xmax

df_sfs = df

df_sfs['agl'] = df_sfs['agl'] / df_sfs['agl'].max()
df_sfs

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,agl,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,0.841577,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,0.744130,28.1,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,0.389784,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,0.630124,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,0.640760,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,0.308199,28.1,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,0.460735,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,0.305402,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,0.611945,25.6,formerly smoked,0


In [23]:
# Min Max Scaler method
# formula: Xnew = (Xold - Xmin) / (Xmax - Xmin)

df_mms = df

df_mms['agl'] = (df_mms['agl'] - df_mms['agl'].min()) / (df_mms['agl'].max() - df_mms['agl'].min())
df_mms

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,agl,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,0.801265,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,0.679023,28.1,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,0.234512,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,0.536008,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,0.549349,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,0.132167,28.1,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,0.323516,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,0.128658,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,0.513203,25.6,formerly smoked,0


In [28]:
# data standardization with sklearn library

from sklearn.preprocessing import MinMaxScaler

df_mms_sklearn = df

mms = MinMaxScaler() # membuat scaller

df_mms_sklearn[['agl','bmi']] = mms.fit_transform(df_mms_sklearn[['agl','bmi']]) # Scalling dengan MinMaxScaller
df_mms_sklearn


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,agl,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,0.801265,0.301260,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,0.679023,0.203895,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,0.234512,0.254296,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,0.536008,0.276060,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,0.549349,0.156930,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,0.132167,0.203895,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,0.323516,0.340206,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,0.128658,0.232532,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,0.513203,0.175258,formerly smoked,0


In [30]:
# Standard Scaling (Z-Score)
# formula: Xnew = (Xold - mean) / stdDev

df_zs = df

df_zs['agl'] = (df_zs['agl'] - df_zs['agl'].mean()) / df_zs['agl'].std()
df_zs

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,agl,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,2.706111,0.301260,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,2.121351,0.203895,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,-0.005028,0.254296,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,1.437217,0.276060,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,1.501038,0.156930,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,-0.494609,0.203895,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,0.420734,0.340206,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,-0.511393,0.232532,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,1.328127,0.175258,formerly smoked,0


In [31]:
# standard scaller (Z-Score) with sklearn library

from sklearn.preprocessing import StandardScaler

df_zs_sklearn = df

ssc = StandardScaler() # membuat scaller

df_zs_sklearn[['agl','bmi']] = ssc.fit_transform(df_zs_sklearn[['agl','bmi']]) # Scalling dengan MinMaxScaller
df_zs_sklearn

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,agl,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,2.706375,1.005086,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,2.121559,-0.098981,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,-0.005028,0.472536,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,1.437358,0.719327,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,1.501184,-0.631531,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,-0.494658,-0.098981,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,0.420775,1.446713,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,-0.511443,0.225745,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,1.328257,-0.423706,formerly smoked,0


In [32]:
# save processed data to csv

df_zs_sklearn.to_csv('../data_stroke-cleaned.csv',index=False)