# Imports

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# from pandas_profiling import ProfileReport

import missingno as msno

import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation / FutureWarning messages raised by the libraries used
# below, so API changes will not be reported while this filter is active.
warnings.filterwarnings("ignore")

# Load Data

In [None]:
# Stroke-prediction dataset, read directly from the course repository on GitHub.
DATA_URL = 'https://raw.githubusercontent.com/BryanT05/Machine-Learning-Bootcamp/main/Final%20Project/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_URL)
# The feature/target split (dropping `id` and `stroke`) is done later,
# once the frame has been cleaned and one-hot encoded.

# Impute missing values

In [None]:
# Number of missing entries in each column (isna is the canonical name of isnull).
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [None]:
# Fill the missing `bmi` entries with the column median.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on `df.bmi`: that chained in-place call mutates an
# intermediate Series, and under pandas Copy-on-Write it no longer writes
# through to `df`, which would silently leave the NaNs in place.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

We replace the missing `bmi` values with the column median.

In [None]:
# Re-check the per-column missing counts after the imputation step.
df.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [None]:
# Frequency of each gender category.
df['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [None]:
# Fold the 'Other' gender value into 'Female'.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on `df.gender`: that chained in-place call mutates
# an intermediate Series, and under pandas Copy-on-Write it no longer updates
# `df`, so 'Other' would survive and later produce an extra dummy column.
df['gender'] = df['gender'].replace('Other', 'Female')

We replace the single `Other` gender value with `Female`, which is the mode.

In [None]:
# One-hot encode the categorical (object/string) columns; numeric columns are
# passed through unchanged.
# dtype=int pins the indicator columns to 0/1 integers. The default indicator
# dtype depends on the pandas version (bool since pandas 2.0), so being
# explicit keeps the frame purely numeric for the resampling and scaling steps.
df = pd.get_dummies(df, dtype=int)

In [None]:
# Inspect the encoded frame: each categorical column has been expanded into
# one indicator column per category.
df

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0
1,51676,61.0,0,0,202.21,28.1,1,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0
2,31112,80.0,0,1,105.92,32.5,1,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,80.0,1,0,83.75,28.1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0
5106,44873,81.0,0,0,125.20,40.0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0
5107,19723,35.0,0,0,82.99,30.6,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0
5108,37544,51.0,0,0,166.29,25.6,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0


In [None]:
# Class balance of the target column.
df['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

Because the classes are heavily imbalanced, we will balance the training data with SMOTE.

In [None]:
# Target vector first, then the predictor matrix (everything except the
# `id` column and the target itself).
y = df['stroke']
X = df.drop(['id', 'stroke'], axis=1)

In [None]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the stroke
# ratio the same in both splits; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class on the training split only, so the validation
# split keeps its original class distribution.
# random_state is fixed (same seed as the train/validation split) because SMOTE
# generates its synthetic rows randomly; without a seed every run would produce
# a different training set and different exported CSV files.
smote = SMOTE(random_state=42)
x_smote, y_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Keep the feature column labels so the resampled / scaled data can be wrapped
# back into labelled DataFrames in the cells below.
col = X.columns

In [None]:
# Class counts of the training labels after SMOTE resampling.
pd.Series(y_smote).value_counts()

1    3889
0    3889
dtype: int64

In [None]:
# Training frame: the SMOTE-resampled features plus their labels.
df = pd.DataFrame(x_smote, columns=col).assign(stroke=y_smote)

# Validation frame: the untouched hold-out features plus their labels.
df_val = pd.DataFrame(X_val, columns=col).assign(stroke=y_val)

In [None]:
# Class counts of the resampled training labels (repeats the earlier check).
pd.Series(y_smote).value_counts()

1    3889
0    3889
dtype: int64

In [None]:
# Class counts of the validation labels, which were not resampled.
y_val.value_counts()

0    972
1     50
Name: stroke, dtype: int64

Now the training data is balanced; the validation data keeps its original class distribution.

In [None]:
# Persist the un-scaled training and validation frames; the row index is not
# written to the files.
df.to_csv(path_or_buf='df_train.csv', index=False)
df_val.to_csv(path_or_buf='df_val.csv', index=False)

# Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the resampled training features only, then
# apply that same fitted scaler to both splits.
sc = StandardScaler()
sc.fit(x_smote)

X_train = sc.transform(x_smote)
# NOTE: X_val is overwritten with its scaled version, so running this cell a
# second time would scale the already-scaled values again.
X_val = sc.transform(X_val)

In [None]:
# Wrap the scaled training features in a labelled frame and attach the labels.
df = pd.DataFrame(X_train, columns=col)
df['stroke'] = y_smote

# Wrap the scaled validation features the same way.
# The scaler returned X_val as a plain array, so this frame gets a fresh
# 0..n-1 index, while y_val still carries the row labels of the original
# frame. Assigning the Series directly would align on those labels and leave
# most rows with NaN or a label belonging to a different row, so the values
# are attached positionally instead.
df_val = pd.DataFrame(X_val, columns=col)
df_val['stroke'] = y_val.to_numpy()

In [None]:
# Inspect the scaled validation features (now a plain array, not a DataFrame).
X_val

array([[ 0.36785263, -0.5180195 , -0.41422726, ..., -0.60295052,
         1.45100798, -0.4737858 ],
       [-0.5516954 , -0.5180195 , -0.41422726, ..., -0.60295052,
         1.45100798, -0.4737858 ],
       [-1.47124342, -0.5180195 , -0.41422726, ..., -0.60295052,
        -0.80773792,  2.49884417],
       ...,
       [-0.22985359, -0.5180195 , -0.41422726, ..., -0.60295052,
         1.45100798, -0.4737858 ],
       [-1.83906263, -0.5180195 , -0.41422726, ..., -0.60295052,
         1.45100798, -0.4737858 ],
       [-0.18387619, -0.5180195 , -0.41422726, ...,  2.02555822,
        -0.80773792, -0.4737858 ]])

In [None]:
# Number of missing validation labels (isna is the canonical name of isnull).
y_val.isna().sum()

0

In [None]:
# (rows, feature columns) of the scaled validation features.
X_val.shape

(1022, 20)

In [None]:
# Number of validation labels; should match the row count of X_val.
y_val.shape

(1022,)

In [None]:
# Scaled validation features together with their labels.
# X_val is a plain array here, so the new frame has a fresh 0..n-1 index, while
# y_val still carries the row labels of the original frame. Joining on the
# index would therefore match the wrong rows (or none), so the labels are
# attached positionally instead.
df_valid = pd.DataFrame(X_val, columns=col)
df_valid = df_valid.assign(stroke=y_val.to_numpy())

In [None]:
# Persist the standardized training and validation frames; the row index is
# not written to the files.
df.to_csv(path_or_buf='df_train_stand.csv', index=False)
df_val.to_csv(path_or_buf='df_val_stand.csv', index=False)