In [1]:
#libraries
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt 
import seaborn as sns
sns.set(style='darkgrid')
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

In [2]:
from sklearn.metrics import accuracy_score


In [3]:
df = pd.read_csv('healthcare-dataset-stroke-data\healthcare-dataset-stroke-data.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
len(df)


5110

In [5]:
df.isnull().sum()


id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [6]:
df[df == 'Unknown']


  res_values = method(rvalues)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,,,,,,,,,,,,
5106,,,,,,,,,,,,
5107,,,,,,,,,,,,
5108,,,,,,,,,,,,


In [7]:
#So we have missing values in 'bmi' column and some 'unknown' values in smoking_status
#Drop only missing values
df1a = df.drop(columns = ['bmi'])

In [8]:
df1a.head()


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,never smoked,1


In [9]:
#Drop missing values and "Unknown" values
#First Matrix
df1b = df.drop(columns = ['bmi','smoking_status'])
df1b.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,1


In [10]:
#Mean Imputation

In [11]:
df2 = df.copy()


In [12]:
df2.head()


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [13]:
df2['bmi'].isnull().sum()


201

In [14]:
#Mean of bmi column
np.mean(df2.bmi)

28.893236911794666

In [15]:
#fill N/A values with mean value
df2['bmi'].fillna(np.mean(df2.bmi), inplace=True)

In [16]:
#Second Matrix
df2.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [17]:
#Checking if missing values still exists
df2.bmi.isnull().sum()

0

In [18]:
def onehot_encode(dff, column):
    dff = dff.copy()
    dummies = pd.get_dummies(dff[column], prefix = column) 
    dff = pd.concat([dff, dummies], axis=1)#Side by side (axis=1)
    dff = dff.drop(column, axis=1)
    return dff

In [19]:
def preprocess_inputs(dff):
    dff = dff.copy()
    #Drop id column because its not needed
    dff = dff.drop('id', axis =1)

    #Binary encoding
    dff['ever_married'] = dff['ever_married'].replace({'No': 0,'Yes':1})
    dff['Residence_type'] = dff['Residence_type'].replace({'Rural': 0,'Urban':1})

    #One-Hot encoding
    for column in ['gender', 'work_type', 'smoking_status']:
        dff= onehot_encode(dff, column=column)

    #Split dff
    y= dff['stroke']
    X= dff.drop('stroke',axis=1) 

    #Train_Test_Split
    X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75, shuffle=True, random_state=1)
    
    #KNN imputation
    imputer = KNNImputer()
    imputer.fit(X_train)
    X_train = pd.DataFrame(imputer.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), index=X_test.index, columns=X_test.columns)

    #Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
        
    return  X_train, X_test, y_train, y_test

In [20]:
X_train, X_test, y_train, y_test = preprocess_inputs(df)


In [21]:
X_train
#Before Scaling

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
1914,-0.012400,-0.329843,-0.240181,0.721550,-1.012606,-0.611726,-0.012266,-1.182369,1.182369,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
1751,0.961818,-0.329843,-0.240181,0.721550,0.987551,-0.369349,-0.102977,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,2.178331,-0.760003,-0.428307
396,0.740405,3.031747,-0.240181,0.721550,-1.012606,2.385034,0.907800,-1.182369,1.182369,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
1783,-1.827988,-0.329843,-0.240181,-1.385905,-1.012606,-0.881304,-1.606184,-1.182369,1.182369,0.0,-0.379766,-0.068698,-1.155492,-0.43977,2.526545,1.510239,-0.459067,-0.760003,-0.428307
2361,-0.588074,-0.329843,-0.240181,0.721550,0.987551,-0.078320,-0.750911,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,1.510239,-0.459067,-0.760003,-0.428307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,0.120448,-0.329843,-0.240181,0.721550,0.987551,0.713164,0.039569,-1.182369,1.182369,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
2763,0.076165,-0.329843,-0.240181,0.721550,-1.012606,-0.924207,0.389453,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,1.510239,-0.459067,-0.760003,-0.428307
905,-0.543792,-0.329843,-0.240181,0.721550,-1.012606,-0.647110,0.855965,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
3980,0.076165,-0.329843,-0.240181,0.721550,-1.012606,2.489637,3.369949,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,-0.760003,2.334776


In [22]:
X_train
#After Scaling

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
1914,-0.012400,-0.329843,-0.240181,0.721550,-1.012606,-0.611726,-0.012266,-1.182369,1.182369,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
1751,0.961818,-0.329843,-0.240181,0.721550,0.987551,-0.369349,-0.102977,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,2.178331,-0.760003,-0.428307
396,0.740405,3.031747,-0.240181,0.721550,-1.012606,2.385034,0.907800,-1.182369,1.182369,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
1783,-1.827988,-0.329843,-0.240181,-1.385905,-1.012606,-0.881304,-1.606184,-1.182369,1.182369,0.0,-0.379766,-0.068698,-1.155492,-0.43977,2.526545,1.510239,-0.459067,-0.760003,-0.428307
2361,-0.588074,-0.329843,-0.240181,0.721550,0.987551,-0.078320,-0.750911,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,1.510239,-0.459067,-0.760003,-0.428307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2895,0.120448,-0.329843,-0.240181,0.721550,0.987551,0.713164,0.039569,-1.182369,1.182369,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
2763,0.076165,-0.329843,-0.240181,0.721550,-1.012606,-0.924207,0.389453,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,1.510239,-0.459067,-0.760003,-0.428307
905,-0.543792,-0.329843,-0.240181,0.721550,-1.012606,-0.647110,0.855965,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
3980,0.076165,-0.329843,-0.240181,0.721550,-1.012606,2.489637,3.369949,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.43977,-0.395797,-0.662147,-0.459067,-0.760003,2.334776


In [23]:
X_train.mean()
#Close to 0

age                              -1.423125e-16
hypertension                      3.108161e-16
heart_disease                     3.178129e-16
ever_married                     -4.845926e-16
Residence_type                    2.490469e-16
avg_glucose_level                 3.002701e-16
bmi                              -2.031546e-16
gender_Female                     2.284765e-16
gender_Male                      -2.284765e-16
gender_Other                      0.000000e+00
work_type_Govt_job                1.646792e-16
work_type_Never_worked           -1.911093e-16
work_type_Private                 5.904579e-17
work_type_Self-employed           4.863310e-16
work_type_children                4.525492e-16
smoking_status_Unknown           -3.545644e-16
smoking_status_formerly smoked    4.742495e-16
smoking_status_never smoked       1.292749e-16
smoking_status_smokes            -1.406756e-16
dtype: float64

In [24]:
X_train.var()
#Close to 1
#The 0 at "gender_other" happened because of OneHot encoding

age                               1.000261
hypertension                      1.000261
heart_disease                     1.000261
ever_married                      1.000261
Residence_type                    1.000261
avg_glucose_level                 1.000261
bmi                               1.000261
gender_Female                     1.000261
gender_Male                       1.000261
gender_Other                      0.000000
work_type_Govt_job                1.000261
work_type_Never_worked            1.000261
work_type_Private                 1.000261
work_type_Self-employed           1.000261
work_type_children                1.000261
smoking_status_Unknown            1.000261
smoking_status_formerly smoked    1.000261
smoking_status_never smoked       1.000261
smoking_status_smokes             1.000261
dtype: float64

In [25]:
X_test
#before Scaling

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
4673,-0.056683,-0.329843,-0.240181,0.721550,-1.012606,0.164941,1.063304,0.845759,-0.845759,0.0,2.633201,-0.068698,-1.155492,-0.439770,-0.395797,-0.662147,2.178331,-0.760003,-0.428307
3232,1.537492,-0.329843,-0.240181,0.721550,0.987551,-0.602217,0.143238,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.439770,-0.395797,-0.662147,2.178331,-0.760003,-0.428307
3694,-0.676640,-0.329843,-0.240181,-1.385905,0.987551,-0.713233,-0.465820,-1.182369,1.182369,0.0,-0.379766,-0.068698,0.865432,-0.439770,-0.395797,-0.662147,-0.459067,-0.760003,2.334776
1070,1.670340,-0.329843,-0.240181,0.721550,-1.012606,3.114155,-1.023043,0.845759,-0.845759,0.0,-0.379766,-0.068698,-1.155492,2.273916,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
4163,0.386144,-0.329843,-0.240181,0.721550,-1.012606,-0.187346,-0.128894,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.439770,-0.395797,1.510239,-0.459067,-0.760003,-0.428307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,-1.877585,-0.329843,-0.240181,-1.385905,0.987551,-0.445424,-0.724993,-1.182369,1.182369,0.0,-0.379766,-0.068698,-1.155492,-0.439770,2.526545,1.510239,-0.459067,-0.760003,-0.428307
5078,-1.562292,-0.329843,-0.240181,-1.385905,0.987551,-0.646004,-1.748729,0.845759,-0.845759,0.0,-0.379766,-0.068698,-1.155492,-0.439770,2.526545,1.510239,-0.459067,-0.760003,-0.428307
4879,-1.783705,-0.329843,-0.240181,-1.385905,0.987551,-0.522825,-1.424762,-1.182369,1.182369,0.0,-0.379766,-0.068698,-1.155492,-0.439770,2.526545,1.510239,-0.459067,-0.760003,-0.428307
3125,-0.765205,-0.329843,-0.240181,-1.385905,0.987551,-0.368023,0.946676,0.845759,-0.845759,0.0,2.633201,-0.068698,-1.155492,-0.439770,-0.395797,-0.662147,-0.459067,1.315785,-0.428307


In [26]:
X_train.isna().sum()


age                               0
hypertension                      0
heart_disease                     0
ever_married                      0
Residence_type                    0
avg_glucose_level                 0
bmi                               0
gender_Female                     0
gender_Male                       0
gender_Other                      0
work_type_Govt_job                0
work_type_Never_worked            0
work_type_Private                 0
work_type_Self-employed           0
work_type_children                0
smoking_status_Unknown            0
smoking_status_formerly smoked    0
smoking_status_never smoked       0
smoking_status_smokes             0
dtype: int64

In [27]:
X_test

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
4673,-0.056683,-0.329843,-0.240181,0.721550,-1.012606,0.164941,1.063304,0.845759,-0.845759,0.0,2.633201,-0.068698,-1.155492,-0.439770,-0.395797,-0.662147,2.178331,-0.760003,-0.428307
3232,1.537492,-0.329843,-0.240181,0.721550,0.987551,-0.602217,0.143238,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.439770,-0.395797,-0.662147,2.178331,-0.760003,-0.428307
3694,-0.676640,-0.329843,-0.240181,-1.385905,0.987551,-0.713233,-0.465820,-1.182369,1.182369,0.0,-0.379766,-0.068698,0.865432,-0.439770,-0.395797,-0.662147,-0.459067,-0.760003,2.334776
1070,1.670340,-0.329843,-0.240181,0.721550,-1.012606,3.114155,-1.023043,0.845759,-0.845759,0.0,-0.379766,-0.068698,-1.155492,2.273916,-0.395797,-0.662147,-0.459067,1.315785,-0.428307
4163,0.386144,-0.329843,-0.240181,0.721550,-1.012606,-0.187346,-0.128894,0.845759,-0.845759,0.0,-0.379766,-0.068698,0.865432,-0.439770,-0.395797,1.510239,-0.459067,-0.760003,-0.428307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,-1.877585,-0.329843,-0.240181,-1.385905,0.987551,-0.445424,-0.724993,-1.182369,1.182369,0.0,-0.379766,-0.068698,-1.155492,-0.439770,2.526545,1.510239,-0.459067,-0.760003,-0.428307
5078,-1.562292,-0.329843,-0.240181,-1.385905,0.987551,-0.646004,-1.748729,0.845759,-0.845759,0.0,-0.379766,-0.068698,-1.155492,-0.439770,2.526545,1.510239,-0.459067,-0.760003,-0.428307
4879,-1.783705,-0.329843,-0.240181,-1.385905,0.987551,-0.522825,-1.424762,-1.182369,1.182369,0.0,-0.379766,-0.068698,-1.155492,-0.439770,2.526545,1.510239,-0.459067,-0.760003,-0.428307
3125,-0.765205,-0.329843,-0.240181,-1.385905,0.987551,-0.368023,0.946676,0.845759,-0.845759,0.0,2.633201,-0.068698,-1.155492,-0.439770,-0.395797,-0.662147,-0.459067,1.315785,-0.428307


In [28]:
y_train

1914    0
1751    0
396     0
1783    0
2361    0
       ..
2895    0
2763    0
905     0
3980    0
235     1
Name: stroke, Length: 3832, dtype: int64

In [29]:
y_test

4673    0
3232    0
3694    0
1070    0
4163    0
       ..
2490    0
5078    0
4879    0
3125    0
4694    0
Name: stroke, Length: 1278, dtype: int64

In [30]:
{column: len(X[column].unique())for column in X.select_dtypes('object').columns}
#We will use binary encoding for ever_married and Residence_type because of the 2 different values
#We will use OneHote Encoding for the rest

NameError: name 'X' is not defined

In [None]:
#For the OneHote econding Example
pd.get_dummies(X['work_type'],prefix='work_type')
