LIBRARIES

In [37]:
#libraries
import numpy as np #Allows Linear Algebra
import pandas as pd #To handle csv file
from matplotlib import pyplot as plt #Allows ploting charts to be used
import seaborn as sns #Allows heating maps to be used
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier #For training values and predict values
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold #helps train_test_split to split data
from sklearn.model_selection import train_test_split #Used for X-y_train and X-y_test
from sklearn.metrics import f1_score,precision_score,recall_score #Prediction score
from sklearn.impute import KNNImputer #Allows to use Knn Imputer
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import MinMaxScaler #Allows to turn values into 0 and 1
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression #Allows to use Linear Regression
from sklearn.metrics import accuracy_score
from imblearn import over_sampling
from imblearn.over_sampling import SMOTE

sns.set(style='darkgrid')

LOAD DATASET

In [38]:
#Load the file we want to work on as a dataframe
dataset = pd.read_csv("healthcare-dataset-stroke-data.csv") 
#Show dataset
dataset.head() #.head() used to show the dataframe

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [39]:
#Checking for missing values (NaN)
dataset.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [40]:
#Checking for 'Unknown' values
dataset[dataset == 'Unknown']

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,,,,,,,,,,,,
5106,,,,,,,,,,,,
5107,,,,,,,,,,,,
5108,,,,,,,,,,,,


CONVERT BOTH NON-NUMERIAL AND NUMERIAL VALUES INTO ONLY NUMERIAL VALUES

In [41]:
#We need to change the values to only numerial values
#Id column is not needed for the prediction
#Gender, ever_married, work_type, Residence_type, smoking_status columns have non numerail values
#ever_married, residence_type columns have only 2 different values in each of them
#gender has "Other" value, its not needed and we will drop it
#work_type column has more than just 2 different values
#smoking_status have some missing values called "Unknown"
#Drop the Id column
dataset = dataset.drop(['id'], axis = 'columns')
#Change "Other" value from gender column
dataset.drop(dataset[dataset.gender == "Other"].index, inplace=True)
#Show dataframe
dataset

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [42]:
#Id column is missing
#"Other" value has been replaced
#For columns with only 2 different values, we're changing the values to 0 or 1, doing Binary Encoding
dataset['ever_married'] = dataset['ever_married'].replace(to_replace= ['No', 'Yes'],value = ['0','1'])
dataset['ever_married']

0       1
1       1
2       1
3       1
4       1
       ..
5105    1
5106    1
5107    1
5108    1
5109    1
Name: ever_married, Length: 5109, dtype: object

In [43]:
#Non numerial values have been replaced with 0 or 1

In [44]:
dataset['Residence_type'] = dataset['Residence_type'].replace(to_replace= ['Rural', 'Urban'],value = ['0','1'])
dataset['Residence_type']

0       1
1       0
2       0
3       1
4       0
       ..
5105    1
5106    1
5107    0
5108    0
5109    1
Name: Residence_type, Length: 5109, dtype: object

In [45]:
#Non numerial values have been replaced with 0 or 1

In [46]:
dataset['gender'] = dataset['gender'].replace(to_replace= ['Female', 'Male'],value = ['0','1'])
dataset['gender']

0       1
1       0
2       1
3       0
4       0
       ..
5105    0
5106    0
5107    0
5108    1
5109    0
Name: gender, Length: 5109, dtype: object

In [47]:
#Non numerial values have been replaced with 0 or 1

In [48]:
#Define one-hot encoding to use it later for the dataframes
def one_hot_encoding(dataset, column):
    dataset = dataset.copy()
    dummies = pd.get_dummies(dataset[column], prefix= column)
    dataset = pd.concat([dataset, dummies], axis=1) #axis = 1 refers to columns while 0 refers to rows
    dataset = dataset.drop(column, axis=1)
    return dataset
#Now we can call one-hot encoding from the previous def
#One-Hot Encoding is used for the 'work_type' column because it has more than 2 different non numerial 

#Define MinMaxScaler to use it later for the dataframes
def mmscaler(dataset):
    scaler = MinMaxScaler()
    scaler.fit(dataset)
    dataset = pd.DataFrame(scaler.transform(dataset), index = dataset.index, columns = dataset.columns)
    #Index defines as an address
    return dataset
#Now we can call scaler from the previous def


DROP COLUMN

In [49]:
#Drop Missing Values Columns Dataset
#There are NaN values in column 'bmi' and there are 'Unknown' values in column 'smoking_status'
#We are gonna drop both 'bmi' and 'smoking status
#First dataset goes by the name of "dataset1"
dataset1 = dataset.drop(['bmi','smoking_status'], axis = 1)
dataset1 = one_hot_encoding(dataset1, 'work_type') #We are doing one-hot encoding on work_type column
dataset1 = mmscaler(dataset1)
dataset1.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,1.0,0.816895,0.0,1.0,1.0,1.0,0.801265,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.743652,0.0,0.0,1.0,0.0,0.679023,1.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.975586,0.0,1.0,1.0,0.0,0.234512,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.597168,0.0,0.0,1.0,1.0,0.536008,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.963379,1.0,0.0,1.0,0.0,0.549349,1.0,0.0,0.0,0.0,1.0,0.0


MEAN IMPUTATION 

In [50]:
#Replace NaN values in 'bmi' with the mean of that column
#Droping smoking_status because it has unknown values
#Second dataset goes by the name of dataset2
dataset2 = dataset.drop('smoking_status',axis=1)
dataset2['bmi'].fillna((dataset2['bmi'].mean()), inplace = True)
dataset2 = one_hot_encoding(dataset2,'work_type') 
dataset2 = mmscaler(dataset2)
dataset2.head()


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,1.0,0.816895,0.0,1.0,1.0,1.0,0.801265,0.30126,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.743652,0.0,0.0,1.0,0.0,0.679023,0.212996,1.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.975586,0.0,1.0,1.0,0.0,0.234512,0.254296,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.597168,0.0,0.0,1.0,1.0,0.536008,0.27606,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.963379,1.0,0.0,1.0,0.0,0.549349,0.15693,1.0,0.0,0.0,0.0,1.0,0.0


LINEAR REGRESSION

In [51]:
#Linear Regression
#We are going to replace NaN values using linear regression
#First we need to build inside a def everything is needed for the linear regression
def prelinear_regression(input):
    dataset = input.copy()
    dataset = one_hot_encoding(dataset, column = 'work_type')
    newdataset = dataset['stroke'] #We want to predict the stroke column
    dataset.drop(['smoking_status','stroke'],inplace=True, axis=1) 
    #We droped smoking_status because of Unknown Values. 
    #We droped stroke column because we want to predict that
    dataset =mmscaler(dataset)
    #Now we are going to seperate NaN values from not Nan
    test_data = dataset[dataset['bmi'].isnull()]
    train_data = dataset.dropna()
    #We dropped the NaN values from the dataset
    #Time for X and y train
    y_train=train_data['bmi']
    X_train =train_data.drop('bmi', axis =1)
    linr = LinearRegression()
    linr.fit(X_train, y_train)
    #Now we want to replace the NaN values with the predicted values
    X_test = test_data.drop('bmi', axis=1)
    y_pred = linr.predict(X_test)
    test_data.loc[test_data.bmi.isnull(),'bmi']= y_pred 
    dataset['bmi'].fillna((test_data['bmi']), inplace=True)
    dataset = pd.concat([dataset, newdataset], axis =1)
    return dataset

In [52]:
#Now we can test how linear regression works
#Third dataset goes by the name dataset3
dataset3 = dataset
dataset3 = prelinear_regression(dataset3)
#We loaded everything we've done for the Linear Regression into dataset3
dataset3.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.loc[test_data.bmi.isnull(),'bmi']= y_pred


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,stroke
0,1.0,0.816895,0.0,1.0,1.0,1.0,0.801265,0.30126,0.0,0.0,1.0,0.0,0.0,1
1,0.0,0.743652,0.0,0.0,1.0,0.0,0.679023,0.247691,0.0,0.0,0.0,1.0,0.0,1
2,1.0,0.975586,0.0,1.0,1.0,0.0,0.234512,0.254296,0.0,0.0,1.0,0.0,0.0,1
3,0.0,0.597168,0.0,0.0,1.0,1.0,0.536008,0.27606,0.0,0.0,1.0,0.0,0.0,1
4,0.0,0.963379,1.0,0.0,1.0,0.0,0.549349,0.15693,0.0,0.0,0.0,1.0,0.0,1


KNN IMPUTATION

In [53]:
#KNN Imputation
#Just like Linear regression we'll do the steps needed to run the Knn Imputation and then load it to a new dataset copy of dataset
#We need to replace the 'Unknown' values from smoking_status, with NaN values
def pre_knn(input):
    dataset = input.copy()
    dataset = one_hot_encoding(dataset, column=['work_type'])
    lb = LabelEncoder() #to replace 'Unknown' values with NaN
    dataset['smoking_status'] = dataset['smoking_status'].replace('Unknown',np.nan)
    dataset['smoking_status'] = dataset['smoking_status'].astype(str)
    dataset['smoking_status']=lb.fit_transform(dataset['smoking_status'])
    dataset['smoking_status'] = dataset['smoking_status'].replace(3,np.nan)
    
    y=dataset['stroke']
    X=dataset.drop(['bmi','stroke'],axis=1) 
    #Dropped bmi to make a dataset without this column for now
    #Dropped stroke because we want to predict it
    X = mmscaler(X)
    #Now we need to do the Knn imputation for the missing values
    knn = KNNImputer(n_neighbors=10) #Number of neighboring samples to use for imputation
    X = pd.DataFrame(knn.fit_transform(X), index = X.index, columns=X.columns)
    X = pd.concat([X,y], axis=1)
    return X #By using "return X" it will return both of X_train,X_test,y_train and y_test when its called

In [54]:
#Now we can run and test the Knn Imputation
#Fourth dataset goes by the name of dataset4
dataset4 = dataset
dataset4 = pre_knn(dataset4)
dataset4.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,smoking_status,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,stroke
0,1.0,0.816895,0.0,1.0,1.0,1.0,0.801265,0.0,0.0,0.0,1.0,0.0,0.0,1
1,0.0,0.743652,0.0,0.0,1.0,0.0,0.679023,1.0,0.0,0.0,0.0,1.0,0.0,1
2,1.0,0.975586,0.0,1.0,1.0,0.0,0.234512,1.0,0.0,0.0,1.0,0.0,0.0,1
3,0.0,0.597168,0.0,0.0,1.0,1.0,0.536008,0.55,0.0,0.0,1.0,0.0,0.0,1
4,0.0,0.963379,1.0,0.0,1.0,0.0,0.549349,1.0,0.0,0.0,0.0,1.0,0.0,1


In [55]:
dataset['smoking_status']

0       formerly smoked
1          never smoked
2          never smoked
3                smokes
4          never smoked
             ...       
5105       never smoked
5106       never smoked
5107       never smoked
5108    formerly smoked
5109            Unknown
Name: smoking_status, Length: 5109, dtype: object

In [56]:
dataset.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

KNN IMPUTATION AND LINEAR REGRESSION

In [57]:
#For the 5th dataset we'll take the dataset from the Knn Imputer, and we will add the 'bmi' column from dataset3
#Fifth dataset goes by the name of dataset5
dataset5 = pd.concat([dataset4, dataset3['bmi']], axis =1)
dataset5.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,smoking_status,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,stroke,bmi
0,1.0,0.816895,0.0,1.0,1.0,1.0,0.801265,0.0,0.0,0.0,1.0,0.0,0.0,1,0.30126
1,0.0,0.743652,0.0,0.0,1.0,0.0,0.679023,1.0,0.0,0.0,0.0,1.0,0.0,1,0.247691
2,1.0,0.975586,0.0,1.0,1.0,0.0,0.234512,1.0,0.0,0.0,1.0,0.0,0.0,1,0.254296
3,0.0,0.597168,0.0,0.0,1.0,1.0,0.536008,0.55,0.0,0.0,1.0,0.0,0.0,1,0.27606
4,0.0,0.963379,1.0,0.0,1.0,0.0,0.549349,1.0,0.0,0.0,0.0,1.0,0.0,1,0.15693


RANDOM FOREST AND DATA PREDICTION

In [58]:
#We have all 5 datasets
#Now we need to preprocess the RandomForest
#We want to predict the 'stroke' column, so we'll split  the dataset into 2 parts. 
#The first part will be the whole dataset without the 'stroke' column
#The second part will be only the 'stroke' column
def randfor(dataset):
    X = dataset.drop('stroke',axis=1) #First part
    y = dataset['stroke'] #second part

    #Use of SMOTE to balance the values inside the 'stroke' column
    sm = SMOTE(sampling_strategy = 'minority')
    X_s, y_s = sm.fit_resample(X,y)
    #Train and test split
    X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, train_size =0.75, test_size =0.25, shuffle = True, random_state=1)

    #Call the desire model
    modelr = RandomForestClassifier(n_estimators = 16)
    modelr.fit(X_train,y_train)
    y_pred = modelr.predict(X_test)
    #We want F1, Precision and Recall scores
    score = [f1_score(y_test,y_pred),precision_score(y_test,y_pred),recall_score(y_test,y_pred)]


    return score



In [59]:
randfor(dataset1)

[0.9035159443990187, 0.9020408163265307, 0.904995904995905]

In [60]:
randfor(dataset2)

[0.9285140562248996, 0.9109535066981875, 0.9467649467649467]

In [61]:
randfor(dataset3)

[0.9311871227364186, 0.9153481012658228, 0.9475839475839476]

In [62]:
randfor(dataset4)

[0.9178082191780822, 0.9032513877874703, 0.9328419328419328]

In [63]:
randfor(dataset5)

[0.9265944645006017, 0.9080188679245284, 0.9459459459459459]

INPUT PARAMETERS

In [64]:
#We are gonna split the dataset(n) into X and y
X = dataset1.drop('stroke',axis =1)
y = dataset1['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size =0.75, test_size =0.25, shuffle = True, random_state=1)
#We want to improve input values
params = {'n_estimators': [6,7,8,9,10,11,12,13,14,15,16,17],'criterion':['gini','entropy']}
gridrandomforest = GridSearchCV(RandomForestClassifier(),param_grid= params, cv=5)
gridrandomforest.fit(X_train,y_train)
gridrandomforest.best_params_

{'criterion': 'gini', 'n_estimators': 8}