In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

In [2]:
path = r'Datasets/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(path)
data = pd.DataFrame(data)

In [3]:
columns_names = data.columns.tolist()
data[columns_names] = data[columns_names].values

In [4]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5110 non-null   object
 1   gender             5110 non-null   object
 2   age                5110 non-null   object
 3   hypertension       5110 non-null   object
 4   heart_disease      5110 non-null   object
 5   ever_married       5110 non-null   object
 6   work_type          5110 non-null   object
 7   Residence_type     5110 non-null   object
 8   avg_glucose_level  5110 non-null   object
 9   bmi                4909 non-null   object
 10  smoking_status     5110 non-null   object
 11  stroke             5110 non-null   object
dtypes: object(12)
memory usage: 479.2+ KB


In [6]:
unknown_count = (data['smoking_status'] == 'Unknown').sum()

In [7]:
mapping = {
    'formerly smoked': 1,
    'never smoked': 2,
    'smokes': 3
}
data['smoking_status'] = data['smoking_status'].map(mapping).replace('Unknown', np.nan)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   object 
 1   gender             5110 non-null   object 
 2   age                5110 non-null   object 
 3   hypertension       5110 non-null   object 
 4   heart_disease      5110 non-null   object 
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   object 
 9   bmi                4909 non-null   object 
 10  smoking_status     3566 non-null   float64
 11  stroke             5110 non-null   object 
dtypes: float64(1), object(11)
memory usage: 479.2+ KB


In [9]:
columns_to_impute = ['bmi', 'smoking_status']
imputer = KNNImputer(n_neighbors=6)
data[columns_to_impute] = imputer.fit_transform(data[columns_to_impute])

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   object 
 1   gender             5110 non-null   object 
 2   age                5110 non-null   object 
 3   hypertension       5110 non-null   object 
 4   heart_disease      5110 non-null   object 
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   object 
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   float64
 11  stroke             5110 non-null   object 
dtypes: float64(2), object(10)
memory usage: 479.2+ KB


In [11]:
data.drop('id', axis=1, inplace=True)

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   object 
 2   hypertension       5110 non-null   object 
 3   heart_disease      5110 non-null   object 
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   object 
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   float64
 10  stroke             5110 non-null   object 
dtypes: float64(2), object(9)
memory usage: 439.3+ KB


In [13]:
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.600000,1.0,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,29.533333,2.0,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.500000,2.0,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.400000,3.0,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.000000,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,29.533333,2.0,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.000000,2.0,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.600000,2.0,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.600000,1.0,0


In [14]:
columns_to_convert = ['age', 'avg_glucose_level']
data[columns_to_convert] = data[columns_to_convert].apply(pd.to_numeric)

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   object 
 3   heart_disease      5110 non-null   object 
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   float64
 10  stroke             5110 non-null   object 
dtypes: float64(4), object(7)
memory usage: 439.3+ KB


In [16]:
columns_to_encode = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
data = pd.concat([data.drop(columns=columns_to_encode), pd.get_dummies(data[columns_to_encode])], axis=1)

In [17]:
data

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,smoking_status,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,67.0,0,1,228.69,36.600000,1,1.0,False,True,False,False,True,False,False,True,False,False,False,True
1,61.0,0,0,202.21,29.533333,1,2.0,True,False,False,False,True,False,False,False,True,False,True,False
2,80.0,0,1,105.92,32.500000,1,2.0,False,True,False,False,True,False,False,True,False,False,True,False
3,49.0,0,0,171.23,34.400000,1,3.0,True,False,False,False,True,False,False,True,False,False,False,True
4,79.0,1,0,174.12,24.000000,1,2.0,True,False,False,False,True,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,80.0,1,0,83.75,29.533333,0,2.0,True,False,False,False,True,False,False,True,False,False,False,True
5106,81.0,0,0,125.20,40.000000,0,2.0,True,False,False,False,True,False,False,False,True,False,False,True
5107,35.0,0,0,82.99,30.600000,0,2.0,True,False,False,False,True,False,False,False,True,False,True,False
5108,51.0,0,0,166.29,25.600000,0,1.0,False,True,False,False,True,False,False,True,False,False,True,False


In [18]:
columns_names= data.columns.tolist()
mmn = MinMaxScaler()
data[columns_names] = mmn.fit_transform(data[columns_names])

In [19]:
data

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,smoking_status,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,0.816895,0.0,1.0,0.801265,0.301260,1.0,0.00,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.743652,0.0,0.0,0.679023,0.220313,1.0,0.50,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.975586,0.0,1.0,0.234512,0.254296,1.0,0.50,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.597168,0.0,0.0,0.536008,0.276060,1.0,1.00,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.963379,1.0,0.0,0.549349,0.156930,1.0,0.50,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,0.975586,1.0,0.0,0.132167,0.220313,0.0,0.50,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5106,0.987793,0.0,0.0,0.323516,0.340206,0.0,0.50,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
5107,0.426270,0.0,0.0,0.128658,0.232532,0.0,0.50,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
5108,0.621582,0.0,0.0,0.513203,0.175258,0.0,0.00,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
