In [1]:
import numpy as np
import pandas as pd

data_full=pd.read_csv('healthcare-dataset-stroke-data.csv')
data = data_full.copy()

In [2]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [4]:
print(data.isnull().sum())

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


# Handling Missing Values

In [5]:
# Impute BMI for stroke = 0
data.loc[data['stroke'] == 0, 'bmi'] = data.loc[data['stroke'] == 0, 'bmi'].fillna(
    data.loc[data['stroke'] == 0, 'bmi'].median()
)

# Impute BMI for stroke = 1
data.loc[data['stroke'] == 1, 'bmi'] = data.loc[data['stroke'] == 1, 'bmi'].fillna(
    data.loc[data['stroke'] == 1, 'bmi'].median()
)

print("Median BMI for stroke=0:", data.loc[data['stroke'] == 0, 'bmi'].median())
print("Median BMI for stroke=1:", data.loc[data['stroke'] == 1, 'bmi'].median())

Median BMI for stroke=0: 28.0
Median BMI for stroke=1: 29.7


In [6]:
print(data.isnull().sum())

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


# Data sanitization

In [7]:
data.loc[(data['gender'] != 'Female') & (data['gender'] != 'Male')]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
3116,56156,Other,26.0,0,0,No,Private,Rural,143.33,22.4,formerly smoked,0


In [8]:
# delete the problematic row
data = data[(data['gender'] == 'Male') | (data['gender'] == 'Female')]
data = data.reset_index(drop=True)

In [9]:
# Create a data backup the following steps
data_backup = data.copy() 

# Checking for unique values
data = data.drop(['id', 'stroke','age', 'avg_glucose_level', 'bmi'], axis=1)
for col in data.columns:
    print(f"Unique values in '{col}':")
    print(data[col].unique())
    print("-" * 40)


Unique values in 'gender':
['Male' 'Female']
----------------------------------------
Unique values in 'hypertension':
[0 1]
----------------------------------------
Unique values in 'heart_disease':
[1 0]
----------------------------------------
Unique values in 'ever_married':
['Yes' 'No']
----------------------------------------
Unique values in 'work_type':
['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
----------------------------------------
Unique values in 'Residence_type':
['Urban' 'Rural']
----------------------------------------
Unique values in 'smoking_status':
['formerly smoked' 'never smoked' 'smokes' 'Unknown']
----------------------------------------


# Categorical Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

# Restore dropped columns from backup
data = data_backup.copy()

# Drop only the target and id
data = data.drop(['id', 'stroke'], axis=1)

le = LabelEncoder()
data['gender'] = le.fit_transform(data['gender']) 
data['ever_married'] = le.fit_transform(data['ever_married'])  
data['Residence_type'] = le.fit_transform(data['Residence_type'])  

# One-hot encoding for multiclass features
data = pd.get_dummies(data, columns=['work_type', 'smoking_status'], drop_first=True)


In [11]:
# check data after encoding
print(data.columns)
print(data.head())

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'Residence_type', 'avg_glucose_level', 'bmi', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'],
      dtype='object')
   gender   age  hypertension  heart_disease  ever_married  Residence_type  \
0       1  67.0             0              1             1               1   
1       0  61.0             0              0             1               0   
2       1  80.0             0              1             1               0   
3       0  49.0             0              0             1               1   
4       0  79.0             1              0             1               0   

   avg_glucose_level   bmi  work_type_Never_worked  work_type_Private  \
0             228.69  36.6                   False               True   
1             202.21  29.7   

# Scaling/Normalization

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(data[['age', 'avg_glucose_level', 'bmi']])

print(data[['age', 'avg_glucose_level', 'bmi']].describe())

                age  avg_glucose_level           bmi
count  5.109000e+03       5.109000e+03  5.109000e+03
mean   7.231987e-17      -1.613289e-16  8.900907e-17
std    1.000098e+00       1.000098e+00  1.000098e+00
min   -1.908332e+00      -1.126761e+00 -2.412218e+00
25%   -8.062312e-01      -6.382516e-01 -6.588388e-01
50%    7.827984e-02      -3.149342e-01 -1.133431e-01
75%    7.858887e-01       1.755632e-01  5.100807e-01
max    1.714625e+00       3.657189e+00  8.926301e+00


In [13]:
# Check data after Scaling
print(data.columns)
print(data.head())

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'Residence_type', 'avg_glucose_level', 'bmi', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'],
      dtype='object')
   gender       age  hypertension  heart_disease  ever_married  \
0       1  1.051242             0              1             1   
1       0  0.785889             0              0             1   
2       1  1.626174             0              1             1   
3       0  0.255182             0              0             1   
4       0  1.581949             1              0             1   

   Residence_type  avg_glucose_level       bmi  work_type_Never_worked  \
0               1           2.706450  1.003624                   False   
1               0           2.121652  0.107453                   False   
2               0        

In [14]:
X = data 
y = data_backup['stroke']


In [15]:
X.to_csv("X_preprocessed.csv", index=False)
y.to_csv("y_labels.csv", index=False)