# Data Preprocessing

In [1]:
import pandas as pd

# Location of the raw Kaggle stroke dataset, relative to this notebook
RAW_CSV_PATH = "../data/raw/Kaggle-Stroke-Dataset/healthcare-dataset-stroke-data.csv"

# Load the raw data and preview the first rows
df = pd.read_csv(RAW_CSV_PATH)
display(df.head())


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [2]:
# Work on a copy so the raw frame `df` loaded above is left untouched
df_processed = df.copy()

In [3]:
# Shape (rows, columns) before any preprocessing step is applied
display(df_processed.shape)

(5110, 12)

In [4]:
#
# Drop unnecessary columns
#
# - "id"
# - "avg_glucose_level" (most users will not have this information)
#
# The frame is reassigned instead of dropped in place, and errors="ignore" is used,
# so that re-running this cell is a no-op rather than a KeyError on columns that
# were already removed by a previous execution.
#

df_processed = df_processed.drop(columns=["id", "avg_glucose_level"], errors="ignore")


In [5]:
#
# Drop rows belonging to categories that may introduce bias
#

# "gender" == "Other": 1 row
# (in the app, will ask the user for their biological sex)
keep_gender = df_processed["gender"].ne("Other")
df_processed = df_processed.loc[keep_gender]

# "work_type" == "Never_worked": 22 rows
keep_work_type = df_processed["work_type"].ne("Never_worked")
df_processed = df_processed.loc[keep_work_type]



In [6]:
#
# Encode categorical features as integers (binary mapping + one-hot encoding)
#

# Two-level columns are mapped to 0/1.
# Any value that is not a key of its mapping becomes NaN.
binary_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'ever_married': {'No': 0, 'Yes': 1},
    'Residence_type': {'Rural': 0, 'Urban': 1},
}
for column, codes in binary_codes.items():
    df_processed[column] = df_processed[column].map(codes)

# Multi-level columns are one-hot encoded; the first level of each column is
# dropped (drop_first=True) and acts as the reference category.
df_processed = pd.get_dummies(
    df_processed,
    columns=["work_type", "smoking_status"],
    prefix=["work", "smoking"],
    drop_first=True,
    dtype=int,
)

In [7]:
#
# Handle missing values:
# 201 rows with missing values in "bmi" column --> median imputation grouped by age bins.
#
# NOTE(review): the medians are computed on the whole frame, i.e. before the
# train-test split performed later in this notebook, so rows that end up in the
# test set contribute to the imputed values.
#

# Left-closed age bins: [0, 18), [18, 30), ... The last bin is open-ended so the
# '75+' label covers every age >= 75; a hard upper edge of 100 would leave
# ages >= 100 without a bin (and without a median to impute from).
bins = [0, 18, 30, 45, 60, 75, float("inf")]
labels = ['0-17', '18-29', '30-44', '45-59', '60-74', '75+']

# Kept as a standalone Series (not a temporary column) so the frame's columns
# are never changed by this cell.
age_bin = pd.cut(df_processed['age'], bins=bins, labels=labels, right=False).rename('age_bin')

# Compute median BMI per age bin
bmi_by_age_bin = df_processed.groupby(age_bin, observed=False)['bmi']
median_bmi = bmi_by_age_bin.median()

print("Median BMI per age bin (ie. values that will be used to impute missing BMI):")
print(median_bmi)

# Impute missing BMI: transform('median') broadcasts each bin's median back onto
# its rows, and fillna only touches the rows where "bmi" is missing.
df_processed['bmi'] = df_processed['bmi'].fillna(bmi_by_age_bin.transform('median'))

Median BMI per age bin (ie. values that will be used to impute missing BMI):
age_bin
0-17     19.8
18-29    26.4
30-44    29.9
45-59    30.3
60-74    30.0
75+      28.0
Name: bmi, dtype: float64



## Feature engineering

In [8]:
#
# Some options:
# - Encode smoking status as ordinal. E.g.: never smoked (0) < unknown (1) < formerly smoked (2) < smokes (3)
# - Age binning 
# - Interaction features. E.g. age * (hypertension + heart_disease), to capture interaction effects
#


## Check missing data and data types

In [9]:
#
# Count the missing values left in each column
#

df_processed.isna().sum()

gender                     0
age                        0
hypertension               0
heart_disease              0
ever_married               0
Residence_type             0
bmi                        0
stroke                     0
work_Private               0
work_Self-employed         0
work_children              0
smoking_formerly smoked    0
smoking_never smoked       0
smoking_smokes             0
dtype: int64

In [10]:
#
# Check the dtype of every column
# (head and shape are displayed in the next cell)
#

display(df_processed.dtypes)


gender                       int64
age                        float64
hypertension                 int64
heart_disease                int64
ever_married                 int64
Residence_type               int64
bmi                        float64
stroke                       int64
work_Private                 int64
work_Self-employed           int64
work_children                int64
smoking_formerly smoked      int64
smoking_never smoked         int64
smoking_smokes               int64
dtype: object

In [11]:
# Show the first 10 rows, then the shape; each is followed by a blank-line spacer
for preview in (df_processed.head(10), df_processed.shape):
    display(preview)
    print("\n")

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,bmi,stroke,work_Private,work_Self-employed,work_children,smoking_formerly smoked,smoking_never smoked,smoking_smokes
0,0,67.0,0,1,1,1,36.6,1,1,0,0,1,0,0
1,1,61.0,0,0,1,0,30.0,1,0,1,0,0,1,0
2,0,80.0,0,1,1,0,32.5,1,1,0,0,0,1,0
3,1,49.0,0,0,1,1,34.4,1,1,0,0,0,0,1
4,1,79.0,1,0,1,0,24.0,1,0,1,0,0,1,0
5,0,81.0,0,0,1,1,29.0,1,1,0,0,1,0,0
6,0,74.0,1,1,1,0,27.4,1,1,0,0,0,1,0
7,1,69.0,0,0,0,1,22.8,1,1,0,0,0,1,0
8,1,59.0,0,0,1,0,30.3,1,1,0,0,0,0,0
9,1,78.0,0,0,1,1,24.2,1,1,0,0,0,0,0






(5087, 14)





## Train-test split + Feature scaling

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#
# Train-test split
#

# Separate the target column from the feature columns
TARGET = "stroke"
y = df_processed[TARGET]
X = df_processed.drop(columns=[TARGET])

# 80/20 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [13]:
#
# Standardize the features
#
# Only the continuous columns are scaled. The scaler is fitted on the training
# split and the test split is transformed with the training statistics.
#

numeric_cols = ["age", "bmi"]

# Rebuild both splits from the untouched feature frame `X` before scaling.
# The assignments below overwrite "age"/"bmi", so without this step a re-run of
# the cell would fit the scaler on already-standardized values and the scaler
# exported later would no longer describe the raw data.
X_train = X.loc[X_train.index].copy()
X_test = X.loc[X_test.index].copy()

scaler = StandardScaler()

X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [14]:
# Preview the training features after "age" and "bmi" have been standardized
X_train.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,bmi,work_Private,work_Self-employed,work_children,smoking_formerly smoked,smoking_never smoked,smoking_smokes
849,1,1.199622,1,0,1,1,0.702202,0,1,0,1,0,0
3236,1,0.490822,0,0,1,0,1.924045,1,0,0,1,0,0
1915,1,1.642622,0,0,1,0,-0.275272,1,0,0,0,1,0
3930,1,0.313622,0,0,1,0,0.097711,0,0,0,0,1,0
4442,0,0.357922,0,0,1,0,-0.493918,1,0,0,1,0,0
2137,0,-0.838179,0,0,0,1,-0.85404,1,0,0,0,0,0
1885,0,-0.793879,0,0,0,1,-1.085547,1,0,0,0,0,1
3700,1,0.668022,0,0,1,0,1.139493,1,0,0,0,1,0
4824,1,1.686922,0,0,1,0,1.036601,1,0,0,0,0,0
3801,1,-0.660979,0,0,1,1,-0.558225,1,0,0,0,1,0



<br><br>


## Export

In [15]:
import pickle
from pathlib import Path

# Output locations, relative to this notebook
PROCESSED_DIR = Path("../data/processed")
MODELS_DIR = Path("../models")

# Create the output directories if they are missing, so the export does not
# fail when one of them does not exist yet.
for out_dir in (PROCESSED_DIR, MODELS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# processed DataFrame (csv)
df_processed.to_csv(PROCESSED_DIR / "df_processed.csv", index=False)

# train and test sets, each written as pickle and as csv (csv without the index)
splits = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}
for split_name, split_data in splits.items():
    split_data.to_pickle(PROCESSED_DIR / f"{split_name}.pkl")
    split_data.to_csv(PROCESSED_DIR / f"{split_name}.csv", index=False)

# scaler (pickle)
with open(MODELS_DIR / "scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)


print("Data processing complete:")
print("- Processed data saved to ../data/processed/df_processed.csv")
print("- Train/test sets saved as pickle files.")
print("\n")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Data processing complete:
- Processed data saved to ../data/processed/df_processed.csv
- Train/test sets saved as pickle files.


X_train shape: (4069, 13)
X_test shape: (1018, 13)
y_train shape: (4069,)
y_test shape: (1018,)
