In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load the raw dataset and drop the identifier column `id` in the same
# expression, so `df` never exists in a half-processed state.
df = pd.read_csv('dataset.csv').drop(columns='id')

# Summary statistics for every column (numeric and non-numeric alike).
df.describe(include='all')


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,43400,43400.0,43400.0,43400.0,43400,43400,43400,43400.0,41938.0,30108,43400.0
unique,3,,,,2,5,2,,,3,
top,Female,,,,Yes,Private,Urban,,,never smoked,
freq,25665,,,,27938,24834,21756,,,16053,
mean,,42.217894,0.093571,0.047512,,,,104.48275,28.605038,,0.018041
std,,22.519649,0.291235,0.212733,,,,43.111751,7.77002,,0.133103
min,,0.08,0.0,0.0,,,,55.0,10.1,,0.0
25%,,24.0,0.0,0.0,,,,77.54,23.2,,0.0
50%,,44.0,0.0,0.0,,,,91.58,27.7,,0.0
75%,,60.0,0.0,0.0,,,,112.07,32.9,,0.0


In [9]:

# Separate the target column from the predictors.
y = df['stroke']
x = df.drop(columns='stroke')

# Hold out 20% of the rows as the test set. The split is stratified on the
# target and uses a fixed seed so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Column groups, one per preprocessing strategy.
# NOTE: `categorcial_features` keeps its original (misspelled) name because
# the preprocessing cell further down refers to it by that name.
binary_features = ['hypertension', 'heart_disease']
ordinal_features = ['smoking_status']
categorcial_features = ['gender', 'ever_married', 'work_type', 'Residence_type']
numerical_features = ['age', 'bmi', 'avg_glucose_level']


In [None]:
# Category order handed to the ordinal encoder: position in this list
# becomes the encoded value (0, 1, 2).
smoking_order = ['never smoked', 'formerly smoked', 'smokes']

# Build one preprocessing pipeline per feature type.

# Numeric features: fill missing values with the column median (bmi has
# missing entries), then standardize so every numeric feature is on the
# same scale. The scaler step was previously only a placeholder comment.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Nominal categorical features: fill missing values with the most frequent
# category, then one-hot encode into a dense array. Categories not seen
# during fit are ignored instead of raising an error.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Ordinal feature: fill missing values with the most frequent category,
# then map categories to integers following `smoking_order`. Any category
# outside that list is encoded as -1.
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(
        categories=[smoking_order],
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )),
])

# Combine the per-type pipelines. Binary features are passed through
# unchanged; any column not listed in a transformer is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorcial_features),
        ('ord', ordinal_pipeline, ordinal_features),
        ('bin', 'passthrough', binary_features),
    ],
    remainder='drop',
)

Formato original de x_train: (34720, 10)
Formato de x_train processado: (34720, 18)
Formato original de x_test: (8680, 10)
Formato de x_test processado: (8680, 18)
