In [21]:
import pandas as pd 
import pickle
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [22]:
df = pd.read_csv("../data/raw/healthcare-dataset-stroke-data.csv")

In [23]:
# filling nan value of bmi with mean 
df['bmi'].fillna(df['bmi'].mean(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].mean(), inplace = True)


In [24]:
# encoding categorical datas
# initializing encoder
encoder = OneHotEncoder(sparse_output = False)

# columns to encode
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# training encoder 
encoder.fit(df[categorical_columns])

# encoding
encoded_columns = pd.DataFrame(encoder.transform(df[categorical_columns]))

# naming columns
encoded_columns.columns = encoder.get_feature_names_out(categorical_columns)

# concating with original dataframe
df = pd.concat([df.drop(categorical_columns, axis = 1), encoded_columns], axis = 1)

# saving encoder to encode data from user
with open('../models/encoder.pkl', 'wb') as fp: 
    pickle.dump(encoder, fp)

In [25]:
# Normalizing numerical features 
# initializing standared scaler
scaler = StandardScaler()

# features to scale
numerical_features = ['age', 'avg_glucose_level', 'bmi']

# training scaler
scaler.fit(df[numerical_features])

# scaling
df[numerical_features] = scaler.transform(df[numerical_features])

# saving model 
with open('../models/scaler.pkl', 'wb') as fp:
    pickle.dump(scaler, fp)

In [26]:
# spliting train test data
X = df.drop(['stroke', 'id'], axis = 1) # features
y = df['stroke'] # target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [27]:
# saving train data
X_train.to_csv('../data/model_training/X_train.csv', index=False)
X_test.to_csv('../data/model_training/X_test.csv', index=False)
y_train.to_csv('../data/model_training/y_train.csv', index=False)
y_test.to_csv('../data/model_training/y_test.csv', index=False)

In [28]:
X.columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'gender_Female', 'gender_Male', 'gender_Other', 'ever_married_No',
       'ever_married_Yes', 'work_type_Govt_job', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'Residence_type_Rural', 'Residence_type_Urban',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes'],
      dtype='object')