In [17]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [18]:
df = pd.read_csv("../data/raw/healthcare-dataset-stroke-data.csv")

In [19]:
# filling nan value of bmi with mean 
df['bmi'].fillna(df['bmi'].mean(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].mean(), inplace = True)


In [20]:
# encoding categorical datas
# initializing encoder
encoder = OneHotEncoder(sparse_output = False)

# columns to encode
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# encoding
encoded_columns = pd.DataFrame(encoder.fit_transform(df[categorical_columns]))

# naming columns
encoded_columns.columns = encoder.get_feature_names_out(categorical_columns)

# concating with original dataframe
df = pd.concat([df.drop(categorical_columns, axis = 1), encoded_columns], axis = 1)

In [21]:
# Normalizing numerical features 
# initializing standared scaler
scaler = StandardScaler()

# features to scale
numerical_features = ['age', 'avg_glucose_level', 'bmi']

# scaling
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [22]:
# spliting train test data
X = df.drop(['stroke', 'id'], axis = 1) # features
y = df['stroke'] # target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [23]:
# saving train data
X_train.to_csv('../data/model_training/X_train.csv', index=False)
X_test.to_csv('../data/model_training/X_test.csv', index=False)
y_train.to_csv('../data/model_training/y_train.csv', index=False)
y_test.to_csv('../data/model_training/y_test.csv', index=False)