In [1]:
import os; os.chdir('../..') # changing to root directory of project
from config import config

In [19]:
import pandas as pd 
import pickle
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from utils.save_features import save_features

In [20]:
df = pd.read_csv("data/processed/stroke.csv")

In [21]:
# Feature selecting
target = 'stroke'
features = df.drop(target, axis = 1)
features

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,67.0,0,1,1,Private,Urban,228.69,36.600000,formerly smoked
1,0,61.0,0,0,1,Self-employed,Rural,202.21,28.893237,never smoked
2,1,80.0,0,1,1,Private,Rural,105.92,32.500000,never smoked
3,0,49.0,0,0,1,Private,Urban,171.23,34.400000,smokes
4,0,79.0,1,0,1,Self-employed,Rural,174.12,24.000000,never smoked
...,...,...,...,...,...,...,...,...,...,...
5105,0,80.0,1,0,1,Private,Urban,83.75,28.893237,never smoked
5106,0,81.0,0,0,1,Self-employed,Urban,125.20,40.000000,never smoked
5107,0,35.0,0,0,1,Self-employed,Rural,82.99,30.600000,never smoked
5108,1,51.0,0,0,1,Private,Rural,166.29,25.600000,formerly smoked


In [22]:
# saving features in json file
save_features(features, 'stroke')

Starting to process columns in feature_df...
Updating features config file
Succesfully updated features config file


In [23]:
# encoding categorical datas
# initializing encoder
encoder = OneHotEncoder(sparse_output = False)

# columns to encode
categorical_columns = features.select_dtypes(include = 'object').columns

# training encoder 
encoder.fit(features[categorical_columns])

# encoding
encoded_columns = pd.DataFrame(encoder.transform(features[categorical_columns]))

# naming columns
encoded_columns.columns = encoder.get_feature_names_out(categorical_columns)

# concating with original dataframe
features = pd.concat([features.drop(categorical_columns, axis = 1), encoded_columns], axis = 1)

# saving encoder to encode data from user
with open('models/stroke/encoder.pkl', 'wb') as fp: 
    pickle.dump(encoder, fp)

In [24]:
# Normalizing numerical features 
# initializing standared scaler
scaler = StandardScaler()

# features to scale
numerical_features = features.select_dtypes(include = ['int', 'float']).columns

# training scaler
scaler.fit(features[numerical_features])

# scaling
features[numerical_features] = scaler.transform(features[numerical_features])

# saving model 
with open('models/stroke/scaler.pkl', 'wb') as fp:
    pickle.dump(scaler, fp)

In [25]:
# spliting train test data
X = features # features
y = df['stroke'] # target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [26]:
# saving train data
X_train.to_csv('data/model_training/stroke/X_train.csv', index=False)
X_test.to_csv('data/model_training/stroke/X_test.csv', index=False)
y_train.to_csv('data/model_training/stroke/y_train.csv', index=False)
y_test.to_csv('data/model_training/stroke/y_test.csv', index=False)

In [27]:
print(categorical_columns)

Index(['work_type', 'Residence_type', 'smoking_status'], dtype='object')
