# Preparing the datase5 for training (OHE and standardizarion)

## Import libraries and create the needed functions

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [6]:
def ohe_and_standized(df):
    # Separate numerical and categorical features
    numerical_features = df.select_dtypes(include=['int', 'float']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    
    # Create transformers for preprocessing
    numerical_transformer = Pipeline([('scaler', StandardScaler())])
    categorical_transformer = Pipeline([('onehot', OneHotEncoder(sparse_output=False))])  # Set sparse=False to get dense array
    
    # Combine transformers into a preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'  # Preserve the remaining features
    )
        
    # Create a pipeline to apply preprocessing
    pipeline = Pipeline([('preprocessor', preprocessor)])
    transformed_data = pipeline.fit_transform(df)
    
    # Get feature names after preprocessing
    feature_names = []
    feature_names.extend(numerical_features)
    
    # Add one-hot encoded categorical feature names
    if 'cat' in preprocessor.named_transformers_ and categorical_features:
        one_hot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
        categorical_feature_names = one_hot_encoder.get_feature_names_out(input_features=categorical_features)
        feature_names.extend(categorical_feature_names)
    
    # Add any other remaining columns (e.g., columns you didn't process)
    remaining_cols = [col for col in df.columns if col not in numerical_features + categorical_features]
    feature_names.extend(remaining_cols)
    
    # Convert the transformed data back to a DataFrame for visualization
    transformed_df = pd.DataFrame(transformed_data, columns=feature_names)
    
    # set the indexes as the original dataset
    transformed_df = transformed_df.set_index(df.index)
    
    return transformed_df

In [7]:
df = pd.read_csv('./dataset5.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            600 non-null    int64
 1   gen            600 non-null    int64
 2   smoking        600 non-null    int64
 3   heart_rate     600 non-null    int64
 4   chest_pain     600 non-null    int64
 5   cholesterol    600 non-null    int64
 6   bloodpressure  600 non-null    int64
 7   bloodsugar     600 non-null    int64
 8   stroke         600 non-null    int64
dtypes: int64(9)
memory usage: 42.3 KB


Unnamed: 0,age,gen,smoking,heart_rate,chest_pain,cholesterol,bloodpressure,bloodsugar,stroke
0,33,1,1,45,6,200,65,80,1
1,55,0,0,66,9,256,88,99,1
2,77,1,1,87,5,222,142,151,1
3,55,1,1,55,2,155,121,200,1
4,66,1,1,56,8,239,139,122,1


In [4]:
df = ohe_and_standized(df)
df.head()

Unnamed: 0,age,gen,smoking,heart_rate,chest_pain,cholesterol,bloodpressure,bloodsugar,stroke
0,-1.947404,0.654654,0.654654,-1.477117,0.25375,-0.69603,-1.750619,-1.087907,-1.248543
1,0.366552,-1.527525,-1.527525,-0.248299,1.341249,0.150851,-0.94372,-0.689637,-1.248543
2,2.680507,0.654654,0.654654,0.980519,-0.10875,-0.363327,0.950737,0.400367,-1.248543
3,0.366552,0.654654,0.654654,-0.891966,-1.196249,-1.376558,0.214003,1.427485,-1.248543
4,1.523529,0.654654,0.654654,-0.833451,0.978749,-0.106238,0.845489,-0.20752,-1.248543


In [8]:
df.rename(columns={'stroke': 'Target'}, inplace=True)

## create a CSV file again from the procesed data

In [9]:
df.to_csv('procesed_dataset_5.csv', index=False)