# Preparing the dataset 12 for training (OHE and standardizarion)

## Import libraries and create the needed functions

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
def ohe_and_standized(df):
    # Separate numerical and categorical features
    numerical_features = df.select_dtypes(include=['int', 'float']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    
    # Create transformers for preprocessing
    numerical_transformer = Pipeline([('scaler', StandardScaler())])
    categorical_transformer = Pipeline([('onehot', OneHotEncoder(sparse_output=False))])  # Set sparse=False to get dense array
    
    # Combine transformers into a preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'  # Preserve the remaining features
    )
        
    # Create a pipeline to apply preprocessing
    pipeline = Pipeline([('preprocessor', preprocessor)])
    transformed_data = pipeline.fit_transform(df)
    
    # Get feature names after preprocessing
    feature_names = []
    feature_names.extend(numerical_features)
    
    # Add one-hot encoded categorical feature names
    if 'cat' in preprocessor.named_transformers_ and categorical_features:
        one_hot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
        categorical_feature_names = one_hot_encoder.get_feature_names_out(input_features=categorical_features)
        feature_names.extend(categorical_feature_names)
    
    # Add any other remaining columns (e.g., columns you didn't process)
    remaining_cols = [col for col in df.columns if col not in numerical_features + categorical_features]
    feature_names.extend(remaining_cols)
    
    # Convert the transformed data back to a DataFrame for visualization
    transformed_df = pd.DataFrame(transformed_data, columns=feature_names)
    
    # set the indexes as the original dataset
    transformed_df = transformed_df.set_index(df.index)
    
    return transformed_df

In [3]:
df = pd.read_csv('./dataset12.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Columns: 110 entries, Year to NIHSS_group
dtypes: float64(85), int64(17), object(8)
memory usage: 138.5+ KB


In [4]:
df.head()

Unnamed: 0,Year,DEMOGRAPHY_age,DEMOGRAPHY_sex,DEMOGRAPHY_nationality,History_OldStroke,History_DM,History_HyperTension,History_IschemicHeartDisease,History_ArterFibrillation,History_HyperLypidAemia,...,RHDIF6,RHDIF7,HDIF1,HDIF2,HDIF3,HDIF4,HDIF5,HDIF6,HDIF7,NIHSS_group
0,2019,47,Female,Asian,0,0,1,0,0,0,...,85.237197,242.816319,-2.869094,-1.139046,-1.685841,8.555659,17.547685,-9.015824,50.492466,1.0
1,2019,73,Female,Arab,1,1,1,0,1,0,...,141.00027,46.203676,-0.158827,-2.364226,-0.504784,-1.616118,-4.607018,26.63306,54.486933,3.0
2,2019,34,Male,Asian,0,1,1,0,0,0,...,-143.718498,-134.980739,0.410083,0.829128,-4.263227,-5.922891,1.746738,-27.857171,-3.566275,1.0
3,2019,58,Male,Asian,0,0,0,0,0,0,...,8.80149,-137.998294,2.019633,-1.599083,-1.808742,10.717613,-7.342656,-6.340554,-80.36059,1.0
4,2019,45,Male,Asian,0,0,1,0,0,0,...,-141.806086,-199.602139,-4.143329,-3.51671,-5.919545,7.169438,-13.36747,28.695202,-30.056348,1.0


In [5]:
df.isnull().any(axis=1).sum()
null_counts = df.isnull().sum()
features_with_nulls = null_counts[null_counts > 0].sort_values(ascending=False)
print("Features with null values (in decreasing order):\n", features_with_nulls)

Features with null values (in decreasing order):
 Lab_Investigation_Trop I                      161
Lab_Investigation_international_norm_ratio    161
Lab_Investigation_C-reactive protein          161
Lab_Investigation_TotalCholeserol             161
Lab_Investigation_low-density_lipoprotein     161
                                             ... 
WDSP5                                           3
WDSP4                                           3
WDSP3                                           3
WDSP2                                           3
Day_Time                                        2
Length: 88, dtype: int64


In [6]:
df = df.drop(["Lab_Investigation_Trop I","Lab_Investigation_international_norm_ratio","Lab_Investigation_C-reactive protein","Lab_Investigation_TotalCholeserol","Lab_Investigation_low-density_lipoprotein","Lab_Investigation_POC_Random blood sugar","Lab_Investigation_Creatinine"],axis = 1)

In [7]:
df.isnull().any(axis=1).sum()

83

### We will try to predict NIHSS_group which is the sevarity of the stroke and will also remove all the instances with null values for NIHSS_group

In [8]:
df = df.dropna()
df.isnull().any(axis=1).sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78 entries, 0 to 158
Columns: 103 entries, Year to NIHSS_group
dtypes: float64(78), int64(17), object(8)
memory usage: 63.4+ KB


In [9]:
df = ohe_and_standized(df)
df.head()

Unnamed: 0,Year,DEMOGRAPHY_age,History_OldStroke,History_DM,History_HyperTension,History_IschemicHeartDisease,History_ArterFibrillation,History_HyperLypidAemia,History_Smoking,BMI,...,Clinical_Diagnosis_I61.5,Clinical_Diagnosis_I61.6,Clinical_Diagnosis_I61.8,Clinical_Diagnosis_I61.8.1,Clinical_Diagnosis_I61.9,Clinical_Diagnosis_I62.9,Day_Time_Afternoon,Day_Time_Evening,Day_Time_Morning,Day_Time_Night
0,1.19646,-0.481947,-0.261712,-0.834058,0.52791,-0.313993,-0.162221,-0.288675,-0.361158,-0.14897,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,1.19646,1.401266,3.820995,1.198958,0.52791,-0.313993,6.164414,-0.288675,-0.361158,-0.872982,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.19646,0.314797,-0.261712,-0.834058,-1.894264,-0.313993,-0.162221,-0.288675,-0.361158,0.002605,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.19646,-0.626809,-0.261712,-0.834058,0.52791,-0.313993,-0.162221,-0.288675,-0.361158,-0.648845,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,1.19646,1.03911,-0.261712,-0.834058,-1.894264,-0.313993,-0.162221,-0.288675,-0.361158,-1.003594,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
df.rename(columns={'NIHSS_group': 'Target'}, inplace=True)

## create a CSV file again from the procesed data

In [11]:
df.to_csv('procesed_dataset_12.csv', index=False)