# Preparing dataset 13 for training (OHE and standardization)

## Import libraries and create the needed functions

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
def ohe_and_standized(df):
    """Standardize the numeric columns and one-hot encode the object columns of ``df``.

    Columns are routed by dtype:

    * ``int`` / ``float`` columns are scaled with ``StandardScaler``;
    * ``object`` columns are expanded with a dense ``OneHotEncoder``;
    * every other column (for example ``bool``) is passed through without
      scaling or encoding. Its values still end up in the common dtype of the
      output array, so booleans typically show up as 0.0 / 1.0.

    The transformers are fitted on ``df`` itself and are not returned, so the
    same scaling/encoding cannot be re-applied to another frame afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Transformed data carrying the index of ``df``. Columns are ordered as:
        scaled numeric columns, one-hot columns (named ``<column>_<category>``),
        then the passthrough columns in their original order.

    Raises
    ------
    ValueError
        If the generated output column names are not unique (for example a
        one-hot column name colliding with an existing column name).
    """
    # Separate numerical and categorical features by dtype.
    numerical_features = df.select_dtypes(include=['int', 'float']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # sparse_output=False makes the encoder return a dense array.
            ('cat', OneHotEncoder(sparse_output=False), categorical_features),
        ],
        remainder='passthrough',  # Preserve the remaining features
        # Keep plain names ("umur_ps", "sex_ps_laki-laki", ...) instead of
        # "num__umur_ps"-style prefixes, matching the historical column names.
        verbose_feature_names_out=False,
    )
    transformed_data = preprocessor.fit_transform(df)

    # Let the fitted transformer report its own output names instead of
    # rebuilding them by hand, so names can never drift out of sync with the
    # column order of the transformed array.
    feature_names = preprocessor.get_feature_names_out()

    # Rebuild a DataFrame that keeps the original row labels.
    return pd.DataFrame(transformed_data, columns=feature_names, index=df.index)

In [3]:
# Load the raw dataset 13 from a CSV file in the current working directory.
df = pd.read_csv('./dataset13.csv')
# Print dtypes and non-null counts to see which features have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3561 entries, 0 to 3560
Data columns (total 81 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   sex_ps               3561 non-null   object 
 1   umur_ps              3561 non-null   float64
 2   tgl_admisi           3561 non-null   float64
 3   jam_admisi           3561 non-null   int64  
 4   st_nikah             3561 non-null   object 
 5   etnis                1257 non-null   object 
 6   pekerjaan            3272 non-null   object 
 7   pendidikan           2620 non-null   object 
 8   alamat               3561 non-null   object 
 9   kelurahan            3561 non-null   object 
 10  kecamatan            3561 non-null   object 
 11  kota                 3561 non-null   object 
 12  diagnosa_sek         3561 non-null   object 
 13  DIAGN0               3561 non-null   object 
 14  onset                3547 non-null   float64
 15  tindakan             3561 non-null   o

In [4]:
# Preview the first rows of the raw data.
# NOTE(review): the preview prints the `alamat` column (street addresses) verbatim —
# presumably real patient data; consider redacting this output before sharing.
df.head()

Unnamed: 0,sex_ps,umur_ps,tgl_admisi,jam_admisi,st_nikah,etnis,pekerjaan,pendidikan,alamat,kelurahan,...,DM.uncontrolled,heart.disease,HT,HT.uncontrolled,renal.disease,V.coherent,V.num,GCS,GCS.cat,GCS.cat2
0,laki-laki,70.0,13797220000.0,18840,menikah,jawa,ASN/PNS/POLRI,S1,JL. SIAGA ID NO. 9 RT 010 RW 005 ...,PEJATEN BARAT ...,...,False,True,True,True,False,True,5,14,Mild,good
1,laki-laki,35.0,13797220000.0,53100,menikah,,Pekerja swasta,S1,KOMP. SENO II/L23 RT1/4 ...,PEJATEN TIMUR ...,...,False,False,True,True,False,True,5,15,Mild,good
2,laki-laki,61.0,13797220000.0,48960,menikah,,Pekerja swasta,SMA,JL. BUNI RAYA I NO.51 RT5/5 ...,BEJI TIMUR ...,...,True,False,True,True,False,True,5,15,Mild,good
3,laki-laki,56.0,13797220000.0,51540,menikah,,Pekerja swasta,D3,JL. DUTA HARAPAN IX NO.29 RT3/10 ...,HARAPAN BARU ...,...,True,False,True,True,False,True,5,15,Mild,good
4,perempuan,57.0,13797220000.0,52980,menikah,minang,ASN/PNS/POLRI,S1,JL TAMBAK II BLOK A NO 47 RT 03/05 ...,PEGANGSAAN ...,...,True,False,True,True,False,True,5,15,Mild,good


In [5]:
# Count missing values per feature and list only the affected features,
# most-missing first, to see which columns are too sparse to keep.
null_counts = df.isnull().sum()
features_with_nulls = null_counts[null_counts > 0].sort_values(ascending=False)
print("Features with null values (in decreasing order):\n", features_with_nulls)

Features with null values (in decreasing order):
 dtn                    3392
thn_riw_stroke         2885
jenis_riw_stroke       2697
d_dimer                2688
etnis                  2304
HBA1C                  1130
pendidikan              941
pekerjaan               289
G2PP                    257
as_urat                 219
GDP                     167
GDS                     140
hdl                     137
kol_total               135
trigliserida            132
ldl                     131
Ht                      104
Leukosit                101
Hb                       98
Trombosit                98
imt                      82
onset                    14
ekg                       6
kelas_rawat               5
kelas_bpjs                5
pembayaran                5
komplikasi_rawat          4
DM                        2
DM.uncontrolled           2
riw_dm                    2
riw_stroke_tia            2
stroke_in_evolution       1
transformasi              1
diastol                   

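### The features YEAR and Filename have constant values, so we don't need them

Note: the next cell does not drop `YEAR` or `Filename`; it drops the seven features with the most missing values from the list above (`dtn`, `thn_riw_stroke`, `jenis_riw_stroke`, `d_dimer`, `etnis`, `HBA1C`, `pendidikan`).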

In [6]:
# Remove the seven features with the most missing values.
df = df.drop(
    columns=[
        "dtn",
        "thn_riw_stroke",
        "jenis_riw_stroke",
        "d_dimer",
        "etnis",
        "HBA1C",
        "pendidikan",
    ]
)

In [7]:
# Number of rows that still contain at least one missing value.
df.isna().any(axis="columns").sum()

811

In [8]:
# Drop every row that still has a missing value in any remaining column.
df = df.dropna()
# Check the invariant explicitly; a bare expression in the middle of a cell is
# evaluated but never displayed.
assert not df.isnull().any(axis=1).any(), "dropna() left rows with missing values"
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2750 entries, 0 to 3560
Data columns (total 74 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   sex_ps               2750 non-null   object 
 1   umur_ps              2750 non-null   float64
 2   tgl_admisi           2750 non-null   float64
 3   jam_admisi           2750 non-null   int64  
 4   st_nikah             2750 non-null   object 
 5   pekerjaan            2750 non-null   object 
 6   alamat               2750 non-null   object 
 7   kelurahan            2750 non-null   object 
 8   kecamatan            2750 non-null   object 
 9   kota                 2750 non-null   object 
 10  diagnosa_sek         2750 non-null   object 
 11  DIAGN0               2750 non-null   object 
 12  onset                2750 non-null   float64
 13  tindakan             2750 non-null   object 
 14  riw_stroke_tia       2750 non-null   object 
 15  riw_ht               2750 non-null   object

In [9]:
# Standardize the numeric columns and one-hot encode the object columns.
# The helper keeps the original row labels, hence the gaps in the index below.
# NOTE(review): `GCS` is renamed to `Target` in a later cell; if it is a numeric
# column it is standardized here together with the features, and the GCS-derived
# columns (GCS.cat_*, GCS.cat2_*) stay in the frame — confirm this is intended.
# NOTE(review): free-text object columns such as `alamat` are one-hot encoded too,
# which presumably yields one column per distinct value — confirm this is wanted.
df = ohe_and_standized(df)
df.head()

Unnamed: 0,umur_ps,tgl_admisi,jam_admisi,onset,E,M,sistol,diastol,GDS,trigliserida,...,GCS.cat_Moderate,GCS.cat_Severe,GCS.cat2_bad-mid,GCS.cat2_good,death,heart.disease,HT,HT.uncontrolled,renal.disease,V.coherent
0,0.993436,-1.717068,-1.541896,-0.319059,-2.497488,0.26137,-0.63377,-0.637377,-0.223958,-0.313156,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
1,-2.198945,-1.717068,0.127955,-0.277551,0.240986,0.26137,-0.280039,0.024327,-0.786306,0.891098,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
2,0.172538,-1.717068,-0.073831,-0.244345,0.240986,0.26137,-0.63377,0.024327,-0.370168,-0.033909,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
4,-0.192306,-1.717068,0.122106,-0.036809,0.240986,0.26137,1.488615,1.347733,0.518342,1.013269,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
5,-0.921993,-1.717068,0.563696,-0.397923,0.240986,0.26137,0.073692,0.68603,-0.8088,-0.522592,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0


In [10]:
# Rename the GCS column to 'Target' (presumably the label used for training).
# errors='raise' fails loudly if 'GCS' is missing instead of silently writing
# a dataset without a 'Target' column; reassigning avoids in-place mutation.
df = df.rename(columns={'GCS': 'Target'}, errors='raise')

## Create a CSV file from the processed data

In [11]:
# Write the processed frame to CSV. index=False omits the row labels, so the
# original row numbers are not stored in the file.
# The file name is kept exactly as-is because other code may read it by this name.
df.to_csv('procesed_dataset_13.csv', index=False)