In [43]:
import pandas as pd
import numpy as np

In [44]:
# Location and name of the input CSV (path is relative to the working directory).
external_dataset_path = './datasets/'
data_filename = 'training_v2_top15hosp_gossis.csv'

# Forwarded to pd.read_csv; None means no CSV column is used as the index.
index_col = None

# Name of the hospital identifier column.
hospitalid_var = 'hospital_id'

# Identifier / outcome column names.
# NOTE(review): presumably the columns to keep out of the model features —
# not used in the cells visible here, confirm against downstream code.
var_other = [hospitalid_var, 'hospital_death', 'encounter_id', 'patient_id']

In [45]:
# Read the comma-separated training extract named by the config constants.
df_eicu = pd.read_csv(
    filepath_or_buffer=external_dataset_path + data_filename,
    sep=',',
    index_col=index_col,
)

In [36]:
# Column names of the dataset, grouped by category.
#
# 'outcome' maps to a single column name (a string); every other key maps to
# a list of column names.
#
# The 'vitals', 'labs' and 'labs_blood_gas' groups follow a regular
# `<window>_<signal>_<stat>` naming scheme, so they are generated instead of
# spelled out: all `d1_` columns first, then all `h1_` columns, each signal
# contributing its `_max` column followed by its `_min` column.
# NOTE(review): `d1`/`h1` presumably denote first-day / first-hour windows —
# confirm against the data dictionary.
FeatureGroups_gossis = {
    'outcome': 'hospital_death',
    'demographic': [
        'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender',
        'height', 'pre_icu_los_days', 'readmission_status', 'weight',
    ],
    'APACHE_covariate': [
        'albumin_apache', 'apache_2_diagnosis', 'apache_3j_diagnosis',
        'apache_post_operative', 'arf_apache', 'bilirubin_apache',
        'bun_apache', 'creatinine_apache', 'fio2_apache',
        'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache',
        'gcs_verbal_apache', 'glucose_apache', 'heart_rate_apache',
        'hematocrit_apache', 'intubated_apache', 'map_apache',
        'paco2_apache', 'paco2_for_ph_apache', 'pao2_apache', 'ph_apache',
        'resprate_apache', 'sodium_apache', 'temp_apache',
        'urineoutput_apache', 'ventilated_apache', 'wbc_apache',
    ],
    'vitals': [
        f'{window}_{signal}_{stat}'
        for window in ('d1', 'h1')
        for signal in (
            'diasbp_invasive', 'diasbp', 'diasbp_noninvasive',
            'heartrate',
            'mbp_invasive', 'mbp', 'mbp_noninvasive',
            'resprate', 'spo2',
            'sysbp_invasive', 'sysbp', 'sysbp_noninvasive',
            'temp',
        )
        for stat in ('max', 'min')
    ],
    'labs': [
        f'{window}_{signal}_{stat}'
        for window in ('d1', 'h1')
        for signal in (
            # 'hemaglobin' is spelled this way in the column names; keep it.
            'albumin', 'bilirubin', 'bun', 'calcium', 'creatinine',
            'glucose', 'hco3', 'hemaglobin', 'hematocrit', 'inr',
            'lactate', 'platelets', 'potassium', 'sodium', 'wbc',
        )
        for stat in ('max', 'min')
    ],
    'labs_blood_gas': [
        f'{window}_{signal}_{stat}'
        for window in ('d1', 'h1')
        for signal in ('arterial_pco2', 'arterial_ph', 'arterial_po2', 'pao2fio2ratio')
        for stat in ('max', 'min')
    ],
    'APACHE_prediction': ['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob'],
    'APACHE_comorbidity': [
        'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure',
        'immunosuppression', 'leukemia', 'lymphoma',
        'solid_tumor_with_metastasis',
    ],
    'APACHE_grouping': ['apache_3j_bodysystem', 'apache_2_bodysystem'],
}

In [37]:
# Work with the demographic feature group only.
features = FeatureGroups_gossis['demographic']
# Raw NumPy view of those columns.
# NOTE(review): 'ethnicity' and 'gender' are string columns, so this array is
# presumably object-typed rather than numeric — confirm before modelling.
x_train = df_eicu.loc[:, features].to_numpy()

In [38]:
# Display the selected feature columns for every row.
df_eicu.loc[:, features]

Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,pre_icu_los_days,readmission_status,weight
0,68.0,22.730000,0,Caucasian,M,180.3,0.541667,0,73.9
1,25.0,31.950000,0,Caucasian,F,172.7,0.000694,0,95.3
2,81.0,22.640000,1,Caucasian,F,165.1,0.000694,0,61.7
3,45.0,,0,Caucasian,M,170.2,0.009028,0,
4,50.0,25.710000,0,,M,175.3,0.060417,0,79.0
...,...,...,...,...,...,...,...,...,...
37906,56.0,29.936155,0,Caucasian,F,165.1,0.168750,0,81.6
37907,42.0,27.223993,0,Other/Unknown,M,180.3,0.000000,0,88.5
37908,55.0,44.475599,0,Caucasian,M,177.8,0.004861,0,140.6
37909,76.0,41.327572,0,Caucasian,M,175.3,0.091667,0,127.0


In [13]:
# Frequency of each ethnicity label, selected straight from the source frame
# (no need to build the intermediate feature sub-frame first).
# value_counts() skips missing values by default, so the counts can sum to
# fewer than the number of rows.
df_eicu['ethnicity'].value_counts()

Caucasian           30934
African American     4053
Other/Unknown        1215
Hispanic              781
Native American       358
Asian                 309
Name: ethnicity, dtype: int64

In [41]:
# Column names that result from one-hot encoding 'ethnicity' and 'gender'
# within the selected feature columns.
pd.get_dummies(
    data=df_eicu[features],
    columns=['ethnicity', 'gender'],
).columns

Index(['age', 'bmi', 'elective_surgery', 'height', 'pre_icu_los_days',
       'readmission_status', 'weight', 'ethnicity_African American',
       'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic',
       'ethnicity_Native American', 'ethnicity_Other/Unknown', 'gender_F',
       'gender_M'],
      dtype='object')

In [42]:
# Names of the non-numeric columns, as a set.
# Uses the public `select_dtypes` API rather than a private pandas helper.
# 'bool' is excluded alongside 'number' so that boolean columns are not
# reported as categorical.
categorical_cols = set(df_eicu.select_dtypes(exclude=['number', 'bool']).columns)
categorical_cols

{'apache_2_bodysystem',
 'apache_3j_bodysystem',
 'ethnicity',
 'gender',
 'hospital_admit_source',
 'icu_admit_source',
 'icu_stay_type',
 'icu_type'}

In [27]:
# Print "<number of distinct non-missing values> <column name>" for each
# categorical column (set iteration order, so the line order is arbitrary).
for column_name in categorical_cols:
    print(df_eicu[column_name].nunique(), column_name)

15 hospital_admit_source
3 icu_stay_type
5 icu_admit_source
2 gender
10 apache_2_bodysystem
8 icu_type
6 ethnicity
11 apache_3j_bodysystem


In [50]:
# One-hot encode 'ethnicity' and 'gender' on the full frame; all other
# columns are carried over unchanged.
# NOTE: with get_dummies' default dummy_na=False, rows whose ethnicity or
# gender is missing get 0 in every corresponding indicator column.
df_eicu_dummy = pd.get_dummies(
    data=df_eicu,
    columns=['ethnicity', 'gender'],
)

In [52]:
# Write the encoded frame next to the input data, without the index column.
df_eicu_dummy.to_csv(
    path_or_buf=external_dataset_path + 'training_v2_top15hosp_dummy_gossis.csv',
    index=False,
)

In [58]:
# Number of rows per hospital, most frequent first. The column name comes
# from the `hospitalid_var` config constant instead of being repeated here.
df_eicu[hospitalid_var].value_counts()

118    4333
19     3925
188    3095
161    2792
70     2754
196    2730
176    2583
21     2470
194    2258
174    2225
100    2141
55     1909
185    1744
79     1510
18     1442
Name: hospital_id, dtype: int64