# eICU Data Joining
---

Reading and joining all preprocessed parts of the eICU dataset from MIT with the data from over 139k patients collected in the US.

The main goal of this notebook is to prepare a single parquet document that contains all the relevant data to be used when training a machine learning model that predicts mortality, joining tables, filtering useless columns and performing imputation.

## Importing the necessary packages

In [1]:
import os                                  # os handles directory/workspace changes
import numpy as np                         # NumPy to handle numeric and NaN operations
import yaml                                # Save and load YAML files

In [2]:
# Debugging packages
import pixiedust                           # Debugging in Jupyter Notebook cells

Pixiedust database opened successfully


In [3]:
# Change to parent directory (presumably "Documents")
os.chdir("../../..")
# Path to the parquet dataset files
data_path = 'Datasets/Thesis/eICU/uncompressed/cleaned/'
# Path to the code files
project_path = 'GitHub/eICU-mortality-prediction/'

In [4]:
# # Make sure that every large operation can be handled, by using the disk as an overflow for the memory
# !export MODIN_OUT_OF_CORE=true
# # Another trick to do with Pandas so as to be able to allocate bigger objects to memory
# !sudo bash -c 'echo 1 > /proc/sys/vm/overcommit_memory'

In [5]:
import modin.pandas as mpd                  # Optimized distributed version of Pandas
import pandas as pd
import data_utils as du                    # Data science and machine learning relevant methods



Allow pandas to show more columns:

In [6]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

Set the random seed for reproducibility

In [7]:
du.set_random_seed(42)

## Initializing variables

In [8]:
dtype_dict = {'patientunitstayid': 'uint32',
              'gender': 'UInt8',
              'age': 'float32',
              'admissionheight': 'float32',
              'admissionweight': 'float32',
              'death_ts': 'Int32',
              'ts': 'int32',
              'cad': 'UInt8',
              'cancer': 'UInt8',
              # 'diagnosis_type_1': 'UInt64',
              # 'diagnosis_disorder_2': 'UInt64',
              # 'diagnosis_detailed_3': 'UInt64',
              # 'allergyname': 'UInt64',
              # 'drugallergyhiclseqno': 'UInt64',
              # 'pasthistoryvalue': 'UInt64',
              # 'pasthistorytype': 'UInt64',
              # 'pasthistorydetails': 'UInt64',
              # 'treatmenttype': 'UInt64',
              # 'treatmenttherapy': 'UInt64',
              # 'treatmentdetails': 'UInt64',
              # 'drugunit_x': 'UInt64',
              # 'drugadmitfrequency_x': 'UInt64',
              # 'drughiclseqno_x': 'UInt64',
              'drugdosage_x': 'float32',
              # 'drugadmitfrequency_y': 'UInt64',
              # 'drughiclseqno_y': 'UInt64',
              'drugdosage_y': 'float32',
              # 'drugunit_y': 'UInt64',
              'bodyweight_(kg)': 'float32',
              'oral_intake': 'float32',
              'urine_output': 'float32',
              'i.v._intake	': 'float32',
              'saline_flush_(ml)_intake': 'float32',
              'volume_given_ml': 'float32',
              'stool_output': 'float32',
              'prbcs_intake': 'float32',
              'gastric_(ng)_output': 'float32',
              'dialysis_output': 'float32',
              'propofol_intake': 'float32',
              'lr_intake': 'float32',
              'indwellingcatheter_output': 'float32',
              'feeding_tube_flush_ml': 'float32',
              'patient_fluid_removal': 'float32',
              'fentanyl_intake': 'float32',
              'norepinephrine_intake': 'float32',
              'crystalloids_intake': 'float32',
              'voided_amount': 'float32',
              'nutrition_total_intake': 'float32',
              # 'nutrition': 'UInt64',
              # 'nurse_treatments': 'UInt64',
              # 'hygiene/adls': 'UInt64',
              # 'activity': 'UInt64',
              # 'pupils': 'UInt64',
              # 'neurologic': 'UInt64',
              # 'secretions': 'UInt64',
              # 'cough': 'UInt64',
              'priorvent': 'UInt8',
              'onvent': 'UInt8',
              'noninvasivesystolic': 'float32',
              'noninvasivediastolic': 'float32',
              'noninvasivemean': 'float32',
              'paop': 'float32',
              'cardiacoutput': 'float32',
              'cardiacinput': 'float32',
              'svr': 'float32',
              'svri': 'float32',
              'pvr': 'float32',
              'pvri': 'float32',
              'temperature': 'float32',
              'sao2': 'float32',
              'heartrate': 'float32',
              'respiration': 'float32',
              'cvp': 'float32',
              'etco2': 'float32',
              'systemicsystolic': 'float32',
              'systemicdiastolic': 'float32',
              'systemicmean': 'float32',
              'pasystolic': 'float32',
              'padiastolic': 'float32',
              'pamean': 'float32',
              'st1': 'float32',
              'st2': 'float32',
              'st3': 'float32',
              'icp': 'float32',
              # 'labtypeid': 'UInt64',
              # 'labname': 'UInt64',
              # 'lab_units': 'UInt64',
              'lab_result': 'float32'}

Load the lists of one hot encoded columns:

In [9]:
# stream_adms_drug = open(f'{data_path}cat_feat_ohe_adms_drug.yaml', 'r')
# stream_inf_drug = open(f'{data_path}cat_feat_ohe_inf_drug.yaml', 'r')
stream_med = open(f'{data_path}cat_feat_ohe_med.yml', 'r')
stream_treat = open(f'{data_path}cat_feat_ohe_treat.yml', 'r')
stream_diag = open(f'{data_path}cat_feat_ohe_diag.yml', 'r')
# stream_alrg = open(f'{data_path}cat_feat_ohe_alrg.yml', 'r')
stream_past_hist = open(f'{data_path}cat_feat_ohe_past_hist.yml', 'r')
stream_lab = open(f'{data_path}cat_feat_ohe_lab.yml', 'r')
stream_patient = open(f'{data_path}cat_feat_ohe_patient.yml', 'r')
stream_notes = open(f'{data_path}cat_feat_ohe_note.yml', 'r')

In [10]:
# cat_feat_ohe_adms_drug = yaml.load(stream_adms_drug, Loader=yaml.FullLoader)
# cat_feat_ohe_inf_drug = yaml.load(stream_inf_drug, Loader=yaml.FullLoader)
cat_feat_ohe_med = yaml.load(stream_med, Loader=yaml.FullLoader)
cat_feat_ohe_treat = yaml.load(stream_treat, Loader=yaml.FullLoader)
cat_feat_ohe_diag = yaml.load(stream_diag, Loader=yaml.FullLoader)
# cat_feat_ohe_alrg = yaml.load(stream_alrg, Loader=yaml.FullLoader)
cat_feat_ohe_past_hist = yaml.load(stream_past_hist, Loader=yaml.FullLoader)
cat_feat_ohe_lab = yaml.load(stream_lab, Loader=yaml.FullLoader)
cat_feat_ohe_patient = yaml.load(stream_patient, Loader=yaml.FullLoader)
cat_feat_ohe_notes = yaml.load(stream_notes, Loader=yaml.FullLoader)

In [11]:
cat_feat_ohe = du.utils.merge_dicts([cat_feat_ohe_med, cat_feat_ohe_treat,
                                     cat_feat_ohe_diag,
                                     cat_feat_ohe_past_hist, cat_feat_ohe_lab,
                                     cat_feat_ohe_patient, cat_feat_ohe_notes])

In [12]:
ohe_columns = du.utils.merge_lists(list(cat_feat_ohe.values()))
ohe_columns = du.data_processing.clean_naming(ohe_columns, lower_case=False)

Add the one hot encoded columns to the dtypes dictionary, specifying them with type `UInt8`

In [13]:
for col in ohe_columns:
    dtype_dict[col] = 'UInt8'

In [15]:
dtype_dict

{'patientunitstayid': 'uint32',
 'gender': 'UInt8',
 'age': 'float32',
 'admissionheight': 'float32',
 'admissionweight': 'float32',
 'death_ts': 'Int32',
 'ts': 'int32',
 'cad': 'UInt8',
 'cancer': 'UInt8',
 'drugdosage_x': 'float32',
 'drugdosage_y': 'float32',
 'bodyweight_(kg)': 'float32',
 'oral_intake': 'float32',
 'urine_output': 'float32',
 'i.v._intake\t': 'float32',
 'saline_flush_(ml)_intake': 'float32',
 'volume_given_ml': 'float32',
 'stool_output': 'float32',
 'prbcs_intake': 'float32',
 'gastric_(ng)_output': 'float32',
 'dialysis_output': 'float32',
 'propofol_intake': 'float32',
 'lr_intake': 'float32',
 'indwellingcatheter_output': 'float32',
 'feeding_tube_flush_ml': 'float32',
 'patient_fluid_removal': 'float32',
 'fentanyl_intake': 'float32',
 'norepinephrine_intake': 'float32',
 'crystalloids_intake': 'float32',
 'voided_amount': 'float32',
 'nutrition_total_intake': 'float32',
 'priorvent': 'UInt8',
 'onvent': 'UInt8',
 'noninvasivesystolic': 'float32',
 'noninva

In [14]:
dtype_dict['ethanol_use_heavy']

KeyError: 'ethanol_use_heavy'

In [17]:
dtype_dict['Ethanol_Use_heavy']

'UInt8'

In [16]:
cat_feat_ohe_notes

{'Ethanol Use': ['Ethanol Use_heavy',
  'Ethanol Use_moderate_(daily)',
  'Ethanol Use_rare',
  'Ethanol Use_hx_withdrawal_seizures',
  'Ethanol Use_none',
  'Ethanol Use_moderate_(not_daily)',
  'Ethanol Use_hx_delirium_tremens_and_withdrawal_seizures',
  'Ethanol Use_hx_delirium_tremens'],
 'Smoking Status': ['Smoking Status_20_-_40_pack_years_-_still_smoking',
  'Smoking Status_denies_smoking',
  'Smoking Status_20_-_40_pack_years_-_remote_hx_of_smoking',
  'Smoking Status_>_40_pack_years_-_remote_hx_of_smoking',
  'Smoking Status_uses_smokeless_tobacco',
  'Smoking Status_<_20_pack_years_-_remote_hx_of_smoking',
  'Smoking Status_smokes_cigar_or_pipe',
  'Smoking Status_<_20_pack_years_-_still_smoking',
  'Smoking Status_>_40_pack_years_-_still_smoking']}

## Loading the data

### Patient information

In [None]:
patient_df = mpd.read_csv(f'{data_path}normalized/ohe/patient.csv', dtype=dtype_dict)
patient_df = du.utils.convert_dataframe(patient_df, to='pandas', return_library=False, dtypes=dtype_dict)
patient_df = patient_df.drop(columns='Unnamed: 0')
patient_df.head()

In [None]:
note_df = mpd.read_csv(f'{data_path}normalized/ohe/note.csv', dtype=dtype_dict)
note_df = du.utils.convert_dataframe(note_df, to='pandas', return_library=False, dtypes=dtype_dict)
note_df = note_df.drop(columns='Unnamed: 0')
note_df.head()

### Diagnosis

In [None]:
diagns_df = mpd.read_csv(f'{data_path}normalized/ohe/diagnosis.csv', dtype=dtype_dict)
diagns_df = du.utils.convert_dataframe(diagns_df, to='pandas', return_library=False, dtypes=dtype_dict)
diagns_df = diagns_df.drop(columns='Unnamed: 0')
diagns_df.head()

In [None]:
# alrg_df = mpd.read_csv(f'{data_path}normalized/ohe/allergy.csv', dtype=dtype_dict)
# alrg_df = du.utils.convert_dataframe(alrg_df, to='pandas', return_library=False, dtypes=dtype_dict)
# alrg_df = alrg_df.drop(columns='Unnamed: 0')
# alrg_df.head()

In [None]:
past_hist_df = mpd.read_csv(f'{data_path}normalized/ohe/pastHistory.csv', dtype=dtype_dict)
past_hist_df = du.utils.convert_dataframe(past_hist_df, to='pandas', return_library=False, dtypes=dtype_dict)
past_hist_df = past_hist_df.drop(columns='Unnamed: 0')
past_hist_df.head()

### Treatments

In [None]:
treat_df = mpd.read_csv(f'{data_path}normalized/ohe/treatment.csv', dtype=dtype_dict)
treat_df = du.utils.convert_dataframe(treat_df, to='pandas', return_library=False, dtypes=dtype_dict)
treat_df = treat_df.drop(columns='Unnamed: 0')
treat_df.head()

In [None]:
adms_drug_df = mpd.read_csv(f'{data_path}normalized/ohe/admissionDrug.csv', dtype=dtype_dict)
adms_drug_df = du.utils.convert_dataframe(adms_drug_df, to='pandas', return_library=False, dtypes=dtype_dict)
adms_drug_df = adms_drug_df.drop(columns='Unnamed: 0')
adms_drug_df.head()

In [None]:
# inf_drug_df = mpd.read_csv(f'{data_path}normalized/ohe/infusionDrug.csv', dtype=dtype_dict)
# inf_drug_df = du.utils.convert_dataframe(inf_drug_df, to='pandas', return_library=False, dtypes=dtype_dict)
# inf_drug_df = inf_drug_df.drop(columns='Unnamed: 0')
# inf_drug_df.head()

In [None]:
med_df = mpd.read_csv(f'{data_path}normalized/ohe/medication.csv', dtype=dtype_dict)
med_df = du.utils.convert_dataframe(med_df, to='pandas', return_library=False, dtypes=dtype_dict)
med_df = med_df.drop(columns='Unnamed: 0')
med_df.head()

In [None]:
# in_out_df = mpd.read_csv(f'{data_path}normalized/intakeOutput.csv', dtype=dtype_dict)
# in_out_df = du.utils.convert_dataframe(in_out_df, to='pandas', return_library=False, dtypes=dtype_dict)
# in_out_df = in_out_df.drop(columns='Unnamed: 0')
# in_out_df.head()

### Nursing data

In [None]:
# nurse_care_df = mpd.read_csv(f'{data_path}normalized/ohe/nurseCare.csv')
# nurse_care_df.head()

In [None]:
# nurse_assess_df = mpd.read_csv(f'{data_path}normalized/ohe/nurseAssessment.csv')
# nurse_assess_df.head()

Remove the uneeded 'Unnamed: 0' column:

In [None]:
# nurse_care_df = nurse_care_df.drop(columns='Unnamed: 0')
# nurse_assess_df = nurse_assess_df.drop(columns='Unnamed: 0')

### Respiratory data

In [None]:
resp_care_df = mpd.read_csv(f'{data_path}normalized/ohe/respiratoryCare.csv', dtype=dtype_dict)
resp_care_df = du.utils.convert_dataframe(resp_care_df, to='pandas', return_library=False, dtypes=dtype_dict)
resp_care_df = resp_care_df.drop(columns='Unnamed: 0')
resp_care_df.head()

### Vital signals

In [None]:
vital_aprdc_df = mpd.read_csv(f'{data_path}normalized/vitalAperiodic.csv', dtype=dtype_dict)
vital_aprdc_df = du.utils.convert_dataframe(vital_aprdc_df, to='pandas', return_library=False, dtypes=dtype_dict)
vital_aprdc_df = vital_aprdc_df.drop(columns='Unnamed: 0')
vital_aprdc_df.head()

In [None]:
vital_prdc_df = mpd.read_csv(f'{data_path}normalized/vitalPeriodic.csv', dtype=dtype_dict)
vital_prdc_df = du.utils.convert_dataframe(vital_prdc_df, to='pandas', return_library=False, dtypes=dtype_dict)
vital_prdc_df = vital_prdc_df.drop(columns='Unnamed: 0')
vital_prdc_df.head()

### Exams data

In [None]:
lab_df = mpd.read_csv(f'{data_path}normalized/ohe/lab.csv', dtype=dtype_dict)
lab_df = du.utils.convert_dataframe(lab_df, to='pandas', return_library=False, dtypes=dtype_dict)
lab_df = lab_df.drop(columns='Unnamed: 0')
lab_df.head()

## Joining dataframes

### Checking the matching of unit stays IDs

In [None]:
full_stays_list = set(patient_df.patientunitstayid.unique())

Total number of unit stays:

In [None]:
len(full_stays_list)

In [None]:
note_stays_list = set(note_df.patientunitstayid.unique())

In [None]:
len(note_stays_list)

Number of unit stays that have note data:

In [None]:
len(set.intersection(full_stays_list, note_stays_list))

In [None]:
diagns_stays_list = set(diagns_df.patientunitstayid.unique())

In [None]:
len(diagns_stays_list)

Number of unit stays that have diagnosis data:

In [None]:
len(set.intersection(full_stays_list, diagns_stays_list))

In [None]:
# alrg_stays_list = set(alrg_df.patientunitstayid.unique())

In [None]:
# len(alrg_stays_list)

Number of unit stays that have allergy data:

In [None]:
# len(set.intersection(full_stays_list, alrg_stays_list))

In [None]:
past_hist_stays_list = set(past_hist_df.patientunitstayid.unique())

In [None]:
len(past_hist_stays_list)

Number of unit stays that have past history data:

In [None]:
len(set.intersection(full_stays_list, past_hist_stays_list))

In [None]:
treat_stays_list = set(treat_df.patientunitstayid.unique())

In [None]:
len(treat_stays_list)

Number of unit stays that have treatment data:

In [None]:
len(set.intersection(full_stays_list, treat_stays_list))

In [None]:
adms_drug_stays_list = set(adms_drug_df.patientunitstayid.unique())

In [None]:
len(adms_drug_stays_list)

Number of unit stays that have admission drug data:

In [None]:
len(set.intersection(full_stays_list, adms_drug_stays_list))

In [None]:
# inf_drug_stays_list = set(inf_drug_df.patientunitstayid.unique())

In [None]:
# len(inf_drug_stays_list)

Number of unit stays that have infusion drug data:

In [None]:
# len(set.intersection(full_stays_list, inf_drug_stays_list))

In [None]:
med_stays_list = set(med_df.patientunitstayid.unique())

In [None]:
len(med_stays_list)

Number of unit stays that have medication data:

In [None]:
len(set.intersection(full_stays_list, med_stays_list))

In [None]:
# in_out_stays_list = set(in_out_df.patientunitstayid.unique())

In [None]:
# len(in_out_stays_list)

Number of unit stays that have intake and output data:

In [None]:
# len(set.intersection(full_stays_list, in_out_stays_list))

In [None]:
# nurse_care_stays_list = set(nurse_care_df.patientunitstayid.unique())

In [None]:
# len(nurse_care_stays_list)

Number of unit stays that have nurse care data:

In [None]:
# len(set.intersection(full_stays_list, nurse_care_stays_list))

In [None]:
# nurse_assess_stays_list = set(nurse_assess_df.patientunitstayid.unique())

In [None]:
# len(nurse_assess_stays_list)

Number of unit stays that have nurse assessment data:

In [None]:
# len(set.intersection(full_stays_list, nurse_assess_stays_list))

In [None]:
resp_care_stays_list = set(resp_care_df.patientunitstayid.unique())

In [None]:
len(resp_care_stays_list)

Number of unit stays that have respiratory care data:

In [None]:
len(set.intersection(full_stays_list, resp_care_stays_list))

In [None]:
vital_aprdc_stays_list = set(vital_aprdc_df.patientunitstayid.unique())

In [None]:
len(vital_aprdc_stays_list)

Number of unit stays that have vital aperiodic data:

In [None]:
len(set.intersection(full_stays_list, vital_aprdc_stays_list))

In [None]:
vital_prdc_stays_list = set(vital_prdc_df.patientunitstayid.unique())

In [None]:
len(vital_prdc_stays_list)

Number of unit stays that have vital periodic data:

In [None]:
len(set.intersection(full_stays_list, vital_prdc_stays_list))

In [None]:
lab_stays_list = set(lab_df.patientunitstayid.unique())

In [None]:
len(lab_stays_list)

Number of unit stays that have lab data:

In [None]:
len(set.intersection(full_stays_list, lab_stays_list))

### Joining patient with note data

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
eICU_df = patient_df
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
note_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(note_df, how='left')
eICU_df.head()

In [None]:
# eICU_df, pd = du.utils.convert_dataframe(eICU_df, to='pandas')

In [None]:
# eICU_df[(eICU_df.patientunitstayid == 838771) & (eICU_df.ts == 0)]

In [None]:
# eICU_df[(eICU_df.patientunitstayid == 2696741) & (eICU_df.ts == 0)]

In [None]:
# eICU_df.groupby(['patientunitstayid', 'ts']).size().value_counts()

In [None]:
# eICU_df = eICU_df.groupby(['patientunitstayid', 'ts']).first()
# eICU_df.head()

In [None]:
# eICU_df.groupby(['patientunitstayid', 'ts']).size().value_counts()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.groupby(['patientunitstayid', 'ts']).size().sort_values()

In [None]:
eICU_df.ethanol_use_heavy.unique()

In [None]:
eICU_df.ethanol_use_heavy.nunique()

In [None]:
type(eICU_df.ethanol_use_heavy.unique()[0])

In [None]:
eICU_df.ethanol_use_heavy.astype('UInt8')

In [None]:
np.nan_to_num(eICU_df.ethanol_use_heavy.unique())

In [None]:
[x for x in list(np.sort(list(set(np.nan_to_num(eICU_df.ethanol_use_heavy.unique()))))) if str(x).lower() != 'nan' ]

In [None]:
[x for x in list(np.sort(list(set(np.nan_to_num(eICU_df.ethanol_use_heavy.unique()))))) if str(x).lower() != 'nan' ] == [0, 1]

In [None]:
np.sort(list(set(np.nan_to_num(eICU_df.ethanol_use_heavy.unique()))))

In [None]:
np.sort(list(set(np.nan_to_num(eICU_df.ethanol_use_heavy.unique())))) == [0, 1]

In [None]:
du.search_explore.is_one_hot_encoded_column(eICU_df, 'ethanol_use_heavy')

In [None]:
def is_one_hot_encoded_column(df, column, n_unique_values=None):
    '''Checks if a given column is one hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame or dask.DataFrame
        Dataframe that will be used, which contains the specified column.
    column : string
        Name of the column that will be checked for one hot encoding.
    n_unique_values : int, default None
        Number of the column's unique values. If not specified, it will
        be automatically calculated.

    Returns
    -------
    bool
        Returns true if the column is in one hot encoding format.
        Otherwise, returns false.
    '''
    if n_unique_values is None:
        # Calculate the number of unique values
        n_unique_values = df[column].nunique()
    # Check if it only has 2 possible values
    if n_unique_values == 2:
        unique_values = df[column].unique()
        # Check if the only possible values are 0 and 1 (and ignore NaN's)
        unique_values = list(set(np.nan_to_num(unique_values)))
        unique_values.sort()
        unique_values = [val for val in unique_values if str(val).lower() != 'nan' ]
        if unique_values == [0, 1]:
            return True
    return False

In [None]:
is_one_hot_encoded_column(eICU_df, 'ethanol_use_heavy')

In [None]:
dtype_dict['ethanol_use_heavy']

In [None]:
eICU_df.dtypes

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_diagns.ftr')

### Joining with diagnosis data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_diagns.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
diagns_df = diagns_df[diagns_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

Also filter only to the unit stays that have data in this new table, considering its importance:

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(diagns_df.patientunitstayid.unique())]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
diagns_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(diagns_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_past_hist.ftr')

### Joining with allergy data

In [None]:
# eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_alrg.ftr')
# eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
# eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
# alrg_df = alrg_df[alrg_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

In [None]:
# eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
# eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
# alrg_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
# eICU_df = eICU_df.join(alrg_df, how='outer')
# eICU_df.head()

Save the current dataframe:

In [None]:
# eICU_df.reset_index(inplace=True)
# eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
# eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_past_hist.ftr')

### Joining with past history data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_past_hist.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
past_hist_df = past_hist_df[past_hist_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

Also filter only to the unit stays that have data in this new table, considering its importance:

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(past_hist_df.patientunitstayid.unique())]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
eICU_df.set_index('patientunitstayid', inplace=True)
past_hist_df.set_index('patientunitstayid', inplace=True)

Merge the dataframes:

In [None]:
len(eICU_df)

In [None]:
eICU_df = eICU_df.join(past_hist_df, how='outer')
eICU_df.head()

In [None]:
len(eICU_df)

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_treat.ftr')

### Joining with treatment data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_treat.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
treat_df = treat_df[treat_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

Also filter only to the unit stays that have data in this new table, considering its importance:

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(treat_df.patientunitstayid.unique())]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
treat_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(treat_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_med.ftr')

### Joining with infusion drug data

In [None]:
# eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_inf_drug.ftr')
# eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
# eICU_df.head()

In [None]:
# eICU_df.dtypes

In [None]:
# eICU_df.astype({'patientunitstayid': 'uint16'}).dtypes

In [None]:
# df_columns = list(eICU_df.columns)

In [None]:
# tmp_dict = dict()
# for key, val in dtype_dict.items():
#     if key in df_columns:
#         tmp_dict[key] = dtype_dict[key]

In [None]:
# tmp_dict

In [None]:
# # eICU_df = eICU_df.astype(tmp_dict, copy=False)
# eICU_df = eICU_df.astype(tmp_dict)

In [None]:
# eICU_df.dtypes

Filter to the unit stays that also have data in the other tables:

In [None]:
# inf_drug_df = inf_drug_df[inf_drug_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

In [None]:
# eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
# eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
# inf_drug_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
# eICU_df = eICU_df.join(inf_drug_df, how='outer')
# eICU_df.head()

Save the current dataframe:

In [None]:
# eICU_df.reset_index(inplace=True)
# eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
# eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_med.ftr')

### Joining with medication data

#### Joining medication and admission drug data

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
med_df.set_index(['patientunitstayid', 'ts'], inplace=True)
adms_drug_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
med_df = med_df.join(adms_drug_df, how='outer')
med_df.head()

#### Merging duplicate columns

In [None]:
set([col.split('_x')[0].split('_y')[0] for col in eICU_df.columns if col.endswith('_x') or col.endswith('_y')])

In [None]:
eICU_df[['drugdosage_x', 'drugdosage_y']].head(20)

In [None]:
eICU_df = du.data_processing.merge_columns(eICU_df, inplace=True)
eICU_df.sample(20)

In [None]:
set([col.split('_x')[0].split('_y')[0] for col in eICU_df.columns if col.endswith('_x') or col.endswith('_y')])

In [None]:
eICU_df['drugdosage'].head(20)

Save the current dataframe:

In [None]:
med_df.reset_index(inplace=True)
med_df.head()

In [None]:
# med_df = du.utils.convert_pyarrow_dtypes(med_df, inplace=True)

In [None]:
med_df.to_feather(f'{data_path}normalized/ohe/med_and_adms_drug.ftr')

#### Joining with the rest of the eICU data

In [None]:
med_drug_df = pd.read_feather(f'{data_path}normalized/ohe/med_and_adms_drug.ftr')
med_drug_df = du.utils.convert_dtypes(med_drug_df, dtypes=dtype_dict, inplace=True)
med_drug_df.head()

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_med.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
med_drug_df = med_drug_df[med_drug_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

Also filter only to the unit stays that have data in this new table, considering its importance:

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(med_drug_df.patientunitstayid.unique())]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
med_drug_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(med_drug_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_resp_care.ftr')

### Joining with intake outake data

In [None]:
# eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_in_out.ftr')
# eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
# eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
# in_out_df = in_out_df[in_out_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

Also filter only to the unit stays that have data in this new table, considering its importance:

In [None]:
# eICU_df = eICU_df[eICU_df.patientunitstayid.isin(in_out_df.patientunitstayid.unique())]

In [None]:
# eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
# eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
# in_out_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
# eICU_df = eICU_df.join(in_out_df, how='outer')
# eICU_df.head()

Save the current dataframe:

In [None]:
# eICU_df.reset_index(inplace=True)
# eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
# eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_resp_care.ftr')

### Joining with nurse care data

In [None]:
# eICU_df = pd.merge(eICU_df, nurse_care_df, how='outer', on=['patientunitstayid', 'ts'], copy=False)
# eICU_df.head()

### Joining with nurse assessment data

In [None]:
# eICU_df = pd.merge(eICU_df, nurse_assess_df, how='outer', on=['patientunitstayid', 'ts'], copy=False)
# eICU_df.head()

### Joining with nurse charting data

In [None]:
# eICU_df = pd.merge(eICU_df, nurse_chart_df, how='outer', on=['patientunitstayid', 'ts'], copy=False)
# eICU_df.head()

### Joining with respiratory care data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_resp_care.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
resp_care_df = resp_care_df[resp_care_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
resp_care_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(resp_care_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_vital_aprdc.ftr')

### Joining with aperiodic vital signals data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_vital_aprdc.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
vital_aprdc_df = vital_aprdc_df[vital_aprdc_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

Also filter only to the unit stays that have data in this new table, considering its importance:

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(vital_aprdc_df.patientunitstayid.unique())]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
vital_aprdc_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(vital_aprdc_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_vital_prdc.ftr')

### Joining with periodic vital signals data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_vital_prdc.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
vital_prdc_df = vital_prdc_df[vital_prdc_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

Also filter only to the unit stays that have data in this new table, considering its importance:

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(vital_prdc_df.patientunitstayid.unique())]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
vital_prdc_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(vital_prdc_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_post_joining_vital_prdc.ftr')

### Joining with lab data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_post_joining_vital_prdc.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
lab_df = lab_df[lab_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

Also filter only to the unit stays that have data in this new table, considering its importance:

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(lab_df.patientunitstayid.unique())]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indeces, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
lab_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(lab_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_post_joining.ftr')

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_post_joining.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

In [None]:
eICU_df.columns

In [None]:
eICU_df.dtypes

## Cleaning the joined data

### Removing unit stays that are too short

In [None]:
# eICU_df.info(memory_usage='deep')

Make sure that the dataframe is ordered by time `ts`:

In [None]:
eICU_df = eICU_df.sort_values('ts')
eICU_df.head()

Remove unit stays that have less than 10 records:

In [None]:
unit_stay_len = eICU_df.groupby('patientunitstayid').patientunitstayid.count()
unit_stay_len

In [None]:
unit_stay_short = set(unit_stay_len[unit_stay_len < 10].index)
unit_stay_short

In [None]:
len(unit_stay_short)

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[~eICU_df.patientunitstayid.isin(unit_stay_short)]

In [None]:
eICU_df.patientunitstayid.nunique()

Remove unit stays that have data that represent less than 48h:

In [None]:
unit_stay_duration = eICU_df.groupby('patientunitstayid').ts.apply(lambda x: x.max() - x.min())
unit_stay_duration

In [None]:
unit_stay_short = set(unit_stay_duration[unit_stay_duration < 48*60].index)
unit_stay_short

In [None]:
len(unit_stay_short)

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[~eICU_df.patientunitstayid.isin(unit_stay_short)]

In [None]:
eICU_df.patientunitstayid.nunique()

### Joining duplicate columns

In [None]:
set([col.split('_x')[0].split('_y')[0] for col in eICU_df.columns if col.endswith('_x') or col.endswith('_y')])

In [None]:
eICU_df[['drugdosage_x', 'drugdosage_y']].head(20)

In [None]:
eICU_df[eICU_df.index == 2564878][['drugdosage_x', 'drugdosage_y']]

Convert dataframe to Pandas, as the next cells aren't working properly with Modin:

In [None]:
eICU_df, pd = du.utils.convert_dataframe(eICU_df, to='pandas', dtypes=dtype_dict)

In [None]:
eICU_df = du.data_processing.merge_columns(eICU_df, inplace=True)
eICU_df.sample(20)

In [None]:
set([col.split('_x')[0].split('_y')[0] for col in eICU_df.columns if col.endswith('_x') or col.endswith('_y')])

In [None]:
eICU_df['drugdosage'].head(20)

In [None]:
eICU_df[eICU_df.index == 2564878][['drugdosage']]

Save the current dataframe:

In [None]:
# eICU_df = du.utils.convert_pyarrow_dtypes(eICU_df, inplace=True)

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_post_merge_duplicate_cols.ftr')

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_post_merge_duplicate_cols.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

### Removing unit stays with too many missing values

Consider removing all unit stays that have, combining rows and columns, a very high percentage of missing values.

Reconvert dataframe to Modin:

In [None]:
eICU_df, pd = du.utils.convert_dataframe(vital_prdc_df, to='modin', dtypes=dtype_dict)

In [None]:
n_features = len(eICU_df.columns)
n_features

Create a temporary column that counts each row's number of missing values:

In [None]:
eICU_df['row_msng_val'] = eICU_df.isnull().sum(axis=1)
eICU_df[['patientunitstayid', 'ts', 'row_msng_val']].head()

Check each unit stay's percentage of missing data points:

In [None]:
# Number of possible data points in each unit stay
n_data_points = eICU_df.groupby('patientunitstayid').ts.count() * n_features
n_data_points

In [None]:
# Number of missing values in each unit stay
n_msng_val = eICU_df.groupby('patientunitstayid').row_msng_val.sum()
n_msng_val

In [None]:
# Percentage of missing values in each unit stay
msng_val_prct = (n_msng_val / n_data_points) * 100
msng_val_prct

In [None]:
msng_val_prct.describe()

Remove unit stays that have too many missing values (>70% of their respective data points):

In [None]:
unit_stay_high_msgn = set(msng_val_prct[msng_val_prct > 70].index)
unit_stay_high_msgn

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[~eICU_df.patientunitstayid.isin(unit_stay_high_msgn)]

In [None]:
eICU_df.patientunitstayid.nunique()

### Removing columns with too many missing values

We should remove features that have too many missing values (in this case, those that have more than 40% of missing values). Without enough data, it's even risky to do imputation, as it's unlikely for the imputation to correctly model the missing feature.

In [None]:
du.search_explore.dataframe_missing_values(eICU_df)

In [None]:
prev_features = eICU_df.columns
len(prev_features)

In [None]:
eICU_df = du.data_processing.remove_cols_with_many_nans(eICU_df, nan_percent_thrsh=70, inplace=True)

In [None]:
features = eICU_df.columns
len(features)

Removed features:

In [None]:
set(prev_features) - set(features)

In [None]:
eICU_df.head()

### Performing imputation

In [None]:
du.search_explore.dataframe_missing_values(eICU_df)

Imputate `gender`:

In [None]:
# Forward fill
eICU_df.gender = eICU_df.groupby(id_column).gender.fillna(method='ffill')
# Backward fill
eICU_df.gender = eICU_df.groupby(id_column).gender.fillna(method='bfill')
# Replace remaining missing values with zero
eICU_df.gender = eICU_df.gender.fillna(value=0)

Imputate the remaining features:

In [None]:
eICU_df = du.data_processing.missing_values_imputation(eICU_df, method='interpolation',
                                                       id_column='patientunitstayid', inplace=True)
eICU_df.head()

In [None]:
du.search_explore.dataframe_missing_values(eICU_df)

### Rearranging columns

For ease of use and for better intuition, we should make sure that the ID columns (`patientunitstayid` and `ts`) are the first ones in the dataframe.

In [None]:
columns = list(eICU_df.columns)
columns

In [None]:
columns.remove('patientunitstayid')
columns.remove('ts')

In [None]:
columns = ['patientunitstayid', 'ts'] + columns
columns

In [None]:
eICU_df = eICU_df[columns]
eICU_df.head()

## Setting the label

Define the label column considering the desired time window on which we want to predict mortality (0, 24h, 48h, 72h, etc).

In [None]:
time_window_h = 24

In [None]:
eICU_df['label'] = eICU_df[eICU_df.death_ts - eICU_df.ts <= time_window_h * 60]
eICU_df.head()

## Creating a single encoding dictionary for the complete dataframe

Combine the one hot encoding dictionaries of all tables, having in account the converged ones, into a single dictionary representative of all the categorical features in the resulting dataframe.

In [None]:
stream_adms_drug = open(f'{data_path}cat_feat_ohe_adms_drug.yaml', 'r')
stream_inf_drug = open(f'{data_path}cat_feat_ohe_inf_drug.yaml', 'r')
stream_med = open(f'{data_path}cat_feat_ohe_med.yaml', 'r')
stream_treat = open(f'{data_path}cat_feat_ohe_treat.yaml', 'r')
stream_diag = open(f'{data_path}cat_feat_ohe_diag.yaml', 'r')
stream_alrg = open(f'{data_path}cat_feat_ohe_alrg.yaml', 'r')
stream_past_hist = open(f'{data_path}cat_feat_ohe_past_hist.yaml', 'r')
stream_lab = open(f'{data_path}cat_feat_ohe_lab.yaml', 'r')
stream_patient = open(f'{data_path}cat_feat_ohe_patient.yaml', 'r')
stream_notes = open(f'{data_path}cat_feat_ohe_note.yaml', 'r')

In [None]:
cat_feat_ohe_adms_drug = yaml.load(stream_adms_drug, Loader=yaml.FullLoader)
cat_feat_ohe_inf_drug = yaml.load(stream_inf_drug, Loader=yaml.FullLoader)
cat_feat_ohe_med = yaml.load(stream_med, Loader=yaml.FullLoader)
cat_feat_ohe_treat = yaml.load(stream_treat, Loader=yaml.FullLoader)
cat_feat_ohe_diag = yaml.load(stream_diag, Loader=yaml.FullLoader)
cat_feat_ohe_alrg = yaml.load(stream_alrg, Loader=yaml.FullLoader)
cat_feat_ohe_past_hist = yaml.load(stream_past_hist, Loader=yaml.FullLoader)
cat_feat_ohe_lab = yaml.load(stream_lab, Loader=yaml.FullLoader)
cat_feat_ohe_patient = yaml.load(stream_patient, Loader=yaml.FullLoader)
cat_feat_ohe_notes = yaml.load(stream_notes, Loader=yaml.FullLoader)

In [None]:
cat_feat_ohe = du.utils.merge_dicts([cat_feat_ohe_adms_drug, cat_feat_ohe_inf_drug,
                                     cat_feat_ohe_med, cat_feat_ohe_treat,
                                     cat_feat_ohe_diag, cat_feat_ohe_alrg,
                                     cat_feat_ohe_past_hist, cat_feat_ohe_lab,
                                     cat_feat_ohe_patient, cat_feat_ohe_notes])

In [None]:
list(cat_feat_ohe.keys())

Clean the one hot encoded column names, as they are in the final dataframe:

In [None]:
for key, val in cat_feat_ohe.items():
    cat_feat_ohe[key] = du.data_processing.clean_naming(cat_feat_ohe[key], lower_case=False)

Save the final encoding dictionary:

In [None]:
stream = open(f'{data_path}/cleaned/cat_feat_ohe_eICU.yaml', 'w')
yaml.dump(cat_feat_ohe, stream, default_flow_style=False)

## Creating a single normalization dictionary for the complete dataframe

Combine the normalization stats dictionaries of all tables into a single dictionary representative of all the continuous features in the resulting dataframe.

In [None]:
stream_adms_drug = open(f'{data_path}admissionDrug_norm_stats.yaml', 'r')
stream_inf_drug = open(f'{data_path}infusionDrug_norm_stats.yaml', 'r')
stream_med = open(f'{data_path}medication_norm_stats.yaml', 'r')
# stream_in_out = open(f'{data_path}cat_embed_feat_enum_in_out.yaml', 'r')
stream_lab = open(f'{data_path}lab_norm_stats.yaml', 'r')
stream_patient = open(f'{data_path}patient_norm_stats.yaml', 'r')
# stream_vital_aprdc = open(f'{data_path}vitalAperiodic_norm_stats.yaml', 'r')
# stream_vital_prdc = open(f'{data_path}vitalPeriodic_norm_stats.yaml', 'r')

In [None]:
norm_stats_adms_drug = yaml.load(stream_adms_drug, Loader=yaml.FullLoader)
norm_stats_inf_drug = yaml.load(stream_inf_drug, Loader=yaml.FullLoader)
norm_stats_med = yaml.load(stream_med, Loader=yaml.FullLoader)
# norm_stats__in_out = yaml.load(stream_in_out, Loader=yaml.FullLoader)
norm_stats_lab = yaml.load(stream_lab, Loader=yaml.FullLoader)
norm_stats_patient = yaml.load(stream_patient, Loader=yaml.FullLoader)
# norm_stats_vital_aprdc = yaml.load(stream_vital_aprdc, Loader=yaml.FullLoader)
# norm_stats_vital_prdc = yaml.load(stream_vital_prdc, Loader=yaml.FullLoader)

In [None]:
norm_stats = du.utils.merge_dicts([norm_stats_adms_drug, norm_stats_inf_drug,
                                   norm_stats_med,
#                                    norm_stats_in_out,
                                   norm_stats_lab, norm_stats_patient,
                                   norm_stats_vital_aprdc, norm_stats_vital_prdc])

In [None]:
list(norm_stats.keys())

Save the final encoding dictionary:

In [None]:
stream = open(f'{data_path}/cleaned/eICU_norm_stats.yaml', 'w')
yaml.dump(norm_stats, stream, default_flow_style=False)