# eICU Data Joining
---

Reading and joining all the preprocessed parts of the eICU dataset from MIT, which holds data from over 139k patients collected in the US.

The main goal of this notebook is to prepare a single parquet document that contains all the relevant data to be used when training a machine learning model that predicts mortality, joining tables, filtering useless columns and performing imputation.

## Importing the necessary packages

In [None]:
import os                                  # os handles directory/workspace changes
import numpy as np                         # NumPy to handle numeric and NaN operations
import yaml                                # Save and load YAML files

In [None]:
# Debugging packages
import pixiedust                           # Debugging in Jupyter Notebook cells

In [None]:
# Change to parent directory (presumably "Documents")
os.chdir("../../..")
# Path to the parquet dataset files
data_path = 'data/eICU/cleaned/'
# Path to the code files
project_path = 'code/eICU-mortality-prediction/'

In [None]:
# Make sure that every large operation can be handled, by using the disk as an overflow for the memory
!export MODIN_OUT_OF_CORE=true
# Another trick to do with Pandas so as to be able to allocate bigger objects to memory
!sudo bash -c 'echo 1 > /proc/sys/vm/overcommit_memory'

In [None]:
import modin.pandas as mpd                  # Optimized distributed version of Pandas
import pandas as pd
import data_utils as du                     # Data science and machine learning relevant methods

In [None]:
du.set_pandas_library('pandas')

Allow pandas to show more columns:

In [None]:
pd.set_option('display.max_columns', 3000)
pd.set_option('display.max_rows', 3000)

Set the random seed for reproducibility

In [None]:
du.set_random_seed(42)

## Initializing variables

In [None]:
# Open the YAML files with the columns' data types and the list of constant columns;
# both streams are parsed in the following cells
stream_dtypes = open(f'{data_path}eICU_dtype_dict.yml', 'r')
stream_const_columns = open(f'{data_path}eICU_const_columns.yml', 'r')

In [None]:
# Parse the dtype dictionary and close the file handle right after, so that it isn't leaked
with stream_dtypes:
    dtype_dict = yaml.load(stream_dtypes, Loader=yaml.FullLoader)
dtype_dict

In [None]:
# Parse the list of constant columns and close the file handle right after, so that it isn't leaked
with stream_const_columns:
    const_columns = yaml.load(stream_const_columns, Loader=yaml.FullLoader)
const_columns

## Loading the data

### Patient information

In [None]:
# Read the patient table with Modin, convert it to pandas and drop the `Unnamed: 0` column
patient_df = du.utils.convert_dataframe(
    mpd.read_csv(f'{data_path}normalized/ohe/patient.csv', dtype=dtype_dict),
    to='pandas', return_library=False, dtypes=dtype_dict
).drop(columns='Unnamed: 0')
patient_df.head()

In [None]:
patient_df.to_numpy()

In [None]:
type(patient_df.to_numpy())

In [None]:
import sys

In [None]:
sys.getsizeof(patient_df)

In [None]:
sys.getsizeof(patient_df.values)

In [None]:
patient_df.values.nbytes

In [None]:
# Read the note table with Modin, convert it to pandas and drop the `Unnamed: 0` column
note_df = du.utils.convert_dataframe(
    mpd.read_csv(f'{data_path}normalized/ohe/note.csv', dtype=dtype_dict),
    to='pandas', return_library=False, dtypes=dtype_dict
).drop(columns='Unnamed: 0')
note_df.head()

### Diagnosis

In [None]:
# Read the diagnosis table with Modin, convert it to pandas and drop the `Unnamed: 0` column
diagns_df = du.utils.convert_dataframe(
    mpd.read_csv(f'{data_path}normalized/ohe/diagnosis.csv', dtype=dtype_dict),
    to='pandas', return_library=False, dtypes=dtype_dict
).drop(columns='Unnamed: 0')
diagns_df.head()

In [None]:
# Read the past history table with Modin, convert it to pandas and drop the `Unnamed: 0` column
past_hist_df = du.utils.convert_dataframe(
    mpd.read_csv(f'{data_path}normalized/ohe/pastHistory.csv', dtype=dtype_dict),
    to='pandas', return_library=False, dtypes=dtype_dict
).drop(columns='Unnamed: 0')
past_hist_df.head()

### Treatments

In [None]:
# Read the treatment table with Modin, convert it to pandas and drop the `Unnamed: 0` column
treat_df = du.utils.convert_dataframe(
    mpd.read_csv(f'{data_path}normalized/ohe/treatment.csv', dtype=dtype_dict),
    to='pandas', return_library=False, dtypes=dtype_dict
).drop(columns='Unnamed: 0')
treat_df.head()

In [None]:
# Read the admission drug table with Modin, convert it to pandas and drop the `Unnamed: 0` column
adms_drug_df = du.utils.convert_dataframe(
    mpd.read_csv(f'{data_path}normalized/ohe/admissionDrug.csv', dtype=dtype_dict),
    to='pandas', return_library=False, dtypes=dtype_dict
).drop(columns='Unnamed: 0')
adms_drug_df.head()

In [None]:
adms_drug_df.dtypes

In [None]:
# Read the medication table with Modin, convert it to pandas and drop the `Unnamed: 0` column
med_df = du.utils.convert_dataframe(
    mpd.read_csv(f'{data_path}normalized/ohe/medication.csv', dtype=dtype_dict),
    to='pandas', return_library=False, dtypes=dtype_dict
).drop(columns='Unnamed: 0')
med_df.head()

In [None]:
med_df.dtypes

### Respiratory data

In [None]:
# Read the respiratory care table with Modin, convert it to pandas and drop the `Unnamed: 0` column
resp_care_df = du.utils.convert_dataframe(
    mpd.read_csv(f'{data_path}normalized/ohe/respiratoryCare.csv', dtype=dtype_dict),
    to='pandas', return_library=False, dtypes=dtype_dict
).drop(columns='Unnamed: 0')
resp_care_df.head()

### Vital signals

In [None]:
# The aperiodic vital signals are read directly with pandas (no Modin loading and conversion
# step in this cell), dropping the `Unnamed: 0` column afterwards
vital_aprdc_df = (
    pd.read_csv(f'{data_path}normalized/vitalAperiodic.csv', dtype=dtype_dict)
    .drop(columns='Unnamed: 0')
)
vital_aprdc_df.head()

In [None]:
# The periodic vital signals are read directly with pandas (no Modin loading and conversion
# step in this cell), dropping the `Unnamed: 0` column afterwards
vital_prdc_df = (
    pd.read_csv(f'{data_path}normalized/ohe/vitalPeriodic.csv', dtype=dtype_dict)
    .drop(columns='Unnamed: 0')
)
vital_prdc_df.head()

### Exams data

In [None]:
# Read the lab table with Modin, convert it to pandas and drop the `Unnamed: 0` column
lab_df = du.utils.convert_dataframe(
    mpd.read_csv(f'{data_path}normalized/ohe/lab.csv', dtype=dtype_dict),
    to='pandas', return_library=False, dtypes=dtype_dict
).drop(columns='Unnamed: 0')
lab_df.head()

In [None]:
lab_df.dtypes.value_counts()

## Joining dataframes

### Checking the matching of unit stays IDs

In [None]:
full_stays_list = set(patient_df.patientunitstayid.unique())

Total number of unit stays:

In [None]:
len(full_stays_list)

In [None]:
note_stays_list = set(note_df.patientunitstayid.unique())

In [None]:
len(note_stays_list)

Number of unit stays that have note data:

In [None]:
len(set.intersection(full_stays_list, note_stays_list))

In [None]:
diagns_stays_list = set(diagns_df.patientunitstayid.unique())

In [None]:
len(diagns_stays_list)

Number of unit stays that have diagnosis data:

In [None]:
len(set.intersection(full_stays_list, diagns_stays_list))

In [None]:
# alrg_stays_list = set(alrg_df.patientunitstayid.unique())

In [None]:
# len(alrg_stays_list)

Number of unit stays that have allergy data:

In [None]:
# len(set.intersection(full_stays_list, alrg_stays_list))

In [None]:
past_hist_stays_list = set(past_hist_df.patientunitstayid.unique())

In [None]:
len(past_hist_stays_list)

Number of unit stays that have past history data:

In [None]:
len(set.intersection(full_stays_list, past_hist_stays_list))

In [None]:
treat_stays_list = set(treat_df.patientunitstayid.unique())

In [None]:
len(treat_stays_list)

Number of unit stays that have treatment data:

In [None]:
len(set.intersection(full_stays_list, treat_stays_list))

In [None]:
adms_drug_stays_list = set(adms_drug_df.patientunitstayid.unique())

In [None]:
len(adms_drug_stays_list)

Number of unit stays that have admission drug data:

In [None]:
len(set.intersection(full_stays_list, adms_drug_stays_list))

In [None]:
# inf_drug_stays_list = set(inf_drug_df.patientunitstayid.unique())

In [None]:
# len(inf_drug_stays_list)

Number of unit stays that have infusion drug data:

In [None]:
# len(set.intersection(full_stays_list, inf_drug_stays_list))

In [None]:
med_stays_list = set(med_df.patientunitstayid.unique())

In [None]:
len(med_stays_list)

Number of unit stays that have medication data:

In [None]:
len(set.intersection(full_stays_list, med_stays_list))

In [None]:
# in_out_stays_list = set(in_out_df.patientunitstayid.unique())

In [None]:
# len(in_out_stays_list)

Number of unit stays that have intake and output data:

In [None]:
# len(set.intersection(full_stays_list, in_out_stays_list))

In [None]:
# nurse_care_stays_list = set(nurse_care_df.patientunitstayid.unique())

In [None]:
# len(nurse_care_stays_list)

Number of unit stays that have nurse care data:

In [None]:
# len(set.intersection(full_stays_list, nurse_care_stays_list))

In [None]:
# nurse_assess_stays_list = set(nurse_assess_df.patientunitstayid.unique())

In [None]:
# len(nurse_assess_stays_list)

Number of unit stays that have nurse assessment data:

In [None]:
# len(set.intersection(full_stays_list, nurse_assess_stays_list))

In [None]:
resp_care_stays_list = set(resp_care_df.patientunitstayid.unique())

In [None]:
len(resp_care_stays_list)

Number of unit stays that have respiratory care data:

In [None]:
len(set.intersection(full_stays_list, resp_care_stays_list))

In [None]:
vital_aprdc_stays_list = set(vital_aprdc_df.patientunitstayid.unique())

In [None]:
len(vital_aprdc_stays_list)

Number of unit stays that have vital aperiodic data:

In [None]:
len(set.intersection(full_stays_list, vital_aprdc_stays_list))

In [None]:
vital_prdc_stays_list = set(vital_prdc_df.patientunitstayid.unique())

In [None]:
len(vital_prdc_stays_list)

Number of unit stays that have vital periodic data:

In [None]:
len(set.intersection(full_stays_list, vital_prdc_stays_list))

In [None]:
lab_stays_list = set(lab_df.patientunitstayid.unique())

In [None]:
len(lab_stays_list)

Number of unit stays that have lab data:

In [None]:
len(set.intersection(full_stays_list, lab_stays_list))

### Joining patient with note data

Set `patientunitstayid` and `ts` as indices, for faster data merging:

In [None]:
# Index both tables by unit stay and timestamp. `set_index` returns new dataframes here, so
# `patient_df` is left untouched, instead of being mutated in place through the `eICU_df` alias
eICU_df = patient_df.set_index(['patientunitstayid', 'ts'])
note_df = note_df.set_index(['patientunitstayid', 'ts'])

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(note_df, how='left')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
eICU_df.groupby(['patientunitstayid', 'ts']).size().sort_values()

In [None]:
eICU_df.dtypes

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_diagns.ftr')

In [None]:
eICU_df.dtypes.value_counts()

### Joining with diagnosis data

In [None]:
# Reload the dataframe saved before this join and convert its columns to the dtypes in `dtype_dict`
eICU_df = du.utils.convert_dtypes(
    pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_diagns.ftr'),
    dtypes=dtype_dict, inplace=True
)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
# Keep only the diagnosis rows whose unit stay is present in the joined dataframe
diagns_df = diagns_df.loc[diagns_df['patientunitstayid'].isin(eICU_df['patientunitstayid'].unique())]

In [None]:
# Flag the unit stays that have at least one non-missing `death_ts`
patient_dies = eICU_df.groupby('patientunitstayid')['death_ts'].max().notna()
patient_dies

In [None]:
# IDs of the unit stays flagged in `patient_dies` (the boolean series is used directly as the mask)
unit_stay_patient_dies = set(patient_dies[patient_dies].index)
unit_stay_patient_dies

In [None]:
len(unit_stay_patient_dies)

Also filter only to the unit stays that have data in this new table, considering its importance (unit stays where the patient dies are always kept):

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
# Keep the unit stays that have diagnosis data, plus every unit stay where the patient dies
stays_to_keep = set(diagns_df['patientunitstayid'].unique()) | unit_stay_patient_dies
eICU_df = eICU_df[eICU_df['patientunitstayid'].isin(stays_to_keep)]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indices, for faster data merging:

In [None]:
# Index both tables by unit stay and timestamp, rebinding the names to the indexed dataframes
join_keys = ['patientunitstayid', 'ts']
eICU_df = eICU_df.set_index(join_keys)
diagns_df = diagns_df.set_index(join_keys)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(diagns_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_past_hist.ftr')

In [None]:
eICU_df.dtypes.value_counts()

### Joining with past history data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_past_hist.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
past_hist_df = past_hist_df[past_hist_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

In [None]:
patient_dies = ~(eICU_df.groupby('patientunitstayid').death_ts.max().isna())
patient_dies

In [None]:
unit_stay_patient_dies = set(patient_dies[patient_dies == True].index)
unit_stay_patient_dies

In [None]:
len(unit_stay_patient_dies)

Also filter only to the unit stays that have data in this new table, considering its importance (unit stays where the patient dies are always kept):

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(set(past_hist_df.patientunitstayid.unique()) | unit_stay_patient_dies)]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` as the index, for faster data merging (the past history data is joined per unit stay, so `ts` is not part of the index here):

In [None]:
eICU_df.set_index('patientunitstayid', inplace=True)
past_hist_df.set_index('patientunitstayid', inplace=True)

Merge the dataframes:

In [None]:
len(eICU_df)

In [None]:
eICU_df = eICU_df.join(past_hist_df, how='outer')
eICU_df.head()

In [None]:
len(eICU_df)

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_treat.ftr')

In [None]:
eICU_df.dtypes

In [None]:
eICU_df.dtypes.value_counts()

### Joining with treatment data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_treat.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
treat_df = treat_df[treat_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

In [None]:
patient_dies = ~(eICU_df.groupby('patientunitstayid').death_ts.max().isna())
patient_dies

In [None]:
unit_stay_patient_dies = set(patient_dies[patient_dies == True].index)
unit_stay_patient_dies

In [None]:
len(unit_stay_patient_dies)

Also filter only to the unit stays that have data in this new table, considering its importance (unit stays where the patient dies are always kept):

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(set(treat_df.patientunitstayid.unique()) | unit_stay_patient_dies)]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indices, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
treat_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(treat_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_med.ftr')

In [None]:
eICU_df.dtypes.value_counts()

### Joining with medication data

#### Joining medication and admission drug data

Set `patientunitstayid` and `ts` as indices, for faster data merging:

In [None]:
med_df.set_index(['patientunitstayid', 'ts'], inplace=True)
adms_drug_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
med_df = med_df.join(adms_drug_df, how='outer', lsuffix='_x', rsuffix='_y')
med_df.head()

#### Merging duplicate columns

In [None]:
# Base names of the columns duplicated by the join. The two-character suffix is sliced off
# instead of split on, so that names containing `_x` or `_y` elsewhere aren't truncated
{col[:-2] for col in med_df.columns if col.endswith(('_x', '_y'))}

In [None]:
med_df[['drugadmitfrequency_twice_a_day_x', 'drugadmitfrequency_twice_a_day_y',
        'drughiclseqno_10321_x', 'drughiclseqno_10321_y']].head(20)

In [None]:
med_df.dtypes.value_counts()

In [None]:
med_df = du.data_processing.merge_columns(med_df, inplace=True)
med_df.sample(20)

In [None]:
# Base names of any columns that still carry a join suffix after merging. The two-character
# suffix is sliced off instead of split on, so that names containing `_x` or `_y` elsewhere aren't truncated
{col[:-2] for col in med_df.columns if col.endswith(('_x', '_y'))}

In [None]:
med_df[['drugadmitfrequency_twice_a_day', 'drughiclseqno_10321']].head(20)

Save the current dataframe:

In [None]:
med_df.reset_index(inplace=True)
med_df.head()

In [None]:
med_df.to_feather(f'{data_path}normalized/ohe/med_and_adms_drug.ftr')

In [None]:
med_df.dtypes.value_counts()

#### Joining with the rest of the eICU data

In [None]:
med_drug_df = pd.read_feather(f'{data_path}normalized/ohe/med_and_adms_drug.ftr')
med_drug_df = du.utils.convert_dtypes(med_drug_df, dtypes=dtype_dict, inplace=True)
med_drug_df.head()

In [None]:
med_drug_df.dtypes.value_counts()

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_med.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
med_drug_df = med_drug_df[med_drug_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

In [None]:
patient_dies = ~(eICU_df.groupby('patientunitstayid').death_ts.max().isna())
patient_dies

In [None]:
unit_stay_patient_dies = set(patient_dies[patient_dies == True].index)
unit_stay_patient_dies

In [None]:
len(unit_stay_patient_dies)

Also filter only to the unit stays that have data in this new table, considering its importance (unit stays where the patient dies are always kept):

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(set(med_drug_df.patientunitstayid.unique()) | unit_stay_patient_dies)]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indices, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
med_drug_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
len(eICU_df)

In [None]:
eICU_df = eICU_df.join(med_drug_df, how='outer')
eICU_df.head()

In [None]:
len(eICU_df)

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_resp_care.ftr')

In [None]:
eICU_df.dtypes.value_counts()

### Joining with respiratory care data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_resp_care.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
resp_care_df = resp_care_df[resp_care_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

Set `patientunitstayid` and `ts` as indices, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
resp_care_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(resp_care_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
eICU_df.to_feather(f'{data_path}normalized/ohe/eICU_before_joining_vital_aprdc.ftr')

In [None]:
eICU_df.dtypes.value_counts()

### Joining with aperiodic vital signals data

In [None]:
eICU_df = pd.read_feather(f'{data_path}normalized/ohe/eICU_before_joining_vital_aprdc.ftr')
eICU_df = du.utils.convert_dtypes(eICU_df, dtypes=dtype_dict, inplace=True)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
vital_aprdc_df = vital_aprdc_df[vital_aprdc_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

In [None]:
patient_dies = ~(eICU_df.groupby('patientunitstayid').death_ts.max().isna())
patient_dies

In [None]:
unit_stay_patient_dies = set(patient_dies[patient_dies == True].index)
unit_stay_patient_dies

In [None]:
len(unit_stay_patient_dies)

Also filter only to the unit stays that have data in this new table, considering its importance (unit stays where the patient dies are always kept):

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(set(vital_aprdc_df.patientunitstayid.unique()) | unit_stay_patient_dies)]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indices, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
vital_aprdc_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(vital_aprdc_df, how='outer')
eICU_df.head()

#### Filtering for the lengthiest unit stays

Filter to the 10k lengthiest unit stays, also including those where the patient dies (even if it's not one of the lengthiest, so as to avoid making the dataset even more unbalanced)

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
# Save the dataframe that was just joined with the aperiodic vital signals before reloading it.
# Without this save, the `eICU_before_joining_vital_prdc_full` chunks are never written by this
# notebook, so the load below would fail on a fresh run or silently bring back stale data
du.data_processing.save_chunked_data(eICU_df, file_name='eICU_before_joining_vital_prdc_full', n_chunks=8,
                                     data_path=f'{data_path}normalized/ohe/')
eICU_df = du.data_processing.load_chunked_data(file_name='eICU_before_joining_vital_prdc_full', n_chunks=8,
                                               data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
eICU_df.head()

Get the 10k lengthiest unit stays:

In [None]:
unit_stay_len = eICU_df.groupby('patientunitstayid').patientunitstayid.count().sort_values(ascending=False)
unit_stay_len

In [None]:
unit_stay_len.value_counts()

In [None]:
# Take the first 10k entries of the series, which is sorted by descending number of rows.
# `.iloc` makes the slice explicitly positional, as the index holds unit stay IDs
# (presumably integers), for which a plain `[:10000]` reads ambiguously as a label slice
unit_stay_long = set(unit_stay_len.iloc[:10000].index)
unit_stay_long

In [None]:
len(unit_stay_long)

In [None]:
patient_dies = ~(eICU_df.groupby('patientunitstayid').death_ts.max().isna())
patient_dies

In [None]:
unit_stay_patient_dies = set(patient_dies[patient_dies == True].index)
unit_stay_patient_dies

In [None]:
len(unit_stay_patient_dies)

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(unit_stay_long | unit_stay_patient_dies)]

In [None]:
eICU_df.patientunitstayid.nunique()

Save the current dataframe:

In [None]:
du.data_processing.save_chunked_data(eICU_df, file_name='eICU_before_joining_vital_prdc', n_chunks=8, 
                                     data_path=f'{data_path}normalized/ohe/')

In [None]:
eICU_df.dtypes.value_counts()

### Joining with periodic vital signals data

In [None]:
eICU_df = du.data_processing.load_chunked_data(file_name='eICU_before_joining_vital_prdc', n_chunks=8, 
                                               data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
vital_prdc_df = vital_prdc_df[vital_prdc_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

In [None]:
patient_dies = ~(eICU_df.groupby('patientunitstayid').death_ts.max().isna())
patient_dies

In [None]:
unit_stay_patient_dies = set(patient_dies[patient_dies == True].index)
unit_stay_patient_dies

In [None]:
len(unit_stay_patient_dies)

Also filter only to the unit stays that have data in this new table, considering its importance (unit stays where the patient dies are always kept):

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(set(vital_prdc_df.patientunitstayid.unique()) | unit_stay_patient_dies)]

In [None]:
eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indices, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
vital_prdc_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(vital_prdc_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
du.data_processing.save_chunked_data(eICU_df, file_name='eICU_before_joining_lab', n_chunks=8, 
                                     data_path=f'{data_path}normalized/ohe/')

In [None]:
eICU_df.dtypes.value_counts()

### Joining with lab data

In [None]:
eICU_df = du.data_processing.load_chunked_data(file_name='eICU_before_joining_lab', n_chunks=8, 
                                               data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
eICU_df.head()

Filter to the unit stays that also have data in the other tables:

In [None]:
lab_df = lab_df[lab_df.patientunitstayid.isin(eICU_df.patientunitstayid.unique())]

In [None]:
# patient_dies = ~(eICU_df.groupby('patientunitstayid').death_ts.max().isna())
# patient_dies

In [None]:
# unit_stay_patient_dies = set(patient_dies[patient_dies == True].index)
# unit_stay_patient_dies

In [None]:
# len(unit_stay_patient_dies)

Also filter only to the unit stays that have data in this new table, considering its importance:

In [None]:
# eICU_df.patientunitstayid.nunique()

In [None]:
# eICU_df = eICU_df[eICU_df.patientunitstayid.isin(set(lab_df.patientunitstayid.unique()) | unit_stay_patient_dies)]

In [None]:
# eICU_df.patientunitstayid.nunique()

Set `patientunitstayid` and `ts` as indices, for faster data merging:

In [None]:
eICU_df.set_index(['patientunitstayid', 'ts'], inplace=True)
lab_df.set_index(['patientunitstayid', 'ts'], inplace=True)

Merge the dataframes:

In [None]:
eICU_df = eICU_df.join(lab_df, how='outer')
eICU_df.head()

Save the current dataframe:

In [None]:
eICU_df.reset_index(inplace=True)
eICU_df.head()

In [None]:
len(eICU_df)

In [None]:
du.data_processing.save_chunked_data(eICU_df, file_name='eICU_post_joining', n_chunks=8, 
                                     data_path=f'{data_path}normalized/ohe/')

In [None]:
eICU_df.dtypes.value_counts()

In [None]:
eICU_df.dtypes[eICU_df.dtypes=='float64']

## Cleaning the joined data

In [None]:
eICU_df = du.data_processing.load_chunked_data(file_name='eICU_post_joining', n_chunks=8, 
                                               data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
eICU_df.head()

In [None]:
eICU_df.dtypes.value_counts()

In [None]:
# eICU_df.info(memory_usage='deep')

### Removing unit stays that are too short

Make sure that the dataframe is ordered by time `ts`:

In [None]:
eICU_df = eICU_df.sort_values('ts')
eICU_df.head()

Remove unit stays that have data that represent less than 48h:

In [None]:
# Time span covered by each unit stay's data (latest `ts` minus earliest `ts`), computed with
# the vectorized group aggregations instead of calling a Python lambda once per unit stay
stay_ts = eICU_df.groupby('patientunitstayid').ts
unit_stay_duration = stay_ts.max() - stay_ts.min()
unit_stay_duration

In [None]:
# Unit stays whose data spans at least 48*60 time units; this matches the 48h threshold
# if `ts` is expressed in minutes — TODO confirm the unit of `ts`
unit_stay_long = set(unit_stay_duration[unit_stay_duration >= 48*60].index)
unit_stay_long

In [None]:
len(unit_stay_long)

In [None]:
patient_dies = ~(eICU_df.groupby('patientunitstayid').death_ts.max().isna())
patient_dies

In [None]:
unit_stay_patient_dies = set(patient_dies[patient_dies == True].index)
unit_stay_patient_dies

In [None]:
len(unit_stay_patient_dies)

In [None]:
len(unit_stay_long | unit_stay_patient_dies)

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(unit_stay_long | unit_stay_patient_dies)]

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
du.data_processing.save_chunked_data(eICU_df, file_name='eICU_post_short_stay_removal', n_chunks=8, 
                                     data_path=f'{data_path}normalized/ohe/')

### Removing unit stays with too many missing values

Consider removing all unit stays that have, combining rows and columns, a very high percentage of missing values.

In [None]:
eICU_df = du.data_processing.load_chunked_data(file_name='eICU_post_short_stay_removal', n_chunks=8, 
                                               data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
eICU_df.head()

In [None]:
n_features = len(eICU_df.columns)
n_features

Create a temporary column that counts each row's number of missing values:

In [None]:
eICU_df['row_msng_val'] = eICU_df.isnull().sum(axis=1)
eICU_df[['patientunitstayid', 'ts', 'row_msng_val']].head()

Check each unit stay's percentage of missing data points:

In [None]:
# Number of possible data points in each unit stay (number of rows times number of features).
# `size()` counts every row of the group, in line with `row_msng_val` being summed over all
# rows, while `ts.count()` would leave out any row with a missing `ts`
n_data_points = eICU_df.groupby('patientunitstayid').size() * n_features
n_data_points

In [None]:
# Number of missing values in each unit stay
n_msng_val = eICU_df.groupby('patientunitstayid').row_msng_val.sum()
n_msng_val

In [None]:
# Percentage of missing values in each unit stay
msng_val_prct = (n_msng_val / n_data_points) * 100
msng_val_prct

In [None]:
msng_val_prct.describe()

Remove unit stays that have too many missing values (>99% of their respective data points):

In [None]:
unit_stay_low_msgn = set(msng_val_prct[msng_val_prct < 99].index)
unit_stay_low_msgn

In [None]:
len(unit_stay_low_msgn)

In [None]:
patient_dies = ~(eICU_df.groupby('patientunitstayid').death_ts.max().isna())
patient_dies

In [None]:
unit_stay_patient_dies = set(patient_dies[patient_dies == True].index)
unit_stay_patient_dies

In [None]:
len(unit_stay_patient_dies)

In [None]:
len(unit_stay_low_msgn | unit_stay_patient_dies)

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[eICU_df.patientunitstayid.isin(unit_stay_low_msgn | unit_stay_patient_dies)]

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
# Remove the temporary missing values counter. The result is rebound instead of dropping in
# place, as `eICU_df` comes from a boolean filter and in-place changes on such a frame can
# raise pandas' `SettingWithCopyWarning`
eICU_df = eICU_df.drop(columns='row_msng_val')

In [None]:
du.data_processing.save_chunked_data(eICU_df, file_name='eICU_post_high_missing_stay_removal', n_chunks=8, 
                                     data_path=f'{data_path}normalized/ohe/')

### Removing columns with too many missing values

We should remove features that have too many missing values (in this case, those that have more than 40% of missing values). Without enough data, it's even risky to do imputation, as it's unlikely for the imputation to correctly model the missing feature.

Actually, it doesn't seem to be worth it, as most columns have around 98% missing data points.

In [None]:
# eICU_df = du.data_processing.load_chunked_data(file_name='eICU_post_high_missing_stay_removal', n_chunks=8, 
#                                                data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
# eICU_df.head()

In [None]:
# missing_values = du.search_explore.dataframe_missing_values(eICU_df)
# missing_values

In [None]:
# len(eICU_df)

In [None]:
# missing_values.percent_missing.describe()

In [None]:
# prev_features = eICU_df.columns
# len(prev_features)

In [None]:
# eICU_df = du.data_processing.remove_cols_with_many_nans(eICU_df, nan_percent_thrsh=99, inplace=True)

In [None]:
# features = eICU_df.columns
# len(features)

Removed features:

In [None]:
# set(prev_features) - set(features)

In [None]:
# eICU_df.head()

In [None]:
# du.data_processing.save_chunked_data(eICU_df, file_name='eICU_post_high_missing_cols_removal', n_chunks=8, 
#                                      data_path=f'{data_path}normalized/ohe/')

### Performing imputation

In [None]:
eICU_df = du.data_processing.load_chunked_data(file_name='eICU_post_high_missing_stay_removal', n_chunks=8, 
                                               data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
eICU_df.head()

In [None]:
du.search_explore.dataframe_missing_values(eICU_df)

#### Constant columns

Impute the patient and past history features separately, as they should remain the same regardless of time.

In [None]:
eICU_df.reset_index(drop=True, inplace=True)

In [None]:
# eICU_df.loc[0, const_columns]

In [None]:
# # Trying to imputate data of just one unit stay
# id_const_columns = ['patientunitstayid'] + const_columns
# # Forward fill and backward fill
# eICU_df[eICU_df.patientunitstayid == 2385766][id_const_columns].groupby('patientunitstayid').apply(lambda group: group.ffill().bfill())

In [None]:
id_const_columns = ['patientunitstayid'] + const_columns
# Forward fill and backward fill inside each unit stay. The built-in group fills keep the
# dataframe's own row index, so the assignment below aligns row by row; the index returned by
# `groupby().apply` depends on the pandas version (`group_keys`), which can break that alignment
stay_ids = eICU_df['patientunitstayid']
const_filled = eICU_df[const_columns].groupby(stay_ids).ffill()
const_filled = const_filled.groupby(stay_ids).bfill()
eICU_df.loc[:, const_columns] = const_filled

In [None]:
# Replace remaining missing values with zero
eICU_df.loc[:, const_columns] = eICU_df[const_columns].fillna(value=0)

In [None]:
du.data_processing.save_chunked_data(eICU_df, file_name='eICU_post_const_imputation', n_chunks=8, 
                                     data_path=f'{data_path}normalized/ohe/')

In [None]:
du.search_explore.dataframe_missing_values(eICU_df)

In [None]:
# eICU_df[eICU_df.patientunitstayid == 2385766][id_const_columns]

#### Boolean columns

Boolean columns should have their missing values filled with zeros, so as to signal the absence of each feature.

In [None]:
# Reload the checkpoint saved after the constant columns' imputation (8 chunks),
# applying the dtypes in `dtype_dict`
eICU_df = du.data_processing.load_chunked_data(file_name='eICU_post_const_imputation', n_chunks=8, 
                                               data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
eICU_df.head()

In [None]:
# Inspect the missing values before imputing the boolean columns
du.search_explore.dataframe_missing_values(eICU_df)

In [None]:
# Names of the columns currently in the dataframe, used below to match the dtype dictionary's keys
existing_columns = list(eICU_df.columns)

In [None]:
# Gather the dataframe columns whose declared dtype is boolean-like ('UInt8' or 'boolean').
# A dtype dictionary key is matched by its exact name first and by its lowercase form second;
# keys that match neither are reported and skipped.
bool_columns = []
for col_name, col_dtype in dtype_dict.items():
    if col_dtype not in ('UInt8', 'boolean'):
        continue
    name_candidates = (col_name, col_name.lower())
    matched_name = next((name for name in name_candidates if name in existing_columns), None)
    if matched_name is None:
        print(f'Column {col_name} not found in the dataframe.')
    else:
        bool_columns.append(matched_name)

In [None]:
# Display the boolean-like columns that were found in the dataframe
bool_columns

In [None]:
# Count how many columns there are of each dtype
eICU_df.dtypes.value_counts()

In [None]:
# List the columns whose dtype is not 'UInt8'
eICU_df.dtypes[eICU_df.dtypes != 'UInt8']

In [None]:
# Replace missing values with zero
# (for boolean features, zero signals the absence of the feature)
eICU_df.loc[:, bool_columns] = eICU_df[bool_columns].fillna(value=0)

In [None]:
# Checkpoint the dataframe (8 chunks) after imputing the boolean columns;
# the "Remaining features" section reloads this same file name
du.data_processing.save_chunked_data(eICU_df, file_name='eICU_post_bool_imputation', n_chunks=8, 
                                     data_path=f'{data_path}normalized/ohe/')

In [None]:
# Inspect the missing values again, now that the boolean columns have been imputed
du.search_explore.dataframe_missing_values(eICU_df)

#### Remaining features

In [None]:
# Reload the checkpoint saved after the boolean columns' imputation (8 chunks),
# applying the dtypes in `dtype_dict`
eICU_df = du.data_processing.load_chunked_data(file_name='eICU_post_bool_imputation', n_chunks=8, 
                                               data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
eICU_df.head()

In [None]:
# Inspect the missing values before imputing the remaining features
du.search_explore.dataframe_missing_values(eICU_df)

In [None]:
# Names of the columns currently in the dataframe, used below to match the dtype dictionary's keys
existing_columns = list(eICU_df.columns)

In [None]:
# Rebuild the list of boolean-like columns ('UInt8' or 'boolean' in the dtype dictionary),
# so that this section can run right after reloading the checkpoint.
# Each key is looked up by its exact name and then by its lowercase form;
# a key found under neither name is reported and left out.
bool_columns = []
for col_name, col_dtype in dtype_dict.items():
    is_bool_dtype = col_dtype in ('UInt8', 'boolean')
    if not is_bool_dtype:
        continue
    lowercase_name = col_name.lower()
    if col_name in existing_columns:
        bool_columns.append(col_name)
        continue
    if lowercase_name in existing_columns:
        bool_columns.append(lowercase_name)
        continue
    print(f'Column {col_name} not found in the dataframe.')

In [None]:
# Columns left out of the interpolation: the timestamp columns ('death_ts' and 'ts')
# and the constant and boolean features, which were already imputed in the previous sections
columns_to_skip = {'death_ts', 'ts', *const_columns, *bool_columns}
# Filter the dataframe's columns instead of subtracting sets, so that the resulting list
# keeps the dataframe's column order and is the same on every run (the order of a set of
# strings changes between kernels, because of hash randomization)
columns_to_imputate = [column for column in eICU_df.columns if column not in columns_to_skip]
columns_to_imputate

In [None]:
# %%pixie_debugger
# # Trying to imputate data of just one unit stay
# du.data_processing.missing_values_imputation(eICU_df[eICU_df.patientunitstayid == 2385766], 
#                                              columns_to_imputate=columns_to_imputate,
#                                              method='interpolation', id_column='patientunitstayid',
#                                              inplace=True)

In [None]:
# Impute the remaining features (`columns_to_imputate`) with the 'interpolation' method,
# using 'patientunitstayid' as the ID column (presumably so that values are only
# interpolated inside each unit stay — confirm in `du.data_processing.missing_values_imputation`).
# `zero_bool=False` is passed as the boolean columns were already filled in the previous section.
# NOTE(review): the result is reassigned although `inplace=True` is also set; this assumes
# that the function returns the dataframe in that mode — confirm, otherwise `eICU_df` is lost.
eICU_df = du.data_processing.missing_values_imputation(eICU_df, method='interpolation',
                                                       columns_to_imputate=columns_to_imputate,
                                                       id_column='patientunitstayid', 
                                                       zero_bool=False, inplace=True)
eICU_df.head()

In [None]:
# eICU_df = eICU_df.fillna(value=0)
# eICU_df.head()

In [None]:
# Checkpoint the fully imputed dataframe (8 chunks).
# The file name must be 'eICU_post_imputation', as that is the checkpoint that the
# "Rearranging columns" section loads; the plain 'eICU' name is reserved for the final
# dataframe, saved after the columns are rearranged.
du.data_processing.save_chunked_data(eICU_df, file_name='eICU_post_imputation', n_chunks=8,
                                     data_path=f'{data_path}normalized/ohe/')

In [None]:
# Inspect the missing values after imputing the remaining features
du.search_explore.dataframe_missing_values(eICU_df)

### Rearranging columns

For ease of use and for better intuition, we should make sure that the ID columns (`patientunitstayid` and `ts`) are the first ones in the dataframe.

In [None]:
# Reload the 'eICU_post_imputation' checkpoint (8 chunks), applying the dtypes in `dtype_dict`
eICU_df = du.data_processing.load_chunked_data(file_name='eICU_post_imputation', n_chunks=8, 
                                               data_path=f'{data_path}normalized/ohe/', dtypes=dtype_dict)
eICU_df.head()

In [None]:
# Current column order, as a plain list that the next cells rearrange
columns = list(eICU_df.columns)
columns

In [None]:
# Take the ID columns out of the list, so that they can be put back at the front
for id_column in ('patientunitstayid', 'ts'):
    columns.remove(id_column)

In [None]:
# Put the ID columns first, followed by the remaining columns in their existing order
columns = ['patientunitstayid', 'ts', *columns]
columns

In [None]:
# Reorder the dataframe's columns according to the rearranged `columns` list
eICU_df = eICU_df[columns]
eICU_df.head()

In [None]:
# Save the rearranged dataframe under the 'eICU' file name (8 chunks)
du.data_processing.save_chunked_data(eICU_df, file_name='eICU', n_chunks=8, 
                                     data_path=f'{data_path}normalized/ohe/')

## Setting the label

Define the label column considering the desired time window on which we want to predict mortality (0, 24h, 48h, 72h, etc).

In [None]:
# Mortality prediction time window, in hours; it is multiplied by 60 when compared
# against `death_ts - ts` in the label cell (so those timestamps are presumably in minutes — confirm)
time_window_h = 24

In [None]:
# A row is labeled positive when the time left until death (`death_ts - ts`) is at most
# the prediction window; the window is in hours and is multiplied by 60 to match the
# timestamps (presumably in minutes — confirm).
# The boolean condition itself is the label: indexing the dataframe with it, as was done
# before, yields a filtered dataframe, which can't be assigned to a single column.
# NOTE(review): rows without a `death_ts` give False with NaN-based dtypes, but <NA>
# with pandas' nullable dtypes — confirm which one applies here.
eICU_df['label'] = (eICU_df.death_ts - eICU_df.ts) <= time_window_h * 60
eICU_df.head()