# eICU Data Joining
---

Reading and joining all preprocessed parts of the eICU dataset from MIT, which contains data from over 139k patients collected in the US.

The main goal of this notebook is to prepare a single CSV document that contains all the relevant data to be used when training a machine learning model that predicts mortality, joining tables, filtering useless columns and performing imputation.

## Importing the necessary packages

In [None]:
import os                                  # os handles directory/workspace changes
import numpy as np                         # NumPy to handle numeric and NaN operations
import yaml                                # Save and load YAML files

In [None]:
# Debugging packages
import pixiedust                           # Debugging in Jupyter Notebook cells

In [None]:
# Change to parent directory (presumably "Documents")
# NOTE(review): the path is relative to the current working directory, so
# re-running this cell without restarting the kernel moves up three more levels.
os.chdir("../../..")
# Path to the CSV dataset files
data_path = 'Datasets/Thesis/eICU/uncompressed/cleaned/'
# Path to the code files
project_path = 'GitHub/eICU-mortality-prediction/'

In [None]:
# import ray                                 # Parallelization backend used by Modin
# import modin.pandas as pd                  # Optimized distributed version of Pandas
import pandas as pd
import data_utils as du                    # Data science and machine learning relevant methods

In [None]:
# Setting ray's object size limit to 5GB
# ray.shutdown()
# ray.init(memory=8000 * 1024 * 1024,
#          object_store_memory=5000 * 1024 * 1024)

Set the random seed for reproducibility

In [None]:
du.set_random_seed(42)

## Loading the data

### Patient information

In [None]:
patient_df = pd.read_csv(f'{data_path}normalized/patient.csv')
patient_df.head()

In [None]:
du.search_explore.dataframe_missing_values(patient_df)

Remove rows that don't identify the patient:

In [None]:
# Keep only the rows that carry a unit stay identifier
patient_df = patient_df[patient_df.patientunitstayid.notnull()]

In [None]:
du.search_explore.dataframe_missing_values(patient_df)

In [None]:
# Cast the identifier and timestamp columns to integers in a single, explicit
# reassignment; setting them through attribute access on the previously
# filtered dataframe can trigger pandas' SettingWithCopyWarning.
patient_df = patient_df.astype({'patientunitstayid': int, 'ts': int})
patient_df.dtypes

In [None]:
note_df = pd.read_csv(f'{data_path}normalized/note.csv')
note_df.head()

Remove the unneeded 'Unnamed: 0' column:

In [None]:
# Drop the leftover index column from the patient table
patient_df = patient_df.drop(columns='Unnamed: 0')
# NOTE(review): the equivalent drop for the note table is disabled; presumably
# note.csv has no 'Unnamed: 0' column - confirm, otherwise it ends up in the join.
# note_df = note_df.drop(columns='Unnamed: 0')

### Diagnosis

In [None]:
diagns_df = pd.read_csv(f'{data_path}normalized/diagnosis.csv')
diagns_df.head()

In [None]:
alrg_df = pd.read_csv(f'{data_path}normalized/allergy.csv')
alrg_df.head()

In [None]:
past_hist_df = pd.read_csv(f'{data_path}normalized/pastHistory.csv')
past_hist_df.head()

Remove the unneeded 'Unnamed: 0' column:

In [None]:
# Discard the leftover index column from each diagnosis-related table
diagns_df = diagns_df.drop('Unnamed: 0', axis=1)
alrg_df = alrg_df.drop('Unnamed: 0', axis=1)
past_hist_df = past_hist_df.drop('Unnamed: 0', axis=1)

### Treatments

In [None]:
treat_df = pd.read_csv(f'{data_path}normalized/treatment.csv')
treat_df.head()

In [None]:
adms_drug_df = pd.read_csv(f'{data_path}normalized/admissionDrug.csv')
adms_drug_df.head()

In [None]:
inf_drug_df = pd.read_csv(f'{data_path}normalized/infusionDrug.csv')
inf_drug_df.head()

In [None]:
med_df = pd.read_csv(f'{data_path}normalized/medication.csv')
med_df.head()

In [None]:
# in_out_df = pd.read_csv(f'{data_path}normalized/intakeOutake.csv')
# in_out_df.head()

Remove the unneeded 'Unnamed: 0' column:

In [None]:
# Discard the leftover index column from each treatment-related table
treat_df = treat_df.drop('Unnamed: 0', axis=1)
adms_drug_df = adms_drug_df.drop('Unnamed: 0', axis=1)
inf_drug_df = inf_drug_df.drop('Unnamed: 0', axis=1)
med_df = med_df.drop('Unnamed: 0', axis=1)
# in_out_df = in_out_df.drop(columns='Unnamed: 0')

### Nursing data

In [None]:
nurse_care_df = pd.read_csv(f'{data_path}normalized/nurseCare.csv')
nurse_care_df.head()

In [None]:
nurse_assess_df = pd.read_csv(f'{data_path}normalized/nurseAssessment.csv')
nurse_assess_df.head()

Remove the unneeded 'Unnamed: 0' column:

In [None]:
# Discard the leftover index column from both nursing tables
nurse_care_df = nurse_care_df.drop('Unnamed: 0', axis=1)
nurse_assess_df = nurse_assess_df.drop('Unnamed: 0', axis=1)

### Respiratory data

In [None]:
resp_care_df = pd.read_csv(f'{data_path}normalized/respiratoryCare.csv')
resp_care_df.head()

Remove the unneeded 'Unnamed: 0' column:

In [None]:
# Discard the leftover index column from the respiratory care table
resp_care_df = resp_care_df.drop('Unnamed: 0', axis=1)

### Vital signs

In [None]:
# vital_aprdc_df = pd.read_csv(f'{data_path}normalized/vitalAperiodic.csv')
# vital_aprdc_df.head()

In [None]:
# vital_prdc_df = pd.read_csv(f'{data_path}normalized/vitalPeriodic.csv')
# vital_prdc_df.head()

Remove the unneeded 'Unnamed: 0' column:

In [None]:
# vital_aprdc_df = vital_aprdc_df.drop(columns='Unnamed: 0')
# vital_prdc_df = vital_prdc_df.drop(columns='Unnamed: 0')

### Exams data

In [None]:
# lab_df = pd.read_csv(f'{data_path}normalized/lab.csv')
# lab_df.head()

In [None]:
# phys_exam_df = pd.read_csv(f'{data_path}normalized/physicalExam.csv')
# phys_exam_df.head()

Remove the unneeded 'Unnamed: 0' column:

In [None]:
# lab_df = lab_df.drop(columns='Unnamed: 0')
# phys_exam_df = phys_exam_df.drop(columns='Unnamed: 0')

## Joining dataframes

### Checking the matching of unit stays IDs

In [None]:
full_stays_list = set(patient_df.patientunitstayid.unique())

Total number of unit stays:

In [None]:
len(full_stays_list)

In [None]:
note_stays_list = set(note_df.patientunitstayid.unique())

In [None]:
len(note_stays_list)

Number of unit stays that have note data:

In [None]:
len(set.intersection(full_stays_list, note_stays_list))

In [None]:
diagns_stays_list = set(diagns_df.patientunitstayid.unique())

In [None]:
len(diagns_stays_list)

Number of unit stays that have diagnosis data:

In [None]:
len(set.intersection(full_stays_list, diagns_stays_list))

In [None]:
alrg_stays_list = set(alrg_df.patientunitstayid.unique())

In [None]:
len(alrg_stays_list)

Number of unit stays that have allergy data:

In [None]:
len(set.intersection(full_stays_list, alrg_stays_list))

In [None]:
past_hist_stays_list = set(past_hist_df.patientunitstayid.unique())

In [None]:
len(past_hist_stays_list)

Number of unit stays that have past history data:

In [None]:
len(set.intersection(full_stays_list, past_hist_stays_list))

In [None]:
treat_stays_list = set(treat_df.patientunitstayid.unique())

In [None]:
len(treat_stays_list)

Number of unit stays that have treatment data:

In [None]:
len(set.intersection(full_stays_list, treat_stays_list))

In [None]:
adms_drug_stays_list = set(adms_drug_df.patientunitstayid.unique())

In [None]:
len(adms_drug_stays_list)

Number of unit stays that have admission drug data:

In [None]:
len(set.intersection(full_stays_list, adms_drug_stays_list))

In [None]:
inf_drug_stays_list = set(inf_drug_df.patientunitstayid.unique())

In [None]:
len(inf_drug_stays_list)

Number of unit stays that have infusion drug data:

In [None]:
len(set.intersection(full_stays_list, inf_drug_stays_list))

In [None]:
med_stays_list = set(med_df.patientunitstayid.unique())

In [None]:
len(med_stays_list)

Number of unit stays that have medication data:

In [None]:
len(set.intersection(full_stays_list, med_stays_list))

In [None]:
nurse_care_stays_list = set(nurse_care_df.patientunitstayid.unique())

In [None]:
len(nurse_care_stays_list)

Number of unit stays that have nurse care data:

In [None]:
len(set.intersection(full_stays_list, nurse_care_stays_list))

In [None]:
nurse_assess_stays_list = set(nurse_assess_df.patientunitstayid.unique())

In [None]:
len(nurse_assess_stays_list)

Number of unit stays that have nurse assessment data:

In [None]:
len(set.intersection(full_stays_list, nurse_assess_stays_list))

In [None]:
resp_care_stays_list = set(resp_care_df.patientunitstayid.unique())

In [None]:
len(resp_care_stays_list)

Number of unit stays that have respiratory care data:

In [None]:
len(set.intersection(full_stays_list, resp_care_stays_list))

### Joining patient with note data

In [None]:
eICU_df = pd.merge(patient_df, note_df, how='outer', on=['patientunitstayid', 'ts'])
eICU_df.head()

### Joining with diagnosis data

In [None]:
eICU_df = pd.merge(eICU_df, diagns_df, how='outer', on=['patientunitstayid', 'ts'])
eICU_df.head()

### Joining with allergy data

In [None]:
eICU_df = pd.merge(eICU_df, alrg_df, how='outer', on=['patientunitstayid', 'ts'])
eICU_df.head()

### Joining with past history data

In [None]:
len(eICU_df)

In [None]:
# NOTE(review): unlike the other joins, this one matches on `patientunitstayid`
# only (no `ts`), so each past history row is paired with every row of its unit
# stay. If `past_hist_df` also has a `ts` column, the result will carry
# `ts_x`/`ts_y` instead of `ts` - confirm this is intended.
eICU_df = pd.merge(eICU_df, past_hist_df, how='outer', on='patientunitstayid')
eICU_df.head()

In [None]:
len(eICU_df)

### Joining with treatment data

In [None]:
eICU_df = pd.merge(eICU_df, treat_df, how='outer', on=['patientunitstayid', 'ts'])
eICU_df.head()

### Joining with admission drug data

In [None]:
eICU_df = pd.merge(eICU_df, adms_drug_df, how='outer', on=['patientunitstayid', 'ts'])
eICU_df.head()

### Joining with medication data

In [None]:
eICU_df = pd.merge(eICU_df, med_df, how='outer', on=['patientunitstayid', 'ts'])
eICU_df.head()

### Joining with intake output data

In [None]:
# eICU_df = pd.merge(eICU_df, in_out_df, how='outer', on=['patientunitstayid', 'ts'])
# eICU_df.head()

### Joining with nurse care data

In [None]:
eICU_df = pd.merge(eICU_df, nurse_care_df, how='outer', on=['patientunitstayid', 'ts'])
eICU_df.head()

### Joining with nurse assessment data

In [None]:
eICU_df = pd.merge(eICU_df, nurse_assess_df, how='outer', on=['patientunitstayid', 'ts'])
eICU_df.head()

### Joining with nurse charting data

In [None]:
# eICU_df = pd.merge(eICU_df, nurse_chart_df, how='outer', on=['patientunitstayid', 'ts'])
# eICU_df.head()

### Joining with respiratory care data

In [None]:
eICU_df = pd.merge(eICU_df, resp_care_df, how='outer', on=['patientunitstayid', 'ts'])
eICU_df.head()

### Joining with aperiodic vital signs data

In [None]:
# eICU_df = pd.merge(eICU_df, vital_aprdc_df, how='outer', on=['patientunitstayid', 'ts'])
# eICU_df.head()

### Joining with periodic vital signs data

In [None]:
# eICU_df = pd.merge(eICU_df, vital_prdc_df, how='outer', on=['patientunitstayid', 'ts'])
# eICU_df.head()

### Joining with lab data

In [None]:
# eICU_df = pd.merge(eICU_df, lab_df, how='outer', on=['patientunitstayid', 'ts'])
# eICU_df.head()

### Joining with physical exam data

In [None]:
# eICU_df = pd.merge(eICU_df, phys_exam_df, how='outer', on=['patientunitstayid', 'ts'])
# eICU_df.head()

## Cleaning the joined data

### Removing unit stays that are too short

In [None]:
eICU_df.info()

Make sure that the dataframe is ordered by time `ts`:

In [None]:
eICU_df = eICU_df.sort_values('ts')
eICU_df.head()

Remove unit stays that have less than 10 records:

In [None]:
# Number of rows recorded for each unit stay
unit_stay_len = eICU_df.groupby('patientunitstayid')['patientunitstayid'].count()
unit_stay_len

In [None]:
unit_stay_short = set(unit_stay_len[unit_stay_len < 10].index)
unit_stay_short

In [None]:
len(unit_stay_short)

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[~eICU_df.patientunitstayid.isin(unit_stay_short)]

In [None]:
eICU_df.patientunitstayid.nunique()

Remove unit stays that have data that represent less than 48h:

In [None]:
# Time span (latest minus earliest `ts`) covered by each unit stay, computed
# with vectorized group aggregations instead of a per-group Python lambda,
# which is slow on a dataframe with this many groups.
ts_by_stay = eICU_df.groupby('patientunitstayid').ts
unit_stay_duration = ts_by_stay.max() - ts_by_stay.min()
unit_stay_duration

In [None]:
unit_stay_short = set(unit_stay_duration[unit_stay_duration < 48*60].index)
unit_stay_short

In [None]:
len(unit_stay_short)

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
eICU_df = eICU_df[~eICU_df.patientunitstayid.isin(unit_stay_short)]

In [None]:
eICU_df.patientunitstayid.nunique()

### Joining duplicate columns

#### Continuous features

In [None]:
# Base names of the columns that were duplicated by the merges (pandas appends
# `_x`/`_y` to clashing column names). Only the trailing suffix is removed, so
# names that contain `_x` or `_y` somewhere else are not truncated.
{col[:-2] for col in eICU_df.columns if col.endswith(('_x', '_y'))}

In [None]:
eICU_df[['drugdosage_x', 'drughiclseqno_x',
         'drugdosage_y', 'drughiclseqno_y']].head(20)

In [None]:
eICU_df[eICU_df.index == 2564878][['drugdosage_x', 'drughiclseqno_x',
                                   'drugdosage_y', 'drughiclseqno_y']]

In [None]:
eICU_df = du.data_processing.merge_columns(eICU_df, cols_to_merge=['drugdosage', 'drughiclseqno'])
eICU_df.sample(20)

In [None]:
eICU_df[['drugdosage', 'drughiclseqno']].head(20)

In [None]:
eICU_df[eICU_df.index == 2564878][['drugdosage', 'drughiclseqno']]

#### Categorical features

Join encodings of the same features, from different tables.

Load encoding dictionaries:

In [None]:
# Load the categorical encoding dictionaries of the admission drug and
# medication tables; the context managers close the files once parsed.
with open(f'{data_path}cat_embed_feat_enum_adms_drug.yaml', 'r') as stream_adms_drug:
    cat_embed_feat_enum_adms_drug = yaml.load(stream_adms_drug, Loader=yaml.FullLoader)
with open(f'{data_path}cat_embed_feat_enum_med.yaml', 'r') as stream_med:
    cat_embed_feat_enum_med = yaml.load(stream_med, Loader=yaml.FullLoader)

In [None]:
eICU_df[['drugadmitfrequency_x', 'drugunit_x',
         'drugadmitfrequency_y', 'drugunit_y']].head(20)

Standardize the encoding of similar columns:

In [None]:
list(cat_embed_feat_enum_adms_drug.keys())

In [None]:
list(cat_embed_feat_enum_med.keys())

In [None]:
eICU_df.to_csv(f'{data_path}normalized/eICU_post_merge_continuous_cols.csv')

In [None]:
# `to_csv` wrote the dataframe's index as the first, unnamed column; read it
# back as the index so that it doesn't reappear as an 'Unnamed: 0' feature.
eICU_df = pd.read_csv(f'{data_path}normalized/eICU_post_merge_continuous_cols.csv', index_col=0)
eICU_df.head()

In [None]:
# Dictionary that gathers the encodings of the joined dataframe's categorical
# features; it has to exist before the converged encodings are stored in it
# (otherwise the assignment below raises a NameError on a fresh kernel).
cat_embed_feat_enum = dict()
# Converge the admission drug (`_x`) and medication (`_y`) encodings of the
# drug admission frequency, keeping the resulting encoding dictionary.
eICU_df, cat_embed_feat_enum['drugadmitfrequency'] = du.embedding.converge_enum(
    eICU_df,
    cat_feat_name=['drugadmitfrequency_x', 'drugadmitfrequency_y'],
    dict1=cat_embed_feat_enum_adms_drug['drugadmitfrequency'],
    dict2=cat_embed_feat_enum_med['frequency'],
    nan_value=0, sort=True, inplace=True)

In [None]:
# Converge the drug unit encodings of the admission drug (`_x`, `dict1`) and
# medication (`_y`, `dict2`) tables, storing the resulting encoding dictionary
# under the 'drugunit' key.
eICU_df, cat_embed_feat_enum['drugunit'] = du.embedding.converge_enum(eICU_df, cat_feat_name=['drugunit_x', 
                                                                                              'drugunit_y'],
                                                                      dict1=cat_embed_feat_enum_adms_drug['drugunit'],
                                                                      dict2=cat_embed_feat_enum_med['drugunit'],
                                                                      nan_value=0, sort=True, inplace=True)

Merge the features:

In [None]:
eICU_df = du.data_processing.merge_columns(eICU_df, cols_to_merge=['drugadmitfrequency', 'drugunit'])
eICU_df.sample(20)

In [None]:
eICU_df[['drugadmitfrequency', 'drugunit']].head(20)

In [None]:
eICU_df.to_csv(f'{data_path}normalized/eICU_post_merge_categorical_cols.csv')

### Creating a single encoding dictionary for the complete dataframe

Combine the encoding dictionaries of all tables, taking into account the converged ones, into a single dictionary representative of all the categorical features in the resulting dataframe.

In [None]:
# [TODO] Add dictionaries if their keys aren't already in the overall dictionary `cat_embed_feat_enum`

Save the final encoding dictionary:

In [None]:
# Save next to the per-table encoding dictionaries: `data_path` already ends in
# 'cleaned/', so no extra directory is appended. The context manager makes sure
# that the file is flushed and closed.
with open(f'{data_path}cat_embed_feat_enum_eICU.yaml', 'w') as stream:
    yaml.dump(cat_embed_feat_enum, stream, default_flow_style=False)

### Removing columns with too many missing values

We should remove features that have too many missing values (in this case, those that have more than 70% of missing values). Without enough data, it's even risky to do imputation, as it's unlikely for the imputation to correctly model the missing feature.

In [None]:
du.search_explore.dataframe_missing_values(eICU_df)

In [None]:
prev_features = eICU_df.columns
len(prev_features)

In [None]:
eICU_df = du.data_processing.remove_cols_with_many_nans(eICU_df, nan_percent_thrsh=70, inplace=True)

In [None]:
features = eICU_df.columns
len(features)

Removed features:

In [None]:
set(prev_features) - set(features)

In [None]:
eICU_df.head()

### Removing rows with too many missing values

This actually might not make sense to do, as some tables, such as `patient`, are set in a timestamp that is unlikely to have matches in other tables, although it's still useful to add.

In [None]:
# len(eICU_df)

In [None]:
# n_features = len(eICU_df.columns)
# n_features

In [None]:
# eICU_df = eICU_df[eICU_df.isnull().sum(axis=1) < 0.5 * n_features]

In [None]:
# len(eICU_df)

### Removing unit stays with too many missing values

Consider removing all unit stays that have, combining rows and columns, a very high percentage of missing values.

In [None]:
n_features = len(eICU_df.columns)
n_features

Create a temporary column that counts each row's number of missing values:

In [None]:
# Temporary helper column: number of missing values in each row
eICU_df['row_msng_val'] = eICU_df.isnull().sum(axis=1)
# The identifier column is `patientunitstayid` ('patientunitstay' raises a KeyError)
eICU_df[['patientunitstayid', 'ts', 'row_msng_val']].head()

Check each unit stay's percentage of missing data points:

In [None]:
# Number of possible data points in each unit stay
n_data_points = eICU_df.groupby('patientunitstayid').ts.count() * n_features
n_data_points

In [None]:
# Number of missing values in each unit stay
n_msng_val = eICU_df.groupby('patientunitstayid').row_msng_val.sum()
n_msng_val

In [None]:
# Percentage of missing values in each unit stay
msng_val_prct = (n_msng_val / n_data_points) * 100
msng_val_prct

In [None]:
msng_val_prct.describe()

Remove unit stays that have too many missing values (>70% of their respective data points):

In [None]:
unit_stay_high_msgn = set(msng_val_prct[msng_val_prct > 70].index)
unit_stay_high_msgn

In [None]:
eICU_df.patientunitstayid.nunique()

In [None]:
# Keep only the unit stays below the missing values threshold and discard the
# temporary `row_msng_val` helper column, so that it doesn't end up imputed and
# exported as a feature (`errors='ignore'` keeps the cell re-runnable).
eICU_df = eICU_df[~eICU_df.patientunitstayid.isin(unit_stay_high_msgn)]
eICU_df = eICU_df.drop(columns='row_msng_val', errors='ignore')

In [None]:
eICU_df.patientunitstayid.nunique()

### Performing imputation

In [None]:
du.search_explore.dataframe_missing_values(eICU_df)

In [None]:
# [TODO] Be careful to avoid interpolating categorical features (e.g. `drugunit`); these must only
# be imputed through zero filling
# The identifier column is `patientunitstayid`, as used in every join and groupby above.
eICU_df = du.data_processing.missing_values_imputation(eICU_df, method='interpolation',
                                                       id_column='patientunitstayid', inplace=True)
eICU_df.head()

In [None]:
du.search_explore.dataframe_missing_values(eICU_df)

## Setting the label

Define the label column considering the desired time window on which we want to predict mortality (0, 24h, 48h, 72h, etc).

In [None]:
time_window_h = 24

In [None]:
# Positive label (1) for every row whose timestamp is within `time_window_h`
# hours of the death timestamp (the window is converted with `* 60`, so `ts` is
# presumably in minutes - confirm). The condition itself is stored, rather than
# the dataframe filtered by it; rows with a missing `death_ts` compare as False
# and therefore get 0.
eICU_df['label'] = (eICU_df.death_ts - eICU_df.ts <= time_window_h * 60).astype(int)
eICU_df.head()