# eICU Data Exploration
---

Exploring the eICU dataset from MIT, which contains data from over 139k patients collected in the US.

The eICU Collaborative Research Database is a multi-center intensive care unit (ICU) database with high granularity data for over 200,000 admissions to ICUs monitored by eICU Programs across the United States. The database is deidentified, and includes vital sign measurements, care plan documentation, severity of illness measures, diagnosis information, treatment information, and more.

## Importing the necessary packages

In [1]:
import pandas as pd              # Pandas to handle the data in dataframes
import re                        # re to do regex searches in string data
import plotly                    # Plotly for interactive and pretty plots
import plotly.graph_objs as go
from datetime import datetime    # datetime to use proper date and time formats
import os                        # os handles directory/workspace changes
from glob import glob            # Find files by name
import yaml                      # Save and load YAML files
import numpy as np               # NumPy to handle numeric and NaN operations
from tqdm import tqdm_notebook   # tqdm allows to track code execution progress
import numbers                   # numbers allows to check if data is numeric
import torch                     # PyTorch to create and apply deep learning models
from torch.utils.data.sampler import SubsetRandomSampler
import data_utils as du          # Data science and machine learning relevant methods

In [2]:
import plotly.io as pio
pio.templates

Templates configuration
-----------------------
    Default template: 'plotly'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']

Use Plotly in dark mode:

In [3]:
# Make 'plotly_dark' the default template for figures created from here on
pio.templates.default = 'plotly_dark'

In [4]:
# Move the working directory three levels up, so the relative data path below resolves from there
# NOTE(review): the target was presumed to be "Documents" - confirm for your checkout
os.chdir("../../..")
# Directory holding the dataset files, relative to the new working directory
data_path = 'data/'

In [5]:
# NOTE(review): presumably tells data_utils to operate on plain pandas dataframes - confirm against data_utils
du.set_pandas_library(lib='pandas')

Allow pandas to show more columns:

In [6]:
# Raise the display limits so wide and long dataframes are shown without truncation
pd.options.display.max_columns = 3000
pd.options.display.max_rows = 3000

Set the random seed for reproducibility:

In [7]:
# Fix the seed through data_utils so repeated runs behave the same
# NOTE(review): which random generators get seeded is decided inside data_utils - confirm
du.set_random_seed(42)

## Initializing variables

In [8]:
# Open (read-only, text mode) the YAML file that stores the column -> dtype mapping
stream_dtypes = open(f'{data_path}eICU_dtype_dict.yml', 'r')

In [9]:
# Parse the dtype mapping and close the file handle as soon as parsing is done,
# so the open descriptor isn't leaked for the rest of the session
# NOTE(review): FullLoader can construct more than plain data; for a flat mapping
# of strings yaml.safe_load would be the safer choice - confirm and switch
with stream_dtypes:
    dtype_dict = yaml.load(stream_dtypes, Loader=yaml.FullLoader)
dtype_dict

{'CAD': 'UInt8',
 'Cancer': 'UInt8',
 'Ethanol_Use_heavy': 'UInt8',
 'Ethanol_Use_hx_delirium_tremens': 'UInt8',
 'Ethanol_Use_hx_delirium_tremens_and_withdrawal_seizures': 'UInt8',
 'Ethanol_Use_hx_withdrawal_seizures': 'UInt8',
 'Ethanol_Use_moderate_(daily)': 'UInt8',
 'Ethanol_Use_moderate_(not_daily)': 'UInt8',
 'Ethanol_Use_none': 'UInt8',
 'Ethanol_Use_rare': 'UInt8',
 'Smoking_Status_20_-_40_pack_years_-_remote_hx_of_smoking': 'UInt8',
 'Smoking_Status_20_-_40_pack_years_-_still_smoking': 'UInt8',
 'Smoking_Status_<_20_pack_years_-_remote_hx_of_smoking': 'UInt8',
 'Smoking_Status_<_20_pack_years_-_still_smoking': 'UInt8',
 'Smoking_Status_>_40_pack_years_-_remote_hx_of_smoking': 'UInt8',
 'Smoking_Status_>_40_pack_years_-_still_smoking': 'UInt8',
 'Smoking_Status_denies_smoking': 'UInt8',
 'Smoking_Status_smokes_cigar_or_pipe': 'UInt8',
 'Smoking_Status_uses_smokeless_tobacco': 'UInt8',
 'admissionheight': 'float32',
 'admissionweight': 'float32',
 'age': 'float32',
 'bodyweigh

## Exploring the cleaned dataset

### Loading the data

In [None]:
# Read the 'eICU' dataset from data_path through data_utils' chunked loader,
# applying the dtypes declared in dtype_dict
eICU_df = du.data_processing.load_chunked_data(
    file_name='eICU',
    data_path=data_path,
    dtypes=dtype_dict,
    ordered_naming=False,
)
eICU_df.head()

HBox(children=(FloatProgress(value=0.0, max=800.0), HTML(value='')))

### Basic stuff

In [None]:
# Data type of every column, as loaded
eICU_df.dtypes

In [None]:
# Number of distinct values in each column
eICU_df.nunique()

In [None]:
# Missing-values overview from the data_utils helper
# NOTE(review): the exact output format is defined in data_utils, not visible here
du.search_explore.dataframe_missing_values(eICU_df)

In [None]:
# Summary statistics, flipped so that each dataframe column becomes one row
eICU_df.describe().T

Number of model features:

In [None]:
# Column count minus 3
# NOTE(review): 3 is presumably the number of non-feature columns
# (e.g. subject id, timestamp and label) - confirm
eICU_df.shape[1] - 3

Number of data points (# columns x # rows):

In [None]:
# Total number of cells in the dataframe; every column is counted here
eICU_df.shape[1] * eICU_df.shape[0]

### Label analysis

Counting the samples with positive label:

In [None]:
# Number of rows for each distinct value of the label column
label_count = eICU_df['niv_label'].value_counts()
label_count

In [None]:
# Share of rows with a positive label, as a percentage.
# .get(..., 0) avoids a KeyError when one of the two classes never occurs,
# since value_counts() only lists values that are actually present
n_positive = label_count.get(True, 0)
n_negative = label_count.get(False, 0)
print(f'{(n_positive / (n_positive + n_negative)) * 100}%')

How many subjects always have the same label in their time series:

In [None]:
# Per-subject label extremes computed in one grouped pass, instead of
# re-filtering the whole dataframe once per subject;
# sort=False keeps subjects in order of first appearance, as the loop did
label_range = eICU_df.groupby('subject_id', sort=False).niv_label.agg(['min', 'max'])
# A subject's label never changes when its minimum equals its maximum
const_label_subj = label_range.index[label_range['min'] == label_range['max']].tolist()
const_label_subj

In [None]:
# How many subjects keep a single label value across their whole time series
len(const_label_subj)

In [None]:
# Constant-label subjects as a percentage of all distinct subjects
n_subjects = eICU_df['subject_id'].nunique()
n_const_label = len(const_label_subj)
percent_const_label_subj = (n_const_label / n_subjects) * 100
print(f'{percent_const_label_subj}%')

### Time / sampling variation

In [None]:
# Difference between consecutive `ts` values within each subject;
# the first row of every subject has no predecessor and therefore gets NaN
ts_by_subject = eICU_df.groupby('subject_id')['ts']
eICU_df['delta_ts'] = ts_by_subject.diff()
eICU_df.head()

In [None]:
# Distribution of the gaps between consecutive samples
eICU_df['delta_ts'].describe()

### Sequence length analysis

In [None]:
# Sequence length per subject = number of non-null `ts` entries
seq_len = eICU_df.groupby('subject_id')['ts'].count()
seq_len.head()

In [None]:
# Summary statistics of the per-subject sequence lengths
seq_len.describe()

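### Plots

**Note:** the cells in this section read from `ALS_proc_df` and `orig_ALS_df`, which are not defined in the cells above (only `eICU_df` is loaded), and use ALS-specific columns. They appear to be carried over from an ALS analysis and need to be adapted before they can run here.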

In [None]:
# NOTE(review): `ALS_proc_df` is not defined in the visible part of this notebook - confirm the data source
# Take one row per subject, then count subjects per gender.
# Naming the column explicitly keeps `.gender` valid on pandas >= 2.0, where
# value_counts() labels its result 'count' instead of reusing the column name
ALS_proc_gender_count = ALS_proc_df.groupby('subject_id').first().gender.value_counts().to_frame(name='gender')
data = [go.Pie(labels=ALS_proc_gender_count.index, values=ALS_proc_gender_count.gender)]
layout = go.Layout(title='Patients Gender Demographics')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# NOTE(review): `orig_ALS_df` is not defined in the visible part of this notebook - confirm the data source
# One age-at-onset value per subject (first non-null entry of each subject)
age_per_subject = orig_ALS_df.groupby('subject_id').first()['age_at_onset']
data = [go.Histogram(x=age_per_subject)]
layout = go.Layout(title='Patient age distribution')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Count the rows (visits) for each NIV value.
# Naming the column explicitly keeps `.niv` valid on pandas >= 2.0, where
# value_counts() labels its result 'count' instead of reusing the column name
ALS_proc_niv_count = ALS_proc_df.niv.value_counts().to_frame(name='niv')
data = [go.Pie(labels=ALS_proc_niv_count.index, values=ALS_proc_niv_count.niv)]
layout = go.Layout(title='Visits where the patient is using NIV')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Histogram of the raw NIV values, one entry per row
niv_per_visit = ALS_proc_df['niv']
data = [go.Histogram(x=niv_per_visit)]
layout = go.Layout(title='Number of visits where the patient is using NIV')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Per subject, take the maximum NIV value seen, then count subjects per value.
# Naming the column explicitly keeps `.niv` valid on pandas >= 2.0, where
# value_counts() labels its result 'count' instead of reusing the column name
ALS_proc_patient_niv_count = ALS_proc_df.groupby('subject_id').niv.max().value_counts().to_frame(name='niv')
data = [go.Pie(labels=ALS_proc_patient_niv_count.index, values=ALS_proc_patient_niv_count.niv)]
layout = go.Layout(title='Patients which eventually use NIV')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# One marker per row: FVC on the x axis against the NIV value on the y axis
fvc_vs_niv = go.Scatter(x=ALS_proc_df['fvc'], y=ALS_proc_df['niv'], mode='markers')
data = [fvc_vs_niv]
layout = go.Layout(
    title='Relation between NIV use and FVC values',
    xaxis={'title': 'FVC'},
    yaxis={'title': 'NIV'},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Mean FVC over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'fvc'].mean()

**Comments:** The average FVC when NIV is used (NIV = 1) is lower than the overall average, but the scatter plot doesn't show a very clear dependence between the two variables.

In [None]:
# One marker per row: disease duration on the x axis against the NIV value on the y axis
duration_vs_niv = go.Scatter(x=ALS_proc_df['disease_duration'], y=ALS_proc_df['niv'], mode='markers')
data = [duration_vs_niv]
layout = go.Layout(
    title='Relation between NIV use and disease duration',
    xaxis={'title': 'Disease duration'},
    yaxis={'title': 'NIV'},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Mean disease duration over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'disease_duration'].mean()

In [None]:
# One marker per row: age at onset on the x axis against the NIV value on the y axis
age_vs_niv = go.Scatter(x=ALS_proc_df['age_at_onset'], y=ALS_proc_df['niv'], mode='markers')
data = [age_vs_niv]
layout = go.Layout(
    title='Relation between NIV use and age',
    xaxis={'title': 'Age at onset'},
    yaxis={'title': 'NIV'},
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Mean age at onset over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'age_at_onset'].mean()

In [None]:
# Row count for every (3R, NIV) combination, flattened back into regular columns
ALS_proc_NIV_3R = ALS_proc_df.groupby(['3r', 'niv'])['subject_id'].count().reset_index()
# Split the counts by NIV value so each gets its own bar series
without_niv = ALS_proc_NIV_3R[ALS_proc_NIV_3R['niv'] == 0]
with_niv = ALS_proc_NIV_3R[ALS_proc_NIV_3R['niv'] == 1]
data = [
    go.Bar(x=without_niv['3r'], y=without_niv['subject_id'], name='Not used'),
    go.Bar(x=with_niv['3r'], y=with_niv['subject_id'], name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and normalized 3R', barmode='group')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Same (3R, NIV) row counts, taken from `orig_ALS_df`
ALS_NIV_3R = orig_ALS_df.groupby(['3r', 'niv'])['subject_id'].count().reset_index()
# Split the counts by NIV value so each gets its own bar series
without_niv = ALS_NIV_3R[ALS_NIV_3R['niv'] == 0]
with_niv = ALS_NIV_3R[ALS_NIV_3R['niv'] == 1]
data = [
    go.Bar(x=without_niv['3r'], y=without_niv['subject_id'], name='Not used'),
    go.Bar(x=with_niv['3r'], y=with_niv['subject_id'], name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and 3R', barmode='group')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Mean 3R over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, '3r'].mean()

**Comments:** Clearly, the use of NIV depends strongly on the respiratory symptoms indicated by 3R, as expected.

In [None]:
# Overlay the P10 distributions of the rows without NIV (0) and with NIV (1)
p10_without_niv = ALS_proc_df.loc[ALS_proc_df['niv'] == 0, 'p10']
p10_with_niv = ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'p10']
data = [
    go.Histogram(x=p10_without_niv, name='Not used'),
    go.Histogram(x=p10_with_niv, name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and normalized P10.')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Same P10 comparison, taken from `orig_ALS_df`
p10_without_niv = orig_ALS_df.loc[orig_ALS_df['niv'] == 0, 'p10']
p10_with_niv = orig_ALS_df.loc[orig_ALS_df['niv'] == 1, 'p10']
data = [
    go.Histogram(x=p10_without_niv, name='Not used'),
    go.Histogram(x=p10_with_niv, name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and P10.')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Mean P10 over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'p10'].mean()

**Comments:** Clearly, the use of NIV depends strongly on the respiratory symptoms indicated by P10, as expected.

In [None]:
# Row count for every (R, NIV) combination, flattened back into regular columns
ALS_proc_NIV_R = ALS_proc_df.groupby(['r', 'niv'])['subject_id'].count().reset_index()
# Split the counts by NIV value so each gets its own bar series
without_niv = ALS_proc_NIV_R[ALS_proc_NIV_R['niv'] == 0]
with_niv = ALS_proc_NIV_R[ALS_proc_NIV_R['niv'] == 1]
data = [
    go.Bar(x=without_niv['r'], y=without_niv['subject_id'], name='Not used'),
    go.Bar(x=with_niv['r'], y=with_niv['subject_id'], name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and normalized R', barmode='group')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Same (R, NIV) row counts, taken from `orig_ALS_df`
ALS_NIV_R = orig_ALS_df.groupby(['r', 'niv'])['subject_id'].count().reset_index()
# Split the counts by NIV value so each gets its own bar series
without_niv = ALS_NIV_R[ALS_NIV_R['niv'] == 0]
with_niv = ALS_NIV_R[ALS_NIV_R['niv'] == 1]
data = [
    go.Bar(x=without_niv['r'], y=without_niv['subject_id'], name='Not used'),
    go.Bar(x=with_niv['r'], y=with_niv['subject_id'], name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and R', barmode='group')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Mean R over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'r'].mean()

**Comments:** There seems to be a relationship between the use of NIV and the respiratory symptoms indicated by R, as expected.

In [None]:
# Overlay the BMI distributions of the rows without NIV (0) and with NIV (1)
bmi_without_niv = ALS_proc_df.loc[ALS_proc_df['niv'] == 0, 'bmi']
bmi_with_niv = ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'bmi']
data = [
    go.Histogram(x=bmi_without_niv, name='Not used'),
    go.Histogram(x=bmi_with_niv, name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and normalized BMI.')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Same BMI comparison, taken from `orig_ALS_df`
bmi_without_niv = orig_ALS_df.loc[orig_ALS_df['niv'] == 0, 'bmi']
bmi_with_niv = orig_ALS_df.loc[orig_ALS_df['niv'] == 1, 'bmi']
data = [
    go.Histogram(x=bmi_without_niv, name='Not used'),
    go.Histogram(x=bmi_with_niv, name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and BMI.')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Mean BMI over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'bmi'].mean()

**Comments:** There is no clear, universal relationship between the use of NIV and BMI.

In [None]:
# Row count for every (P5, NIV) combination, flattened back into regular columns
ALS_proc_NIV_p5 = ALS_proc_df.groupby(['p5', 'niv'])['subject_id'].count().reset_index()
# Split the counts by NIV value so each gets its own bar series
without_niv = ALS_proc_NIV_p5[ALS_proc_NIV_p5['niv'] == 0]
with_niv = ALS_proc_NIV_p5[ALS_proc_NIV_p5['niv'] == 1]
data = [
    go.Bar(x=without_niv['p5'], y=without_niv['subject_id'], name='Not used'),
    go.Bar(x=with_niv['p5'], y=with_niv['subject_id'], name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and normalized P5', barmode='group')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Same (P5, NIV) row counts, taken from `orig_ALS_df`
ALS_NIV_p5 = orig_ALS_df.groupby(['p5', 'niv'])['subject_id'].count().reset_index()
# Split the counts by NIV value so each gets its own bar series
without_niv = ALS_NIV_p5[ALS_NIV_p5['niv'] == 0]
with_niv = ALS_NIV_p5[ALS_NIV_p5['niv'] == 1]
data = [
    go.Bar(x=without_niv['p5'], y=without_niv['subject_id'], name='Not used'),
    go.Bar(x=with_niv['p5'], y=with_niv['subject_id'], name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and P5', barmode='group')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Mean P5 over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'p5'].mean()

**Comments:** There seems to be a relationship between the use of NIV and the strength symptoms indicated by P5.

In [None]:
# Row count for every (P4, NIV) combination, flattened back into regular columns
ALS_proc_NIV_P4 = ALS_proc_df.groupby(['p4', 'niv'])['subject_id'].count().reset_index()
# Split the counts by NIV value so each gets its own bar series
without_niv = ALS_proc_NIV_P4[ALS_proc_NIV_P4['niv'] == 0]
with_niv = ALS_proc_NIV_P4[ALS_proc_NIV_P4['niv'] == 1]
data = [
    go.Bar(x=without_niv['p4'], y=without_niv['subject_id'], name='Not used'),
    go.Bar(x=with_niv['p4'], y=with_niv['subject_id'], name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and normalized P4', barmode='group')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Same (P4, NIV) row counts, taken from `orig_ALS_df`
ALS_NIV_P4 = orig_ALS_df.groupby(['p4', 'niv'])['subject_id'].count().reset_index()
# Split the counts by NIV value so each gets its own bar series
without_niv = ALS_NIV_P4[ALS_NIV_P4['niv'] == 0]
with_niv = ALS_NIV_P4[ALS_NIV_P4['niv'] == 1]
data = [
    go.Bar(x=without_niv['p4'], y=without_niv['subject_id'], name='Not used'),
    go.Bar(x=with_niv['p4'], y=with_niv['subject_id'], name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and P4', barmode='group')
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Mean P4 over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'p4'].mean()

**Comments:** There seems to be a relationship between the use of NIV and the handwriting symptoms indicated by P4.

In [None]:
# Overlay the P0.1 distributions of the rows without NIV (0) and with NIV (1)
p01_without_niv = ALS_proc_df.loc[ALS_proc_df['niv'] == 0, 'p0.1']
p01_with_niv = ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'p0.1']
data = [
    go.Histogram(x=p01_without_niv, name='Not used'),
    go.Histogram(x=p01_with_niv, name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and normalized P0.1.')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Same P0.1 comparison, taken from `orig_ALS_df`
p01_without_niv = orig_ALS_df.loc[orig_ALS_df['niv'] == 0, 'p0.1']
p01_with_niv = orig_ALS_df.loc[orig_ALS_df['niv'] == 1, 'p0.1']
data = [
    go.Histogram(x=p01_without_niv, name='Not used'),
    go.Histogram(x=p01_with_niv, name='Using NIV'),
]
layout = go.Layout(title='Relation between NIV use and P0.1.')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Mean P0.1 over the rows where NIV equals 1
ALS_proc_df.loc[ALS_proc_df['niv'] == 1, 'p0.1'].mean()

**Comments:** There is no clear, universal relationship between the use of NIV and P0.1.