In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2

# below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
plt.style.use('ggplot')

In [2]:
# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

# Connect to postgres with a copy of the MIMIC-III database
con = psycopg2.connect(dbname=dbname, user=sqluser)

# the below statement is prepended to queries to ensure they select from the right schema
query_schema = 'set search_path to ' + schema_name + ';'

In [3]:
def apply_inclusion_criteria(df):
    print('Initial size of table : ' + str(df.shape[0]))
    df = df.drop_duplicates(['icustay_id','creatinine_time'])
    print('After dropping duplicates : ' + str(df.shape[0]))
    criteria_list = [i for i in df.columns.values if 'inclusion' in i]
    for c in criteria_list:
        df = df.loc[df[c]==1,:].drop(c,axis=1)    
    print('After applying inclusion criteria : ' + str(df.shape[0]))
    return df

# Load file with features to be found in chartevents table

In [4]:
features_info = pd.read_csv('features_info.csv')
print(features_info)

                          name        table  variable  item_id  item_id_2  \
0   Arterial_pressure_systolic  chartevents  valuenum        6       51.0   
1  Arterial_pressure_diastolic  chartevents  valuenum     8364     8368.0   
2                   Heart_rate  chartevents  valuenum      211   220045.0   
3                 Weight_daily  chartevents  valuenum      763   224639.0   
4                  Temperature  chartevents  valuenum   223761      678.0   
5                 Urine_output  chartevents  valuenum    43966    44706.0   
6             day_urine_output  chartevents  valuenum    43372        NaN   

   item_id_3  item_id_4  item_id_5  item_id_6  
0      455.0   220050.0   220179.0   225309.0  
1     8441.0   220051.0   220180.0   225310.0  
2        NaN        NaN        NaN        NaN  
3        NaN        NaN        NaN        NaN  
4      645.0        NaN        NaN        NaN  
5        NaN        NaN        NaN        NaN  
6        NaN        NaN        NaN        NaN  

# Retrieve interesting features for patients that match inclusion criteria

In [5]:
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    ce.valuenum as creatinine, ce.storetime as creatinine_time,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join chartevents ce
    on icu.subject_id = ce.subject_id
    and ce.itemid = 220615
    and ce.valuenum is not null
    and ce.storetime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
),
cr_inc as
(
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr
),
cr_feat as
(
select
cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
    cr_inc.creatinine, cr_inc.creatinine_time,
    cr_inc.age, cr_inc.inclusion_age,
    cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
    cr_inc.rrt, cr_inc.inclusion_rrt,
    ce.valuenum as """+features_info.loc[0,'name']+""", EXTRACT('epoch' from cr_inc.creatinine_time - ce.storetime) as """+features_info.loc[0,'name']+"""_delay, ce.storetime as """+features_info.loc[0,'name']+"""_time
  from cr_inc
  inner join """+features_info.loc[0,'table']+""" ce
    on cr_inc.subject_id = ce.subject_id
    and ce.itemid = """+str(features_info.loc[0,'item_id'])+"""
    and ce."""+features_info.loc[0,'variable']+""" is not null
    and ce.storetime between cr_inc.intime and cr_inc.creatinine_time
)
select 
a.subject_id, a.icustay_id, a.intime, a.outtime,
    a.creatinine, a.creatinine_time,
    a.age, a.inclusion_age,
    a.length_of_stay, a.inclusion_length_of_stay,
    a.rrt, a.inclusion_rrt,
    a."""+features_info.loc[0,'name']+""", a."""+features_info.loc[0,'name']+"""_delay, a."""+features_info.loc[0,'name']+"""_time
from cr_feat as a
    join (
        select creatinine_time, min("""+features_info.loc[0,'name']+"""_delay) as """+features_info.loc[0,'name']+"""_delay
        from cr_feat
        group by creatinine_time
    ) as b on a.creatinine_time = b.creatinine_time
where a."""+features_info.loc[0,'name']+"""_delay = b."""+features_info.loc[0,'name']+"""_delay
"""
df_chartevents = pd.read_sql_query(query, con)
print(df_chartevents.head())
df_chartevents = apply_inclusion_criteria(df_chartevents)
print(df_chartevents.head())
    
for i,row in features_info.loc[1:,:].iterrows():
    print('------------------------------------')
    print('--- Processing feature : ' + row['name'])
    print(row['table'], row['variable'])

    query = query_schema + """
    with cr as
    (
    select
        icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
        ce.valuenum as creatinine, ce.storetime as creatinine_time,
        EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
        (rrt.icustay_id is null) as rrt
      from icustays icu
      inner join chartevents ce
        on icu.subject_id = ce.subject_id
        and ce.itemid = 220615
        and ce.valuenum is not null
        and ce.storetime between icu.intime and icu.outtime
      inner join patients pat
        ON icu.subject_id = pat.subject_id
      left outer join rrt 
        on icu.icustay_id = rrt.icustay_id
    ),
    cr_inc as
    (
    select
    cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
        cr.creatinine, cr.creatinine_time,
        cr.age, CASE
                    WHEN cr.age >= 15 then 1
                ELSE 0 END
                as inclusion_age,
      cr.length_of_stay, CASE
                            WHEN cr.length_of_stay >= 48 then 1
                         ELSE 0 END
                         as inclusion_length_of_stay,
      cr.rrt, CASE
                WHEN cr.rrt = False then 1
              ELSE 0 END
              as inclusion_rrt
      from cr
    ),
    cr_feat as
    (
    select
    cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
        cr_inc.creatinine, cr_inc.creatinine_time,
        cr_inc.age, cr_inc.inclusion_age,
        cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
        cr_inc.rrt, cr_inc.inclusion_rrt,
        ce.valuenum as """+features_info.loc[i,'name']+""", EXTRACT('epoch' from cr_inc.creatinine_time - ce.storetime) as """+features_info.loc[i,'name']+"""_delay, ce.storetime as """+features_info.loc[i,'name']+"""_time
      from cr_inc
      inner join """+features_info.loc[i,'table']+""" ce
        on cr_inc.subject_id = ce.subject_id
        and ce.itemid = """+str(features_info.loc[i,'item_id'])+"""
        and ce."""+features_info.loc[i,'variable']+""" is not null
        and ce.storetime between cr_inc.intime and cr_inc.creatinine_time
    )
    select 
    a.subject_id, a.icustay_id, a.intime, a.outtime,
        a.creatinine, a.creatinine_time,
        a.age, a.inclusion_age,
        a.length_of_stay, a.inclusion_length_of_stay,
        a.rrt, a.inclusion_rrt,
        a."""+features_info.loc[i,'name']+""", a."""+features_info.loc[i,'name']+"""_delay, a."""+features_info.loc[i,'name']+"""_time
    from cr_feat as a
        join (
            select creatinine_time, min("""+features_info.loc[i,'name']+"""_delay) as """+features_info.loc[i,'name']+"""_delay
            from cr_feat
            group by creatinine_time
        ) as b on a.creatinine_time = b.creatinine_time
    where a."""+features_info.loc[i,'name']+"""_delay = b."""+features_info.loc[i,'name']+"""_delay
    """
    df = pd.read_sql_query(query, con)
    df = apply_inclusion_criteria(df)
    print(df.head())
    df_chartevents = pd.merge(df_chartevents,df,on=['subject_id', 'icustay_id', 'intime', 'outtime', 'creatinine',
       'creatinine_time', 'age', 'length_of_stay', 'rrt'],how='outer')
    print('---------------------------------')
    print('Merged table size : ' + str(df_chartevents.shape[0]))
    print(df_chartevents.head())    

Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, inclusion_age, length_of_stay, inclusion_length_of_stay, rrt, inclusion_rrt, arterial_pressure_systolic, arterial_pressure_systolic_delay, arterial_pressure_systolic_time]
Index: []
Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, length_of_stay, rrt, arterial_pressure_systolic, arterial_pressure_systolic_delay, arterial_pressure_systolic_time]
Index: []
------------------------------------
--- Processing feature : Arterial_pressure_diastolic
chartevents valuenum
Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, length_of_stay, rrt, arterial_pressure_diastolic, arterial_pressure_diastolic_delay, arter

  stride //= shape[i]


Initial size of table : 20983
After dropping duplicates : 16755
After applying inclusion criteria : 14391
    subject_id  icustay_id              intime             outtime  \
6          191      280149 2196-04-09 19:52:59 2196-04-15 18:33:22   
7          191      280149 2196-04-09 19:52:59 2196-04-15 18:33:22   
9          191      280149 2196-04-09 19:52:59 2196-04-15 18:33:22   
18         191      280149 2196-04-09 19:52:59 2196-04-15 18:33:22   
19         191      280149 2196-04-09 19:52:59 2196-04-15 18:33:22   

    creatinine     creatinine_time        age  length_of_stay    rrt  \
6          1.8 2196-04-10 00:30:00  73.159244      142.673056  False   
7          2.0 2196-04-10 06:39:00  73.159244      142.673056  False   
9          2.2 2196-04-11 00:30:00  73.159244      142.673056  False   
18         2.6 2196-04-11 05:57:00  73.159244      142.673056  False   
19         2.3 2196-04-12 03:40:00  73.159244      142.673056  False   

    heart_rate  heart_rate_delay     hea

---------------------------------
Merged table size : 120650
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

In [6]:
# Dump to file
df_chartevents.to_csv('creatinine_measurements_1.csv')
print(df_chartevents)

       arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                             NaN                              NaN   
1                             NaN                              NaN   
2                             NaN                              NaN   
3                             NaN                              NaN   
4                             NaN                              NaN   
5                             NaN                              NaN   
6                             NaN                              NaN   
7                             NaN                              NaN   
8                             NaN                              NaN   
9                             NaN                              NaN   
10                            NaN                              NaN   
11                            NaN                              NaN   
12                            NaN                              NaN   
13                  

# Retrieve missing static information

In [7]:
# Creatinine measurements, AKI flags + stage for all stays
# Inclusion flags :
# patient age > 15
# length of stay > 48hours
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    ce.valuenum as creatinine, ce.storetime as creatinine_time,
    adm.ethnicity, adm.diagnosis,
    pat.gender as gender,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join chartevents ce
    on icu.subject_id = ce.subject_id
    and ce.itemid = 220615
    and ce.valuenum is not null
    and ce.storetime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  inner join admissions adm
    on icu.subject_id = adm.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
)
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.ethnicity, cr.diagnosis,
    cr.gender,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr
"""
df_static = pd.read_sql_query(query, con)
print(df_static.head())
df_static = apply_inclusion_criteria(df_static)
print(df_static.head())

   subject_id  icustay_id              intime             outtime  creatinine  \
0         494      268296 2171-12-20 22:10:57 2171-12-21 22:37:41         1.0   
1         494      268296 2171-12-20 22:10:57 2171-12-21 22:37:41         1.0   
2         199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50         1.1   
3         199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50         1.1   
4         199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50         1.1   

      creatinine_time        ethnicity       diagnosis gender        age  \
0 2171-12-21 02:16:00  WHITE - RUSSIAN      CHEST PAIN      F  61.997044   
1 2171-12-21 02:16:00  WHITE - RUSSIAN          ANEMIA      F  61.997044   
2 2174-04-17 02:12:00            WHITE  CARDIOMYOPATHY      M  48.500607   
3 2174-04-14 03:11:00            WHITE  CARDIOMYOPATHY      M  48.500607   
4 2174-04-15 03:29:00            WHITE  CARDIOMYOPATHY      M  48.500607   

   inclusion_age  length_of_stay  inclusion_length_of_st

In [8]:
# Dump to file
df_static.to_csv('creatinine_measurements_2.csv')
print(df_static)

        subject_id  icustay_id              intime             outtime  \
2              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
3              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
4              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
5              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
6              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
7              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
8              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
9              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
18             209      229904 2127-08-11 20:43:43 2127-08-15 20:53:22   
19             209      229904 2127-08-11 20:43:43 2127-08-15 20:53:22   
20             209      229904 2127-08-11 20:43:43 2127-08-15 20:53:22   
21             209      229904 2127-08-11 20:43:43 2127-08-15 20:53:22   
22             209      229904 2127-08

# Merge tables into one

In [9]:
print('Chartevents :')
print(df_chartevents.shape)
print(df_chartevents.columns.values)
print('')

print('Static:')
print(df_static.shape)
print(df_static.columns.values)

Chartevents :
(120650, 30)
['arterial_pressure_systolic' 'arterial_pressure_systolic_delay'
 'arterial_pressure_systolic_time' 'arterial_pressure_diastolic'
 'arterial_pressure_diastolic_delay' 'arterial_pressure_diastolic_time'
 'subject_id' 'icustay_id' 'intime' 'outtime' 'creatinine'
 'creatinine_time' 'age' 'length_of_stay' 'rrt' 'heart_rate'
 'heart_rate_delay' 'heart_rate_time' 'weight_daily' 'weight_daily_delay'
 'weight_daily_time' 'temperature' 'temperature_delay' 'temperature_time'
 'urine_output' 'urine_output_delay' 'urine_output_time' 'day_urine_output'
 'day_urine_output_delay' 'day_urine_output_time']

Static:
(126881, 12)
['subject_id' 'icustay_id' 'intime' 'outtime' 'creatinine'
 'creatinine_time' 'ethnicity' 'diagnosis' 'gender' 'age' 'length_of_stay'
 'rrt']


In [10]:
merged_df = pd.merge(df_chartevents,df_static,on=['subject_id', 'icustay_id', 'intime', 'outtime', 'creatinine',
       'creatinine_time', 'age', 'length_of_stay', 'rrt'],how='outer')

# Remove columns that are not features (except icustay_id and patient_id that are needed to build the table with labels)

In [11]:
# REMOVE COLUMNS THAT WERE USED FOR INCLUSION CRITERIA BUT THAT ARE NOT AVAILABLE FEATURES FOR THE PREDICTION
# REMOVE ICUSTAY_IDS FOR WHICH THERE IS ONLY ONE MEASUREMENT OF CREATININE

print('Number of lines : ' + str(merged_df.shape[0]))
to_remove = ['intime','outtime','length_of_stay','rrt']
for c in to_remove:
    if (c in merged_df.columns.values): merged_df = merged_df.drop(c,axis=1)
        
# Remove columns with _time suffix
to_remove = [i for i in merged_df.columns.values if '_time' in i]
for c in to_remove:
    if (c in merged_df.columns.values): merged_df = merged_df.drop(c,axis=1)

# Remove icustay_ids for which there's only one measurement of creatinine
count_mes = merged_df['icustay_id'].value_counts()
to_remove = count_mes.index.values[count_mes==1]
for i in to_remove:
    merged_df = merged_df.loc[merged_df['icustay_id']!=i,:]

print('After dropping unique measurements of creatinine : ' + str(merged_df.shape[0]))
merged_df.head()

Number of lines : 126884
After dropping unique measurements of creatinine : 126815


Unnamed: 0,arterial_pressure_systolic,arterial_pressure_systolic_delay,arterial_pressure_diastolic,arterial_pressure_diastolic_delay,subject_id,icustay_id,creatinine,age,heart_rate,heart_rate_delay,...,weight_daily_delay,temperature,temperature_delay,urine_output,urine_output_delay,day_urine_output,day_urine_output_delay,ethnicity,diagnosis,gender
0,,,,,191,280149,1.8,73.159244,93.0,240.0,...,15360.0,,,,,,,ASIAN,"ARF,GIB",M
1,,,,,191,280149,2.0,73.159244,81.0,4620.0,...,37500.0,,,,,,,ASIAN,"ARF,GIB",M
2,,,,,191,280149,2.2,73.159244,113.0,540.0,...,101760.0,,,,,,,ASIAN,"ARF,GIB",M
3,,,,,191,280149,2.6,73.159244,92.0,4320.0,...,121380.0,,,,,,,ASIAN,"ARF,GIB",M
4,,,,,191,280149,2.3,73.159244,81.0,2880.0,...,199560.0,,,,,,,ASIAN,"ARF,GIB",M


In [12]:
# Dump to file
merged_df.to_csv('creatinine_measurements_merged.csv')
print(merged_df)

       arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                             NaN                              NaN   
1                             NaN                              NaN   
2                             NaN                              NaN   
3                             NaN                              NaN   
4                             NaN                              NaN   
5                             NaN                              NaN   
6                             NaN                              NaN   
7                             NaN                              NaN   
8                             NaN                              NaN   
9                             NaN                              NaN   
10                            NaN                              NaN   
11                            NaN                              NaN   
12                            NaN                              NaN   
13                  