In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2

# below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
plt.style.use('ggplot')

In [2]:
# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

# Connect to postgres with a copy of the MIMIC-III database
con = psycopg2.connect(dbname=dbname, user=sqluser)

# the below statement is prepended to queries to ensure they select from the right schema
query_schema = 'set search_path to ' + schema_name + ';'

In [3]:
def apply_inclusion_criteria(df):
    print('Initial size of table : ' + str(df.shape[0]))
    df = df.drop_duplicates(['icustay_id','creatinine_time'])
    print('After dropping duplicates : ' + str(df.shape[0]))
    criteria_list = [i for i in df.columns.values if 'inclusion' in i]
    for c in criteria_list:
        df = df.loc[df[c]==1,:].drop(c,axis=1)    
    print('After applying inclusion criteria : ' + str(df.shape[0]))
    return df

# Load file with features to be found in chartevents table

In [4]:
# Read file
features_info = pd.read_csv('features_info.csv')
# Drop lines with no item_id
item_col = [c for c in features_info.columns.values if "item" in c]
features_info = features_info.dropna(axis=0, how='all', subset=item_col).reset_index(drop=True)
features_info.head()

Unnamed: 0,name,table,variable,item_id,item_id_2,item_id_3,item_id_4,item_id_5,item_id_6
0,Arterial_pressure_systolic,chartevents,valuenum,6.0,51.0,455.0,220050.0,220179.0,225309.0
1,Arterial_pressure_diastolic,chartevents,valuenum,8364.0,8368.0,8441.0,220051.0,220180.0,225310.0
2,Heart_rate,chartevents,valuenum,211.0,220045.0,,,,
3,Weight_daily,chartevents,valuenum,763.0,224639.0,,,,
4,Temperature,chartevents,valuenum,223761.0,678.0,645.0,,,


## Remove cell below : test of inclusion criteria with icd-9 codes

In [None]:
# From features_info, retrieve the list of item_id to use with chartevents
item_col = [c for c in features_info.columns.values if "item_id" in c]
item_list = features_info.loc[0,item_col].dropna().astype('int').values
item_str = "(" + str(item_list[0])
for it in item_list[1:]:
    item_str = item_str + "," + str(it)
item_str = item_str + ")"

# List of ICD-9 codes to be excluded
icd_list = ['5856','V420','99681'] # ESRD: 5856 / kidney transplant: V420,99681
icd_str = "'{" + str(icd_list[0])
for icd in icd_list[1:]:
    icd_str = icd_str + "," + str(icd)
icd_str = icd_str + "}'"
print(icd_str)

query = query_schema + """
with cr1 as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    ce.valuenum as creatinine, ce.storetime as creatinine_time,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt,
    diag.icd9_code as diagnosis 
  from icustays icu
  inner join chartevents ce
    on icu.subject_id = ce.subject_id
    and ce.itemid = 220615
    and ce.valuenum is not null
    and ce.storetime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
  inner join diagnoses_icd diag
    on icu.subject_id = diag.subject_id
    and icu.hadm_id = diag.hadm_id
),
cr as
(
select
cr1.subject_id, cr1.icustay_id, cr1.intime, cr1.outtime,
cr1.creatinine, cr1.creatinine_time,
cr1.age,
cr1.length_of_stay,
cr1.rrt,
array_agg(cr1.diagnosis) as diagnoses
from cr1
group by cr1.subject_id, cr1.icustay_id, cr1.intime, cr1.outtime, cr1.creatinine, cr1.creatinine_time, cr1.age,
cr1.length_of_stay,cr1.rrt
),
cr_inc as
(
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt,
  cr.diagnoses, CASE
            WHEN cr.diagnoses && """ + icd_str + """ then 0
          ELSE 1 END
          as inclusion_diagnoses
  from cr
),
cr_feat as
(
select
cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
    cr_inc.creatinine, cr_inc.creatinine_time,
    cr_inc.age, cr_inc.inclusion_age,
    cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
    cr_inc.rrt, cr_inc.inclusion_rrt, cr_inc.diagnoses, cr_inc.inclusion_diagnoses,
    ce.valuenum as """+features_info.loc[0,'name']+""", 
    EXTRACT('epoch' from cr_inc.creatinine_time - ce.storetime) as """+features_info.loc[0,'name']+"""_delay,
    ce.storetime as """+features_info.loc[0,'name']+"""_time,
    ce.itemid as """+features_info.loc[0,'name']+"""_itemid
  from cr_inc
  inner join """+features_info.loc[0,'table']+""" ce
    on cr_inc.subject_id = ce.subject_id
    and ce.itemid in """+ item_str +"""
    and ce."""+features_info.loc[0,'variable']+""" is not null
    and ce.storetime between cr_inc.intime and cr_inc.creatinine_time
)
select 
a.subject_id, a.icustay_id, a.intime, a.outtime,
    a.creatinine, a.creatinine_time,
    a.age, a.inclusion_age,
    a.length_of_stay, a.inclusion_length_of_stay,
    a.rrt, a.inclusion_rrt, a.diagnoses, a.inclusion_diagnoses,
    a."""+features_info.loc[0,'name']+""", a."""+features_info.loc[0,'name']+"""_delay,
    a."""+features_info.loc[0,'name']+"""_time,
    a."""+features_info.loc[0,'name']+"""_itemid
from cr_feat as a
    join (
        select creatinine_time, min("""+features_info.loc[0,'name']+"""_delay) as """+features_info.loc[0,'name']+"""_delay
        from cr_feat
        group by creatinine_time
    ) as b on a.creatinine_time = b.creatinine_time
where a."""+features_info.loc[0,'name']+"""_delay = b."""+features_info.loc[0,'name']+"""_delay
"""
df_chartevents = pd.read_sql_query(query, con)
df_chartevents = apply_inclusion_criteria(df_chartevents)

# Retrieve interesting features for patients that match inclusion criteria

In [6]:
# From features_info, retrieve the list of item_id to use with chartevents
item_col = [c for c in features_info.columns.values if "item_id" in c]
item_list = features_info.loc[0,item_col].dropna().astype('int').values
item_str = "(" + str(item_list[0])
for it in item_list[1:]:
    item_str = item_str + "," + str(it)
item_str = item_str + ")"

# List of ICD-9 codes to be excluded
icd_list = ['5856','V420','99681'] # ESRD: 5856 / kidney transplant: V420,99681
icd_str = "'{" + str(icd_list[0])
for icd in icd_list[1:]:
    icd_str = icd_str + "," + str(icd)
icd_str = icd_str + "}'"
print(icd_str)

query = query_schema + """
with cr1 as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    ce.valuenum as creatinine, ce.storetime as creatinine_time,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt,
    diag.icd9_code as diagnosis 
  from icustays icu
  inner join chartevents ce
    on icu.subject_id = ce.subject_id
    and ce.itemid = 220615
    and ce.valuenum is not null
    and ce.storetime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
  inner join diagnoses_icd diag
    on icu.subject_id = diag.subject_id
    and icu.hadm_id = diag.hadm_id
),
cr as
(
select
cr1.subject_id, cr1.icustay_id, cr1.intime, cr1.outtime,
cr1.creatinine, cr1.creatinine_time,
cr1.age,
cr1.length_of_stay,
cr1.rrt,
array_agg(cr1.diagnosis) as diagnoses
from cr1
group by cr1.subject_id, cr1.icustay_id, cr1.intime, cr1.outtime, cr1.creatinine, cr1.creatinine_time, cr1.age,
cr1.length_of_stay,cr1.rrt
),
cr_inc as
(
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt,
  cr.diagnoses, CASE
            WHEN cr.diagnoses && """ + icd_str + """ then 0
          ELSE 1 END
          as inclusion_diagnoses
  from cr
),
cr_feat as
(
select
cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
    cr_inc.creatinine, cr_inc.creatinine_time,
    cr_inc.age, cr_inc.inclusion_age,
    cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
    cr_inc.rrt, cr_inc.inclusion_rrt, cr_inc.diagnoses, cr_inc.inclusion_diagnoses,
    ce.valuenum as """+features_info.loc[0,'name']+""", 
    EXTRACT('epoch' from cr_inc.creatinine_time - ce.storetime) as """+features_info.loc[0,'name']+"""_delay,
    ce.storetime as """+features_info.loc[0,'name']+"""_time,
    ce.itemid as """+features_info.loc[0,'name']+"""_itemid
  from cr_inc
  inner join """+features_info.loc[0,'table']+""" ce
    on cr_inc.subject_id = ce.subject_id
    and ce.itemid in """+ item_str +"""
    and ce."""+features_info.loc[0,'variable']+""" is not null
    and ce.storetime between cr_inc.intime and cr_inc.creatinine_time
)
select 
a.subject_id, a.icustay_id, a.intime, a.outtime,
    a.creatinine, a.creatinine_time,
    a.age, a.inclusion_age,
    a.length_of_stay, a.inclusion_length_of_stay,
    a.rrt, a.inclusion_rrt, a.diagnoses, a.inclusion_diagnoses,
    a."""+features_info.loc[0,'name']+""", a."""+features_info.loc[0,'name']+"""_delay,
    a."""+features_info.loc[0,'name']+"""_time,
    a."""+features_info.loc[0,'name']+"""_itemid
from cr_feat as a
    join (
        select creatinine_time, min("""+features_info.loc[0,'name']+"""_delay) as """+features_info.loc[0,'name']+"""_delay
        from cr_feat
        group by creatinine_time
    ) as b on a.creatinine_time = b.creatinine_time
where a."""+features_info.loc[0,'name']+"""_delay = b."""+features_info.loc[0,'name']+"""_delay
"""
df_chartevents = pd.read_sql_query(query, con)
df_chartevents = apply_inclusion_criteria(df_chartevents)

# Convert list of diagnoses into str (required to perform the merge)
df_chartevents.loc[:,'diagnoses'] = df_chartevents['diagnoses'].apply(lambda x: ', '.join(sorted(x)))

for i,row in features_info.loc[1:,:].iterrows():
    print('------------------------------------')
    print('--- Processing feature : ' + row['name'])
    # From features_info, retrieve the list of item_id to use with chartevents
    item_col = [c for c in features_info.columns.values if "item_id" in c]
    item_list = features_info.loc[i,item_col].dropna().astype('int').values
    item_str = "(" + str(item_list[0])
    for it in item_list[1:]:
        item_str = item_str + "," + str(it)
    item_str = item_str + ")"

    # List of ICD-9 codes to be excluded
    icd_list = ['5856','V420','99681'] # ESRD: 5856 / kidney transplant: V420,99681
    icd_str = "'{" + str(icd_list[0])
    for icd in icd_list[1:]:
        icd_str = icd_str + "," + str(icd)
    icd_str = icd_str + "}'"

    query = query_schema + """
    with cr1 as
    (
    select
        icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
        ce.valuenum as creatinine, ce.storetime as creatinine_time,
        EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
        (rrt.icustay_id is null) as rrt,
        diag.icd9_code as diagnosis 
      from icustays icu
      inner join chartevents ce
        on icu.subject_id = ce.subject_id
        and ce.itemid = 220615
        and ce.valuenum is not null
        and ce.storetime between icu.intime and icu.outtime
      inner join patients pat
        ON icu.subject_id = pat.subject_id
      left outer join rrt 
        on icu.icustay_id = rrt.icustay_id
      inner join diagnoses_icd diag
        on icu.subject_id = diag.subject_id
        and icu.hadm_id = diag.hadm_id
    ),
    cr as
    (
    select
    cr1.subject_id, cr1.icustay_id, cr1.intime, cr1.outtime,
    cr1.creatinine, cr1.creatinine_time,
    cr1.age,
    cr1.length_of_stay,
    cr1.rrt,
    array_agg(cr1.diagnosis) as diagnoses
    from cr1
    group by cr1.subject_id, cr1.icustay_id, cr1.intime, cr1.outtime, cr1.creatinine, cr1.creatinine_time, cr1.age,
    cr1.length_of_stay,cr1.rrt
    ),
    cr_inc as
    (
    select
    cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
        cr.creatinine, cr.creatinine_time,
        cr.age, CASE
                    WHEN cr.age >= 15 then 1
                ELSE 0 END
                as inclusion_age,
      cr.length_of_stay, CASE
                            WHEN cr.length_of_stay >= 48 then 1
                         ELSE 0 END
                         as inclusion_length_of_stay,
      cr.rrt, CASE
                WHEN cr.rrt = False then 1
              ELSE 0 END
              as inclusion_rrt,
      cr.diagnoses, CASE
                WHEN cr.diagnoses && """ + icd_str + """ then 0
              ELSE 1 END
              as inclusion_diagnoses
      from cr
    ),
    cr_feat as
    (
    select
    cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
        cr_inc.creatinine, cr_inc.creatinine_time,
        cr_inc.age, cr_inc.inclusion_age,
        cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
        cr_inc.rrt, cr_inc.inclusion_rrt, cr_inc.diagnoses, cr_inc.inclusion_diagnoses,
        ce.valuenum as """+features_info.loc[i,'name']+""", 
        EXTRACT('epoch' from cr_inc.creatinine_time - ce.storetime) as """+features_info.loc[i,'name']+"""_delay,
        ce.storetime as """+features_info.loc[i,'name']+"""_time,
        ce.itemid as """+features_info.loc[i,'name']+"""_itemid
      from cr_inc
      inner join """+features_info.loc[i,'table']+""" ce
        on cr_inc.subject_id = ce.subject_id
        and ce.itemid in """+ item_str +"""
        and ce."""+features_info.loc[i,'variable']+""" is not null
        and ce.storetime between cr_inc.intime and cr_inc.creatinine_time
    )
    select 
    a.subject_id, a.icustay_id, a.intime, a.outtime,
        a.creatinine, a.creatinine_time,
        a.age, a.inclusion_age,
        a.length_of_stay, a.inclusion_length_of_stay,
        a.rrt, a.inclusion_rrt, a.diagnoses, a.inclusion_diagnoses,
        a."""+features_info.loc[i,'name']+""", a."""+features_info.loc[i,'name']+"""_delay,
        a."""+features_info.loc[i,'name']+"""_time,
        a."""+features_info.loc[i,'name']+"""_itemid
    from cr_feat as a
        join (
            select creatinine_time, min("""+features_info.loc[i,'name']+"""_delay) as """+features_info.loc[i,'name']+"""_delay
            from cr_feat
            group by creatinine_time
        ) as b on a.creatinine_time = b.creatinine_time
    where a."""+features_info.loc[i,'name']+"""_delay = b."""+features_info.loc[i,'name']+"""_delay
    """
    
    df = pd.read_sql_query(query, con)
    df = apply_inclusion_criteria(df)
    # Convert list of diagnoses into str (required to perform the merge)
    df.loc[:,'diagnoses'] = df['diagnoses'].apply(lambda x: ', '.join(sorted(x)))
    
    df_chartevents = pd.merge(df_chartevents,df,on=['subject_id', 'icustay_id', 'intime', 'outtime', 'creatinine',
       'creatinine_time', 'age', 'length_of_stay', 'rrt', 'diagnoses'],how='outer')
    print('Merged table size : ' + str(df_chartevents.shape[0]))
    print(df_chartevents.head())



'{5856,V420,99681}'
Initial size of table : 208840
After dropping duplicates : 150528
After applying inclusion criteria : 114382
------------------------------------
--- Processing feature : Arterial_pressure_diastolic
Initial size of table : 208914
After dropping duplicates : 150526
After applying inclusion criteria : 114382
Merged table size : 114383
   subject_id  icustay_id              intime             outtime  creatinine  \
0       77815      239231 2166-09-16 16:45:22 2166-09-29 10:07:21         0.6   
1       31558      249349 2150-07-04 03:10:20 2150-07-22 14:13:53         0.6   
2       53763      223136 2194-12-19 10:00:17 2194-12-26 23:51:37         0.8   
3        7533      208809 2114-03-29 09:36:53 2114-04-08 18:39:51         0.8   
4       50772      272254 2110-07-21 12:00:12 2110-07-27 14:20:35         0.8   

      creatinine_time        age  length_of_stay    rrt  \
0 2166-09-21 03:21:00  58.363217      305.366389  False   
1 2150-07-17 01:36:00  52.688716      44

Initial size of table : 145880
After dropping duplicates : 142139
After applying inclusion criteria : 108923
Merged table size : 114691
   subject_id  icustay_id              intime             outtime  creatinine  \
0       77815      239231 2166-09-16 16:45:22 2166-09-29 10:07:21         0.6   
1       31558      249349 2150-07-04 03:10:20 2150-07-22 14:13:53         0.6   
2       53763      223136 2194-12-19 10:00:17 2194-12-26 23:51:37         0.8   
3        7533      208809 2114-03-29 09:36:53 2114-04-08 18:39:51         0.8   
4       50772      272254 2110-07-21 12:00:12 2110-07-27 14:20:35         0.8   

      creatinine_time        age  length_of_stay    rrt  \
0 2166-09-21 03:21:00  58.363217      305.366389  False   
1 2150-07-17 01:36:00  52.688716      443.059167  False   
2 2194-12-21 04:12:00  70.937671      181.855556  False   
3 2114-04-08 07:06:00  66.441977      249.049444  False   
4 2110-07-22 18:14:00  74.513611      146.339722  False   

                      

Initial size of table : 19493
After dropping duplicates : 19487
After applying inclusion criteria : 15083
Merged table size : 114732
   subject_id  icustay_id              intime             outtime  creatinine  \
0       77815      239231 2166-09-16 16:45:22 2166-09-29 10:07:21         0.6   
1       31558      249349 2150-07-04 03:10:20 2150-07-22 14:13:53         0.6   
2       53763      223136 2194-12-19 10:00:17 2194-12-26 23:51:37         0.8   
3        7533      208809 2114-03-29 09:36:53 2114-04-08 18:39:51         0.8   
4       50772      272254 2110-07-21 12:00:12 2110-07-27 14:20:35         0.8   

      creatinine_time        age  length_of_stay    rrt  \
0 2166-09-21 03:21:00  58.363217      305.366389  False   
1 2150-07-17 01:36:00  52.688716      443.059167  False   
2 2194-12-21 04:12:00  70.937671      181.855556  False   
3 2114-04-08 07:06:00  66.441977      249.049444  False   
4 2110-07-22 18:14:00  74.513611      146.339722  False   

                         

Initial size of table : 18364
After dropping duplicates : 18361
After applying inclusion criteria : 14480
Merged table size : 115042
   subject_id  icustay_id              intime             outtime  creatinine  \
0       77815      239231 2166-09-16 16:45:22 2166-09-29 10:07:21         0.6   
1       31558      249349 2150-07-04 03:10:20 2150-07-22 14:13:53         0.6   
2       53763      223136 2194-12-19 10:00:17 2194-12-26 23:51:37         0.8   
3        7533      208809 2114-03-29 09:36:53 2114-04-08 18:39:51         0.8   
4       50772      272254 2110-07-21 12:00:12 2110-07-27 14:20:35         0.8   

      creatinine_time        age  length_of_stay    rrt  \
0 2166-09-21 03:21:00  58.363217      305.366389  False   
1 2150-07-17 01:36:00  52.688716      443.059167  False   
2 2194-12-21 04:12:00  70.937671      181.855556  False   
3 2114-04-08 07:06:00  66.441977      249.049444  False   
4 2110-07-22 18:14:00  74.513611      146.339722  False   

                         

Initial size of table : 19366
After dropping duplicates : 19364
After applying inclusion criteria : 15057
Merged table size : 115049
   subject_id  icustay_id              intime             outtime  creatinine  \
0       77815      239231 2166-09-16 16:45:22 2166-09-29 10:07:21         0.6   
1       31558      249349 2150-07-04 03:10:20 2150-07-22 14:13:53         0.6   
2       53763      223136 2194-12-19 10:00:17 2194-12-26 23:51:37         0.8   
3        7533      208809 2114-03-29 09:36:53 2114-04-08 18:39:51         0.8   
4       50772      272254 2110-07-21 12:00:12 2110-07-27 14:20:35         0.8   

      creatinine_time        age  length_of_stay    rrt  \
0 2166-09-21 03:21:00  58.363217      305.366389  False   
1 2150-07-17 01:36:00  52.688716      443.059167  False   
2 2194-12-21 04:12:00  70.937671      181.855556  False   
3 2114-04-08 07:06:00  66.441977      249.049444  False   
4 2110-07-22 18:14:00  74.513611      146.339722  False   

                         

Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Merged table size : 115342
   subject_id  icustay_id              intime             outtime  creatinine  \
0       77815      239231 2166-09-16 16:45:22 2166-09-29 10:07:21         0.6   
1       31558      249349 2150-07-04 03:10:20 2150-07-22 14:13:53         0.6   
2       53763      223136 2194-12-19 10:00:17 2194-12-26 23:51:37         0.8   
3        7533      208809 2114-03-29 09:36:53 2114-04-08 18:39:51         0.8   
4       50772      272254 2110-07-21 12:00:12 2110-07-27 14:20:35         0.8   

      creatinine_time        age  length_of_stay    rrt  \
0 2166-09-21 03:21:00  58.363217      305.366389  False   
1 2150-07-17 01:36:00  52.688716      443.059167  False   
2 2194-12-21 04:12:00  70.937671      181.855556  False   
3 2114-04-08 07:06:00  66.441977      249.049444  False   
4 2110-07-22 18:14:00  74.513611      146.339722  False   

                                     

Initial size of table : 6255
After dropping duplicates : 6255
After applying inclusion criteria : 5171
Merged table size : 115345
   subject_id  icustay_id              intime             outtime  creatinine  \
0       77815      239231 2166-09-16 16:45:22 2166-09-29 10:07:21         0.6   
1       31558      249349 2150-07-04 03:10:20 2150-07-22 14:13:53         0.6   
2       53763      223136 2194-12-19 10:00:17 2194-12-26 23:51:37         0.8   
3        7533      208809 2114-03-29 09:36:53 2114-04-08 18:39:51         0.8   
4       50772      272254 2110-07-21 12:00:12 2110-07-27 14:20:35         0.8   

      creatinine_time        age  length_of_stay    rrt  \
0 2166-09-21 03:21:00  58.363217      305.366389  False   
1 2150-07-17 01:36:00  52.688716      443.059167  False   
2 2194-12-21 04:12:00  70.937671      181.855556  False   
3 2114-04-08 07:06:00  66.441977      249.049444  False   
4 2110-07-22 18:14:00  74.513611      146.339722  False   

                            

Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Merged table size : 115345
   subject_id  icustay_id              intime             outtime  creatinine  \
0       77815      239231 2166-09-16 16:45:22 2166-09-29 10:07:21         0.6   
1       31558      249349 2150-07-04 03:10:20 2150-07-22 14:13:53         0.6   
2       53763      223136 2194-12-19 10:00:17 2194-12-26 23:51:37         0.8   
3        7533      208809 2114-03-29 09:36:53 2114-04-08 18:39:51         0.8   
4       50772      272254 2110-07-21 12:00:12 2110-07-27 14:20:35         0.8   

      creatinine_time        age  length_of_stay    rrt  \
0 2166-09-21 03:21:00  58.363217      305.366389  False   
1 2150-07-17 01:36:00  52.688716      443.059167  False   
2 2194-12-21 04:12:00  70.937671      181.855556  False   
3 2114-04-08 07:06:00  66.441977      249.049444  False   
4 2110-07-22 18:14:00  74.513611      146.339722  False   

                                     

In [7]:
# Dump to file
df_chartevents.to_csv('creatinine_measurements_1.csv')
df_chartevents.head()

Unnamed: 0,subject_id,icustay_id,intime,outtime,creatinine,creatinine_time,age,length_of_stay,rrt,diagnoses,...,total_protein_urine_time,total_protein_urine_itemid,bilirubin,bilirubin_delay,bilirubin_time,bilirubin_itemid,c_reactive_protein,c_reactive_protein_delay,c_reactive_protein_time,c_reactive_protein_itemid
0,77815,239231,2166-09-16 16:45:22,2166-09-29 10:07:21,0.6,2166-09-21 03:21:00,58.363217,305.366389,False,"04104, 2724, 2761, 3229, 3314, 3485, 4019, 430...",...,,,,,,,,,NaT,
1,31558,249349,2150-07-04 03:10:20,2150-07-22 14:13:53,0.6,2150-07-17 01:36:00,52.688716,443.059167,False,"0389, 4538, 4822, 5109, 5118, 51881, 8052, 807...",...,,,,,,,,,NaT,
2,53763,223136,2194-12-19 10:00:17,2194-12-26 23:51:37,0.8,2194-12-21 04:12:00,70.937671,181.855556,False,"2536, 2724, 29181, 30000, 3051, 34290, 3485, 4...",...,,,,,,,,,NaT,
3,7533,208809,2114-03-29 09:36:53,2114-04-08 18:39:51,0.8,2114-04-08 07:06:00,66.441977,249.049444,False,"0389, 2442, 2724, 2763, 2875, 2930, 34982, 401...",...,,,,,,,,,NaT,
4,50772,272254,2110-07-21 12:00:12,2110-07-27 14:20:35,0.8,2110-07-22 18:14:00,74.513611,146.339722,False,"0414, 25002, 2724, 2760, 2859, 2948, 3004, 427...",...,,,,,,,,,NaT,


# Retrieve missing static information

## !!!! Add missing inclusion criteria here

In [12]:
query = query_schema + """
with cr1 as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    ce.valuenum as creatinine, ce.storetime as creatinine_time,
    adm.ethnicity, adm.diagnosis as diagnosis,
    pat.gender as gender,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt,
    diag.icd9_code as diag 
  from icustays icu
  inner join chartevents ce
    on icu.subject_id = ce.subject_id
    and ce.itemid = 220615
    and ce.valuenum is not null
    and ce.storetime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  inner join admissions adm
    on icu.subject_id = adm.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
  inner join diagnoses_icd diag
    on icu.subject_id = diag.subject_id
    and icu.hadm_id = diag.hadm_id
),
cr as
(
select
cr1.subject_id, cr1.icustay_id, cr1.intime, cr1.outtime,
cr1.creatinine, cr1.creatinine_time,
cr1.ethnicity, cr1.diagnosis,
cr1.gender as gender,
cr1.age,
cr1.length_of_stay,
cr1.rrt,
array_agg(cr1.diag) as diagnoses
from cr1
group by cr1.subject_id, cr1.icustay_id, cr1.intime, cr1.outtime, cr1.creatinine, cr1.creatinine_time,
cr1.ethnicity, cr1.diagnosis, cr1.gender, cr1.age, cr1.length_of_stay, cr1.rrt
)
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.ethnicity, cr.diagnosis, cr.gender,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt,
  cr.diagnoses, CASE
            WHEN cr.diagnoses && """ + icd_str + """ then 0
          ELSE 1 END
          as inclusion_diagnoses
  from cr
"""
df_static = pd.read_sql_query(query, con)
df_static = apply_inclusion_criteria(df_static)

# Convert list of diagnoses into str (required to perform the merge)
df_static.loc[:,'diagnoses'] = df_static['diagnoses'].apply(lambda x: ', '.join(sorted(x)))
print(df_static.head())

Initial size of table : 288243
After dropping duplicates : 152630
After applying inclusion criteria : 115784
   subject_id  icustay_id              intime             outtime  creatinine  \
0       96260      278161 2196-08-31 18:24:20 2196-09-05 15:39:35         1.1   
1       92788      241776 2148-12-05 18:00:11 2148-12-23 19:52:48         5.8   
3       56854      279535 2201-02-02 11:39:44 2201-03-17 14:55:55         1.8   
4       74626      274172 2137-05-06 20:20:31 2137-05-30 15:14:22         1.3   
6       90834      280700 2161-05-16 07:30:53 2161-05-20 19:03:29         0.7   

      creatinine_time               ethnicity                 diagnosis  \
0 2196-08-31 21:23:00  BLACK/AFRICAN AMERICAN  STEVENS JOHNSON SYNDROME   
1 2148-12-09 06:13:00                   WHITE              ENDOCARDITIS   
3 2201-02-04 01:46:00                   WHITE               HYPOTENSION   
4 2137-05-17 05:09:00                   WHITE                   DYSPNEA   
6 2161-05-20 16:27:00        

## Remove cell below : bakup

In [None]:
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    ce.valuenum as creatinine, ce.storetime as creatinine_time,
    adm.ethnicity, adm.diagnosis,
    pat.gender as gender,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join chartevents ce
    on icu.subject_id = ce.subject_id
    and ce.itemid = 220615
    and ce.valuenum is not null
    and ce.storetime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  inner join admissions adm
    on icu.subject_id = adm.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
)
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.ethnicity, cr.diagnosis,
    cr.gender,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr
"""
df_static = pd.read_sql_query(query, con)
print(df_static.head())
df_static = apply_inclusion_criteria(df_static)
print(df_static.head())

In [13]:
# Dump to file
df_static.to_csv('creatinine_measurements_2.csv')
print(df_static)

        subject_id  icustay_id              intime             outtime  \
0            96260      278161 2196-08-31 18:24:20 2196-09-05 15:39:35   
1            92788      241776 2148-12-05 18:00:11 2148-12-23 19:52:48   
3            56854      279535 2201-02-02 11:39:44 2201-03-17 14:55:55   
4            74626      274172 2137-05-06 20:20:31 2137-05-30 15:14:22   
6            90834      280700 2161-05-16 07:30:53 2161-05-20 19:03:29   
7            31779      217318 2105-12-10 04:47:35 2105-12-30 17:02:55   
8            15311      267389 2148-12-28 17:02:35 2148-12-31 13:09:18   
9            73440      203714 2161-11-23 00:55:11 2161-11-25 13:29:43   
10           77484      200282 2164-05-03 02:49:44 2164-05-18 13:47:57   
11           57935      287815 2146-08-19 21:14:03 2146-08-31 18:48:59   
13           90369      235667 2149-11-04 16:38:12 2149-11-11 11:22:02   
14           29968      202124 2124-12-12 18:11:02 2125-01-06 18:40:01   
15           91046      265671 2156-10

# Merge tables into one

In [14]:
print('Chartevents :')
print(df_chartevents.shape)
print(df_chartevents.columns.values)
print('')

print('Static:')
print(df_static.shape)
print(df_static.columns.values)

Chartevents :
(115345, 118)
['subject_id' 'icustay_id' 'intime' 'outtime' 'creatinine'
 'creatinine_time' 'age' 'length_of_stay' 'rrt' 'diagnoses'
 'arterial_pressure_systolic' 'arterial_pressure_systolic_delay'
 'arterial_pressure_systolic_time' 'arterial_pressure_systolic_itemid'
 'arterial_pressure_diastolic' 'arterial_pressure_diastolic_delay'
 'arterial_pressure_diastolic_time' 'arterial_pressure_diastolic_itemid'
 'heart_rate' 'heart_rate_delay' 'heart_rate_time' 'heart_rate_itemid'
 'weight_daily' 'weight_daily_delay' 'weight_daily_time'
 'weight_daily_itemid' 'temperature' 'temperature_delay' 'temperature_time'
 'temperature_itemid' 'urine_output' 'urine_output_delay'
 'urine_output_time' 'urine_output_itemid' 'day_urine_output'
 'day_urine_output_delay' 'day_urine_output_time' 'day_urine_output_itemid'
 'scr' 'scr_delay' 'scr_time' 'scr_itemid' 'sodium' 'sodium_delay'
 'sodium_time' 'sodium_itemid' 'potassium' 'potassium_delay'
 'potassium_time' 'potassium_itemid' 'calcium' 'c

In [15]:
merged_df = pd.merge(df_chartevents,df_static,on=['subject_id', 'icustay_id', 'intime', 'outtime', 'creatinine',
       'creatinine_time', 'age', 'length_of_stay', 'rrt', 'diagnoses'],how='outer')

# Remove columns that are not features (except icustay_id and patient_id that are needed to build the table with labels)

In [17]:
# REMOVE COLUMNS THAT WERE USED FOR INCLUSION CRITERIA BUT THAT ARE NOT AVAILABLE FEATURES FOR THE PREDICTION
# REMOVE ICUSTAY_IDS FOR WHICH THERE IS ONLY ONE MEASUREMENT OF CREATININE

print('Number of lines : ' + str(merged_df.shape[0]))
to_remove = ['intime','outtime','length_of_stay','rrt','diagnoses']
for c in to_remove:
    if (c in merged_df.columns.values): merged_df = merged_df.drop(c,axis=1)
        
# Remove columns with _time suffix
to_remove = [i for i in merged_df.columns.values if '_time' in i]
for c in to_remove:
    if (c in merged_df.columns.values): merged_df = merged_df.drop(c,axis=1)

# Remove columns with _itemid suffix
to_remove = [i for i in merged_df.columns.values if '_itemid' in i]
for c in to_remove:
    if (c in merged_df.columns.values): merged_df = merged_df.drop(c,axis=1)

# Remove icustay_ids for which there's only one measurement of creatinine
count_mes = merged_df['icustay_id'].value_counts()
to_remove = count_mes.index.values[count_mes==1]
for i in to_remove:
    merged_df = merged_df.loc[merged_df['icustay_id']!=i,:]

print('After dropping unique measurements of creatinine : ' + str(merged_df.shape[0]))
merged_df.head()

Number of lines : 119104
After dropping unique measurements of creatinine : 119037


Unnamed: 0,subject_id,icustay_id,creatinine,age,arterial_pressure_systolic,arterial_pressure_systolic_delay,arterial_pressure_diastolic,arterial_pressure_diastolic_delay,heart_rate,heart_rate_delay,...,albumin_delay,total_protein_urine,total_protein_urine_delay,bilirubin,bilirubin_delay,c_reactive_protein,c_reactive_protein_delay,ethnicity,diagnosis,gender
0,77815,239231,0.6,58.363217,155.0,1800.0,75.0,1800.0,58.0,1800.0,...,,,,,,,,WHITE,INTRACRANIAL HEMORRHAGE,M
1,31558,249349,0.6,52.688716,165.0,1440.0,75.0,1440.0,102.0,1440.0,...,256500.0,,,,,,,WHITE,S/P BOATING ACCIDENT,M
2,53763,223136,0.8,70.937671,162.0,600.0,76.0,600.0,101.0,600.0,...,,,,,,,,WHITE,HEAD BLEED,M
3,7533,208809,0.8,66.441977,98.0,3060.0,51.0,3060.0,95.0,3060.0,...,,,,,,,,ASIAN - KOREAN,S/P ARREST,F
4,50772,272254,0.8,74.513611,124.0,4320.0,60.0,4320.0,87.0,4320.0,...,,,,,,,,WHITE,ALTERED MENTAL STATUS,F


**WARNING : the columns named "..._itemid" can be used to filter on the itemid used to retrieve the feature after the query has been done. But these are not features.**

In [18]:
# Dump to file
merged_df.to_csv('creatinine_measurements_merged.csv')
print(merged_df)

        subject_id  icustay_id  creatinine         age  \
0            77815      239231         0.6   58.363217   
1            31558      249349         0.6   52.688716   
2            53763      223136         0.8   70.937671   
3             7533      208809         0.8   66.441977   
4            50772      272254         0.8   74.513611   
5            89500      287743         1.1   68.764969   
6            76602      204448         1.2   81.705970   
7            29495      252299         0.7   63.776649   
8            60897      282836         2.0   67.864024   
9            16275      256649         0.5  309.637886   
10           45995      296994         0.7  300.003783   
11           29755      263813         0.8  300.003425   
12           85141      251157         0.7   72.399244   
13           61587      226920         0.6   85.678008   
14           97569      230777         1.9   58.298071   
15           64230      208889         2.9   71.375963   
16           2