In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2

# below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
plt.style.use('ggplot')

In [2]:
# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

# Connect to postgres with a copy of the MIMIC-III database
con = psycopg2.connect(dbname=dbname, user=sqluser)

# the below statement is prepended to queries to ensure they select from the right schema
query_schema = 'set search_path to ' + schema_name + ';'

In [3]:
def apply_inclusion_criteria(df):
    print('Initial size of table : ' + str(df.shape[0]))
    df = df.drop_duplicates(['icustay_id','creatinine_time'])
    print('After dropping duplicates : ' + str(df.shape[0]))
    criteria_list = [i for i in df.columns.values if 'inclusion' in i]
    for c in criteria_list:
        df = df.loc[df[c]==1,:].drop(c,axis=1)    
    print('After applying inclusion criteria : ' + str(df.shape[0]))
    return df

# Load file with features to be found in chartevents table

In [4]:
# Read file
features_info = pd.read_csv('features_info.csv')
# Drop lines with no item_id
item_col = [c for c in features_info.columns.values if "item" in c]
features_info = features_info.dropna(axis=0, how='all', subset=item_col).reset_index(drop=True)
features_info.head()

Unnamed: 0,name,table,variable,item_id,item_id_2,item_id_3,item_id_4,item_id_5,item_id_6
0,Arterial_pressure_systolic,chartevents,valuenum,6.0,51.0,455.0,220050.0,220179.0,225309.0
1,Arterial_pressure_diastolic,chartevents,valuenum,8364.0,8368.0,8441.0,220051.0,220180.0,225310.0
2,Heart_rate,chartevents,valuenum,211.0,220045.0,,,,
3,Weight_daily,chartevents,valuenum,763.0,224639.0,,,,
4,Temperature,chartevents,valuenum,223761.0,678.0,645.0,,,


# Retrieve interesting features for patients that match inclusion criteria

In [5]:
item_col = [c for c in features_info.columns.values if "item_id" in c]
item_list = features_info.loc[0,item_col].dropna().astype('int').values
item_str = "(" + str(item_list[0])
for i in item_list[1:]:
    item_str = item_str + "," + str(i)
item_str = item_str + ")"

query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    ce.valuenum as creatinine, ce.storetime as creatinine_time,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join chartevents ce
    on icu.subject_id = ce.subject_id
    and ce.itemid = 220615
    and ce.valuenum is not null
    and ce.storetime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
),
cr_inc as
(
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr
),
cr_feat as
(
select
cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
    cr_inc.creatinine, cr_inc.creatinine_time,
    cr_inc.age, cr_inc.inclusion_age,
    cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
    cr_inc.rrt, cr_inc.inclusion_rrt,
    ce.valuenum as """+features_info.loc[0,'name']+""", 
    EXTRACT('epoch' from cr_inc.creatinine_time - ce.storetime) as """+features_info.loc[0,'name']+"""_delay,
    ce.storetime as """+features_info.loc[0,'name']+"""_time,
    ce.itemid as """+features_info.loc[0,'name']+"""_itemid
  from cr_inc
  inner join """+features_info.loc[0,'table']+""" ce
    on cr_inc.subject_id = ce.subject_id
    and ce.itemid in """+ item_str +"""
    and ce."""+features_info.loc[0,'variable']+""" is not null
    and ce.storetime between cr_inc.intime and cr_inc.creatinine_time
)
select 
a.subject_id, a.icustay_id, a.intime, a.outtime,
    a.creatinine, a.creatinine_time,
    a.age, a.inclusion_age,
    a.length_of_stay, a.inclusion_length_of_stay,
    a.rrt, a.inclusion_rrt,
    a."""+features_info.loc[0,'name']+""", a."""+features_info.loc[0,'name']+"""_delay,
    a."""+features_info.loc[0,'name']+"""_time,
    a."""+features_info.loc[0,'name']+"""_itemid
from cr_feat as a
    join (
        select creatinine_time, min("""+features_info.loc[0,'name']+"""_delay) as """+features_info.loc[0,'name']+"""_delay
        from cr_feat
        group by creatinine_time
    ) as b on a.creatinine_time = b.creatinine_time
where a."""+features_info.loc[0,'name']+"""_delay = b."""+features_info.loc[0,'name']+"""_delay
"""
df_chartevents = pd.read_sql_query(query, con)
df_chartevents = apply_inclusion_criteria(df_chartevents)

for i,row in features_info.loc[1:,:].iterrows():
    print('------------------------------------')
    print('--- Processing feature : ' + row['name'])
    item_col = [c for c in features_info.columns.values if "item_id" in c]
    item_list = features_info.loc[i,item_col].dropna().astype('int').values
    item_str = "(" + str(item_list[0])
    for it in item_list[1:]:
        item_str = item_str + "," + str(it)
    item_str = item_str + ")"
    item_name = features_info.loc[i,'name']
    
    query = query_schema + """
    with cr as
    (
    select
        icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
        ce.valuenum as creatinine, ce.storetime as creatinine_time,
        EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
        (rrt.icustay_id is null) as rrt
      from icustays icu
      inner join chartevents ce
        on icu.subject_id = ce.subject_id
        and ce.itemid = 220615
        and ce.valuenum is not null
        and ce.storetime between icu.intime and icu.outtime
      inner join patients pat
        ON icu.subject_id = pat.subject_id
      left outer join rrt 
        on icu.icustay_id = rrt.icustay_id
    ),
    cr_inc as
    (
    select
    cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
        cr.creatinine, cr.creatinine_time,
        cr.age, CASE
                    WHEN cr.age >= 15 then 1
                ELSE 0 END
                as inclusion_age,
      cr.length_of_stay, CASE
                            WHEN cr.length_of_stay >= 48 then 1
                         ELSE 0 END
                         as inclusion_length_of_stay,
      cr.rrt, CASE
                WHEN cr.rrt = False then 1
              ELSE 0 END
              as inclusion_rrt
      from cr
    ),
    cr_feat as
    (
    select
    cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
        cr_inc.creatinine, cr_inc.creatinine_time,
        cr_inc.age, cr_inc.inclusion_age,
        cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
        cr_inc.rrt, cr_inc.inclusion_rrt,
        ce.valuenum as """+item_name+""", 
        EXTRACT('epoch' from cr_inc.creatinine_time - ce.storetime) as """+item_name+"""_delay,
        ce.storetime as """+item_name+"""_time,
        ce.itemid as """+item_name+"""_itemid
      from cr_inc
      inner join """+features_info.loc[i,'table']+""" ce
        on cr_inc.subject_id = ce.subject_id
        and ce.itemid in """+ item_str +"""
        and ce."""+features_info.loc[i,'variable']+""" is not null
        and ce.storetime between cr_inc.intime and cr_inc.creatinine_time
    )
    select 
    a.subject_id, a.icustay_id, a.intime, a.outtime,
        a.creatinine, a.creatinine_time,
        a.age, a.inclusion_age,
        a.length_of_stay, a.inclusion_length_of_stay,
        a.rrt, a.inclusion_rrt,
        a."""+item_name+""", a."""+item_name+"""_delay,
        a."""+item_name+"""_time,
        a."""+item_name+"""_itemid
    from cr_feat as a
        join (
            select creatinine_time, 
            min("""+item_name+"""_delay) as """+item_name+"""_delay
            from cr_feat
            group by creatinine_time
        ) as b on a.creatinine_time = b.creatinine_time
    where a."""+item_name+"""_delay = b."""+item_name+"""_delay
    """
    
    df = pd.read_sql_query(query, con)
    df = apply_inclusion_criteria(df)
    df_chartevents = pd.merge(df_chartevents,df,on=['subject_id', 'icustay_id', 'intime', 'outtime', 'creatinine',
       'creatinine_time', 'age', 'length_of_stay', 'rrt'],how='outer')
    print('Merged table size : ' + str(df_chartevents.shape[0]))
    print(df_chartevents.head())



Initial size of table : 209249
After dropping duplicates : 150528
After applying inclusion criteria : 125376
------------------------------------
--- Processing feature : Arterial_pressure_diastolic
Initial size of table : 209323
After dropping duplicates : 150526
After applying inclusion criteria : 125376
Merged table size : 125379
   subject_id  icustay_id              intime             outtime  creatinine  \
0          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
1          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.8   
2          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
3         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   
4         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   

      creatinine_time        age  length_of_stay    rrt  \
0 2134-05-12 10:01:00  72.733384      104.359167  False   
1 2134-05-15 04:58:00  72.733384      104.359167  False   
2

Initial size of table : 146164
After dropping duplicates : 142139
After applying inclusion criteria : 119607
Merged table size : 125712
   subject_id  icustay_id              intime             outtime  creatinine  \
0          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
1          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.8   
2          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
3         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   
4         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   

      creatinine_time        age  length_of_stay    rrt  \
0 2134-05-12 10:01:00  72.733384      104.359167  False   
1 2134-05-15 04:58:00  72.733384      104.359167  False   
2 2134-05-14 04:08:00  72.733384      104.359167  False   
3 2141-06-14 06:24:00  23.845640      100.895278  False   
4 2141-06-11 14:08:00  23.845640      100.895278  False   

   arterial_pressure_s

Initial size of table : 52215
After dropping duplicates : 52044
After applying inclusion criteria : 47237
Merged table size : 126086
   subject_id  icustay_id              intime             outtime  creatinine  \
0          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
1          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.8   
2          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
3         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   
4         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   

      creatinine_time        age  length_of_stay    rrt  \
0 2134-05-12 10:01:00  72.733384      104.359167  False   
1 2134-05-15 04:58:00  72.733384      104.359167  False   
2 2134-05-14 04:08:00  72.733384      104.359167  False   
3 2141-06-14 06:24:00  23.845640      100.895278  False   
4 2141-06-11 14:08:00  23.845640      100.895278  False   

   arterial_pressure_syst

Initial size of table : 19377
After dropping duplicates : 19348
After applying inclusion criteria : 16571
Merged table size : 126095
   subject_id  icustay_id              intime             outtime  creatinine  \
0          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
1          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.8   
2          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
3         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   
4         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   

      creatinine_time        age  length_of_stay    rrt  \
0 2134-05-12 10:01:00  72.733384      104.359167  False   
1 2134-05-15 04:58:00  72.733384      104.359167  False   
2 2134-05-14 04:08:00  72.733384      104.359167  False   
3 2141-06-14 06:24:00  23.845640      100.895278  False   
4 2141-06-11 14:08:00  23.845640      100.895278  False   

   arterial_pressure_syst

Initial size of table : 110084
After dropping duplicates : 109719
After applying inclusion criteria : 100516
Merged table size : 126398
   subject_id  icustay_id              intime             outtime  creatinine  \
0          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
1          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.8   
2          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
3         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   
4         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   

      creatinine_time        age  length_of_stay    rrt  \
0 2134-05-12 10:01:00  72.733384      104.359167  False   
1 2134-05-15 04:58:00  72.733384      104.359167  False   
2 2134-05-14 04:08:00  72.733384      104.359167  False   
3 2141-06-14 06:24:00  23.845640      100.895278  False   
4 2141-06-11 14:08:00  23.845640      100.895278  False   

   arterial_pressure_s

Merged table size : 126400
   subject_id  icustay_id              intime             outtime  creatinine  \
0          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
1          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.8   
2          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
3         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   
4         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   

      creatinine_time        age  length_of_stay    rrt  \
0 2134-05-12 10:01:00  72.733384      104.359167  False   
1 2134-05-15 04:58:00  72.733384      104.359167  False   
2 2134-05-14 04:08:00  72.733384      104.359167  False   
3 2141-06-14 06:24:00  23.845640      100.895278  False   
4 2141-06-11 14:08:00  23.845640      100.895278  False   

   arterial_pressure_systolic          ...            urinary_sodium_time  \
0                       139.0          ...            

Initial size of table : 11261
After dropping duplicates : 11251
After applying inclusion criteria : 10570
Merged table size : 126403
   subject_id  icustay_id              intime             outtime  creatinine  \
0          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
1          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.8   
2          36      241249 2134-05-12 06:52:47 2134-05-16 15:14:20         0.9   
3         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   
4         109      253139 2141-06-11 10:17:54 2141-06-15 15:11:37         7.4   

      creatinine_time        age  length_of_stay    rrt  \
0 2134-05-12 10:01:00  72.733384      104.359167  False   
1 2134-05-15 04:58:00  72.733384      104.359167  False   
2 2134-05-14 04:08:00  72.733384      104.359167  False   
3 2141-06-14 06:24:00  23.845640      100.895278  False   
4 2141-06-11 14:08:00  23.845640      100.895278  False   

   arterial_pressure_syst

In [6]:
print(df_chartevents['arterial_pressure_diastolic_itemid'].value_counts(dropna=False))

 220180.0    59266
 220051.0    48895
 8368.0       7944
 8441.0       6109
 225310.0     3162
NaN           1027
Name: arterial_pressure_diastolic_itemid, dtype: int64


In [7]:
# Dump to file
df_chartevents.to_csv('creatinine_measurements_1.csv')
df_chartevents.head()

Unnamed: 0,subject_id,icustay_id,intime,outtime,creatinine,creatinine_time,age,length_of_stay,rrt,arterial_pressure_systolic,...,total_protein_urine_time,total_protein_urine_itemid,bilirubin,bilirubin_delay,bilirubin_time,bilirubin_itemid,c_reactive_protein,c_reactive_protein_delay,c_reactive_protein_time,c_reactive_protein_itemid
0,36,241249,2134-05-12 06:52:47,2134-05-16 15:14:20,0.9,2134-05-12 10:01:00,72.733384,104.359167,False,139.0,...,,,,,,,,,NaT,
1,36,241249,2134-05-12 06:52:47,2134-05-16 15:14:20,0.8,2134-05-15 04:58:00,72.733384,104.359167,False,154.0,...,,,,,,,,,NaT,
2,36,241249,2134-05-12 06:52:47,2134-05-16 15:14:20,0.9,2134-05-14 04:08:00,72.733384,104.359167,False,115.0,...,,,,,,,,,NaT,
3,109,253139,2141-06-11 10:17:54,2141-06-15 15:11:37,7.4,2141-06-14 06:24:00,23.84564,100.895278,False,183.0,...,,,,,,,,,NaT,
4,109,253139,2141-06-11 10:17:54,2141-06-15 15:11:37,7.4,2141-06-11 14:08:00,23.84564,100.895278,False,252.0,...,,,,,,,,,NaT,


# Retrieve missing static information

In [8]:
# Creatinine measurements, AKI flags + stage for all stays
# Inclusion flags :
# patient age > 15
# length of stay > 48hours
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    ce.valuenum as creatinine, ce.storetime as creatinine_time,
    adm.ethnicity, adm.diagnosis,
    pat.gender as gender,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join chartevents ce
    on icu.subject_id = ce.subject_id
    and ce.itemid = 220615
    and ce.valuenum is not null
    and ce.storetime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  inner join admissions adm
    on icu.subject_id = adm.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
)
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.ethnicity, cr.diagnosis,
    cr.gender,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr
"""
df_static = pd.read_sql_query(query, con)
print(df_static.head())
df_static = apply_inclusion_criteria(df_static)
print(df_static.head())

   subject_id  icustay_id              intime             outtime  creatinine  \
0         494      268296 2171-12-20 22:10:57 2171-12-21 22:37:41         1.0   
1         494      268296 2171-12-20 22:10:57 2171-12-21 22:37:41         1.0   
2         199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50         1.1   
3         199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50         1.1   
4         199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50         1.1   

      creatinine_time        ethnicity       diagnosis gender        age  \
0 2171-12-21 02:16:00  WHITE - RUSSIAN      CHEST PAIN      F  61.997044   
1 2171-12-21 02:16:00  WHITE - RUSSIAN          ANEMIA      F  61.997044   
2 2174-04-17 02:12:00            WHITE  CARDIOMYOPATHY      M  48.500607   
3 2174-04-14 03:11:00            WHITE  CARDIOMYOPATHY      M  48.500607   
4 2174-04-15 03:29:00            WHITE  CARDIOMYOPATHY      M  48.500607   

   inclusion_age  length_of_stay  inclusion_length_of_st

In [9]:
# Dump to file
df_static.to_csv('creatinine_measurements_2.csv')
print(df_static)

        subject_id  icustay_id              intime             outtime  \
2              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
3              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
4              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
5              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
6              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
7              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
8              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
9              199      235675 2174-04-12 11:00:25 2174-04-17 17:02:50   
18             209      229904 2127-08-11 20:43:43 2127-08-15 20:53:22   
19             209      229904 2127-08-11 20:43:43 2127-08-15 20:53:22   
20             209      229904 2127-08-11 20:43:43 2127-08-15 20:53:22   
21             209      229904 2127-08-11 20:43:43 2127-08-15 20:53:22   
22             209      229904 2127-08

# Merge tables into one

In [10]:
print('Chartevents :')
print(df_chartevents.shape)
print(df_chartevents.columns.values)
print('')

print('Static:')
print(df_static.shape)
print(df_static.columns.values)

Chartevents :
(126403, 117)
['subject_id' 'icustay_id' 'intime' 'outtime' 'creatinine'
 'creatinine_time' 'age' 'length_of_stay' 'rrt'
 'arterial_pressure_systolic' 'arterial_pressure_systolic_delay'
 'arterial_pressure_systolic_time' 'arterial_pressure_systolic_itemid'
 'arterial_pressure_diastolic' 'arterial_pressure_diastolic_delay'
 'arterial_pressure_diastolic_time' 'arterial_pressure_diastolic_itemid'
 'heart_rate' 'heart_rate_delay' 'heart_rate_time' 'heart_rate_itemid'
 'weight_daily' 'weight_daily_delay' 'weight_daily_time'
 'weight_daily_itemid' 'temperature' 'temperature_delay' 'temperature_time'
 'temperature_itemid' 'urine_output' 'urine_output_delay'
 'urine_output_time' 'urine_output_itemid' 'day_urine_output'
 'day_urine_output_delay' 'day_urine_output_time' 'day_urine_output_itemid'
 'scr' 'scr_delay' 'scr_time' 'scr_itemid' 'sodium' 'sodium_delay'
 'sodium_time' 'sodium_itemid' 'potassium' 'potassium_delay'
 'potassium_time' 'potassium_itemid' 'calcium' 'calcium_delay

In [11]:
merged_df = pd.merge(df_chartevents,df_static,on=['subject_id', 'icustay_id', 'intime', 'outtime', 'creatinine',
       'creatinine_time', 'age', 'length_of_stay', 'rrt'],how='outer')

# Remove columns that are not features (except icustay_id and patient_id that are needed to build the table with labels)

In [12]:
# REMOVE COLUMNS THAT WERE USED FOR INCLUSION CRITERIA BUT THAT ARE NOT AVAILABLE FEATURES FOR THE PREDICTION
# REMOVE ICUSTAY_IDS FOR WHICH THERE IS ONLY ONE MEASUREMENT OF CREATININE

print('Number of lines : ' + str(merged_df.shape[0]))
to_remove = ['intime','outtime','length_of_stay','rrt']
for c in to_remove:
    if (c in merged_df.columns.values): merged_df = merged_df.drop(c,axis=1)
        
# Remove columns with _time suffix
to_remove = [i for i in merged_df.columns.values if '_time' in i]
for c in to_remove:
    if (c in merged_df.columns.values): merged_df = merged_df.drop(c,axis=1)

# Remove icustay_ids for which there's only one measurement of creatinine
count_mes = merged_df['icustay_id'].value_counts()
to_remove = count_mes.index.values[count_mes==1]
for i in to_remove:
    merged_df = merged_df.loc[merged_df['icustay_id']!=i,:]

print('After dropping unique measurements of creatinine : ' + str(merged_df.shape[0]))
merged_df.head()

Number of lines : 126888
After dropping unique measurements of creatinine : 126819


Unnamed: 0,subject_id,icustay_id,creatinine,age,arterial_pressure_systolic,arterial_pressure_systolic_delay,arterial_pressure_systolic_itemid,arterial_pressure_diastolic,arterial_pressure_diastolic_delay,arterial_pressure_diastolic_itemid,...,total_protein_urine_itemid,bilirubin,bilirubin_delay,bilirubin_itemid,c_reactive_protein,c_reactive_protein_delay,c_reactive_protein_itemid,ethnicity,diagnosis,gender
0,36,241249,0.9,72.733384,139.0,3060.0,220179.0,84.0,3060.0,220180.0,...,,,,,,,,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,M
1,36,241249,0.8,72.733384,154.0,1800.0,220179.0,79.0,1800.0,220180.0,...,,,,,,,,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,M
2,36,241249,0.9,72.733384,115.0,6960.0,220179.0,69.0,6960.0,220180.0,...,,,,,,,,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,M
3,109,253139,7.4,23.84564,183.0,1140.0,220179.0,96.0,1140.0,220180.0,...,,,,,,,,BLACK/AFRICAN AMERICAN,HYPERTENSIVE EMERGENCY,F
4,109,253139,7.4,23.84564,252.0,1200.0,220179.0,116.0,1200.0,220180.0,...,,,,,,,,BLACK/AFRICAN AMERICAN,HYPERTENSIVE EMERGENCY,F


**WARNING : the columns named "..._itemid" can be used to filter on the itemid used to retrieve the feature after the query has been done. But these are not features.**

In [13]:
# Dump to file
merged_df.to_csv('creatinine_measurements_merged.csv')
print(merged_df)

        subject_id  icustay_id  creatinine         age  \
0               36      241249         0.9   72.733384   
1               36      241249         0.8   72.733384   
2               36      241249         0.9   72.733384   
3              109      253139         7.4   23.845640   
4              109      253139         7.4   23.845640   
5              109      240251         7.1   24.785584   
6              109      262652         8.5   24.123360   
7              109      214027         5.2   24.930443   
8              109      222630         4.9   24.365106   
9              109      253139         7.3   23.845640   
10             109      236124         6.8   24.186610   
11             109      236124         6.7   24.186610   
12             109      222630         5.8   24.365106   
13             109      222630         6.2   24.365106   
14             109      222630         5.5   24.365106   
15             124      256064         1.8   75.140716   
16            