# imports

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2

# below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
plt.style.use('ggplot')

# database connection

In [2]:
# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

# Connect to postgres with a copy of the MIMIC-III database
con = psycopg2.connect(dbname=dbname, user=sqluser)

# the below statement is prepended to queries to ensure they select from the right schema
query_schema = 'set search_path to ' + schema_name + ';'

# Helper functions

In [3]:
def apply_inclusion_criteria(df):
    print('Initial size of table : ' + str(df.shape[0]))
    df = df.drop_duplicates(['icustay_id','creatinine_time'])
    print('After dropping duplicates : ' + str(df.shape[0]))
    criteria_list = [i for i in df.columns.values if 'inclusion' in i]
    for c in criteria_list:
        df = df.loc[df[c]==1,:].drop(c,axis=1)    
    print('After applying inclusion criteria : ' + str(df.shape[0]))
    return df

# Playground for queries

In [7]:
# Creatinine measurements for all stays
query = query_schema + """
select
    ie.subject_id, ie.icustay_id, ie.intime, ie.outtime,
    le.valuenum as creatinine, le.charttime as creatinine_time
  from icustays ie
  inner join labevents le
    on ie.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between ie.intime and ie.outtime
"""
df = pd.read_sql_query(query, con)
print(df.shape[0])
print(df)

326891
        icustay_id              intime             outtime  creatinine  \
0           294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.5   
1           294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.4   
2           228232 2175-05-30 21:30:54 2175-06-03 13:39:54        10.0   
3           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
4           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         5.1   
5           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.4   
6           229441 2178-04-16 06:19:32 2178-04-17 20:21:05         0.7   
7           229441 2178-04-16 06:19:32 2178-04-17 20:21:05         0.7   
8           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
9           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         7.4   
10          228232 2175-05-30 21:30:54 2175-06-03 13:39:54         6.2   
11          220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
12          220597 2149-11-09 1

In [6]:
# Creatinine measurements, AKI flags + stage for all stays
query = query_schema + """
select
    ie.subject_id, ie.icustay_id, ie.intime, ie.outtime,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    kdigo.aki_7day, kdigo.aki_stage_7day
  from icustays ie
  inner join labevents le
    on ie.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between ie.intime and ie.outtime
  inner join kdigo_stages_7day kdigo
    on ie.icustay_id = kdigo.icustay_id
"""
df = pd.read_sql_query(query, con)
print(df.shape[0])
print(df)

326891
        icustay_id              intime             outtime  creatinine  \
0           228232 2175-05-30 21:30:54 2175-06-03 13:39:54        10.0   
1           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         7.4   
2           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         6.2   
3           294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.5   
4           294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.4   
5           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         5.1   
6           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.4   
7           229441 2178-04-16 06:19:32 2178-04-17 20:21:05         0.7   
8           229441 2178-04-16 06:19:32 2178-04-17 20:21:05         0.7   
9           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
10          220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
11          220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.7   
12          220597 2149-11-09 1

In [21]:
# Creatinine measurements, AKI flags + stage for all stays
# Inclusion flag : patient age > 15
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    kdigo.aki_7day, kdigo.aki_stage_7day,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age
  from icustays icu
  inner join labevents le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between icu.intime and icu.outtime
  inner join kdigo_stages_7day kdigo
    on icu.icustay_id = kdigo.icustay_id
  INNER JOIN patients pat
    ON icu.subject_id = pat.subject_id
)
select
cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.aki_7day, cr.aki_stage_7day,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age
  from cr
"""
print(df)
df = pd.read_sql_query(query, con)
df = apply_inclusion_criteria(df)
print(df)

        icustay_id              intime             outtime  creatinine  \
0           294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.5   
1           294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.4   
2           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         7.4   
3           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         6.2   
4           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.4   
5           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         5.1   
6           228232 2175-05-30 21:30:54 2175-06-03 13:39:54        10.0   
7           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
8           229441 2178-04-16 06:19:32 2178-04-17 20:21:05         0.7   
9           229441 2178-04-16 06:19:32 2178-04-17 20:21:05         0.7   
10          220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
11          220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
12          220597 2149-11-09 13:07:02

Initial size of table : 326891
After applying inclusion criteria : 324752
        icustay_id              intime             outtime  creatinine  \
0           294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.5   
1           294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.4   
2           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         5.1   
3           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
4           228232 2175-05-30 21:30:54 2175-06-03 13:39:54        10.0   
5           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.4   
6           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
7           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.7   
8           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         7.4   
9           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         6.2   
10          229441 2178-04-16 06:19:32 2178-04-17 20:21:05         0.7   
11          229441 2178-04-16 06:19:32

In [27]:
# Creatinine measurements, AKI flags + stage for all stays
# Inclusion flags :
# patient age > 15
# length of stay > 48hours
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    kdigo.aki_7day, kdigo.aki_stage_7day,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age
  from icustays icu
  inner join labevents le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between icu.intime and icu.outtime
  inner join kdigo_stages_7day kdigo
    on icu.icustay_id = kdigo.icustay_id
  INNER JOIN patients pat
    ON icu.subject_id = pat.subject_id
)
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.aki_7day, cr.aki_stage_7day,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay
  from cr
"""
df = pd.read_sql_query(query, con)
print(df.head())
df = apply_inclusion_criteria(df)
print(df.head())

   icustay_id              intime             outtime  creatinine  \
0      228232 2175-05-30 21:30:54 2175-06-03 13:39:54        10.0   
1      294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.5   
2      294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.4   
3      228232 2175-05-30 21:30:54 2175-06-03 13:39:54         7.4   
4      228232 2175-05-30 21:30:54 2175-06-03 13:39:54         6.2   

      creatinine_time  aki_7day  aki_stage_7day        age  inclusion_age  \
0 2175-05-31 01:48:00       1.0             3.0  65.942297              1   
1 2191-03-16 05:42:00       0.0             0.0  47.845047              1   
2 2191-03-17 06:00:00       0.0             0.0  47.845047              1   
3 2175-06-01 03:00:00       1.0             3.0  65.942297              1   
4 2175-06-02 02:06:00       1.0             3.0  65.942297              1   

   length_of_stay  inclusion_length_of_stay  
0       88.150000                         1  
1       40.283333             

In [49]:
# Creatinine measurements, AKI flags + stage for all stays
# Inclusion flags :
# patient age > 15
# length of stay > 48hours
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    kdigo.aki_7day, kdigo.aki_stage_7day,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join labevents le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between icu.intime and icu.outtime
  inner join kdigo_stages_7day kdigo
    on icu.icustay_id = kdigo.icustay_id
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
)
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.aki_7day, cr.aki_stage_7day,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr
"""
df = pd.read_sql_query(query, con)
print(df.head())
df = apply_inclusion_criteria(df)
print(df.head())

   icustay_id              intime             outtime  creatinine  \
0      294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.5   
1      228232 2175-05-30 21:30:54 2175-06-03 13:39:54        10.0   
2      228232 2175-05-30 21:30:54 2175-06-03 13:39:54         5.1   
3      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.4   
4      229441 2178-04-16 06:19:32 2178-04-17 20:21:05         0.7   

      creatinine_time  aki_7day  aki_stage_7day        age  inclusion_age  \
0 2191-03-16 05:42:00       0.0             0.0  47.845047              1   
1 2175-05-31 01:48:00       1.0             3.0  65.942297              1   
2 2175-06-03 00:58:00       1.0             3.0  65.942297              1   
3 2149-11-10 03:00:00       1.0             3.0  41.790228              1   
4 2178-04-16 07:12:00       0.0             0.0  50.148295              1   

   length_of_stay  inclusion_length_of_stay    rrt  inclusion_rrt  
0       40.283333                         0  False    

# Retrieve missing static information

In [7]:
# Creatinine measurements, AKI flags + stage for all stays
# Inclusion flags :
# patient age > 15
# length of stay > 48hours
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    adm.ethnicity, adm.diagnosis,
    pat.gender as gender,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join labevents le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  inner join admissions adm
    on icu.subject_id = adm.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
)
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.ethnicity, cr.diagnosis,
    cr.gender,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr
"""
df = pd.read_sql_query(query, con)
print(df.head())
df = apply_inclusion_criteria(df)
print(df.head())

   subject_id  icustay_id              intime             outtime  creatinine  \
0           4      294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.5   
1           4      294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.4   
2           6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54        10.0   
3           6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54         7.4   
4           6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54         6.2   

      creatinine_time ethnicity                            diagnosis gender  \
0 2191-03-16 05:42:00     WHITE  FEVER,DEHYDRATION,FAILURE TO THRIVE      F   
1 2191-03-17 06:00:00     WHITE  FEVER,DEHYDRATION,FAILURE TO THRIVE      F   
2 2175-05-31 01:48:00     WHITE            CHRONIC RENAL FAILURE/SDA      F   
3 2175-06-01 03:00:00     WHITE            CHRONIC RENAL FAILURE/SDA      F   
4 2175-06-02 02:06:00     WHITE            CHRONIC RENAL FAILURE/SDA      F   

         age  inclusion_age  length_of

In [8]:
# REMOVE COLUMNS THAT WERE USED FOR INCLUSION CRITERIA BUT THAT ARE NOT AVAILABLE FEATURES FOR THE PREDICTION
# REMOVE ICUSTAY_IDS FOR WHICH THERE IS ONLY ONE MEASUREMENT OF CREATININE

print('Number of lines : ' + str(df.shape[0]))
to_remove = ['intime','outtime','length_of_stay','rrt']
for c in to_remove:
    if (c in df.columns.values): df = df.drop(c,axis=1)

# Remove icustay_ids for which there's only one measurement of creatinine
count_mes = df['icustay_id'].value_counts()
to_remove = count_mes.index.values[count_mes==1]
for i in to_remove:
    df = df.loc[df['icustay_id']!=i,:]

print('After dropping unique measurements of creatinine : ' + str(df.shape[0]))
print(df.head())

Number of lines : 279205
After dropping unique measurements of creatinine : 279035
   subject_id  icustay_id  creatinine     creatinine_time  \
2           6      228232        10.0 2175-05-31 01:48:00   
3           6      228232         7.4 2175-06-01 03:00:00   
4           6      228232         6.2 2175-06-02 02:06:00   
5           6      228232         5.1 2175-06-03 00:58:00   
6           9      220597         1.4 2149-11-10 03:00:00   

               ethnicity                  diagnosis gender        age  
2                  WHITE  CHRONIC RENAL FAILURE/SDA      F  65.942297  
3                  WHITE  CHRONIC RENAL FAILURE/SDA      F  65.942297  
4                  WHITE  CHRONIC RENAL FAILURE/SDA      F  65.942297  
5                  WHITE  CHRONIC RENAL FAILURE/SDA      F  65.942297  
6  UNKNOWN/NOT SPECIFIED            HEMORRHAGIC CVA      M  41.790228  


In [9]:
print(df['diagnosis'].value_counts())

SEPSIS                                                            11966
PNEUMONIA                                                         11235
CONGESTIVE HEART FAILURE                                           6719
FEVER                                                              4458
ALTERED MENTAL STATUS                                              4135
CORONARY ARTERY DISEASE                                            3617
ABDOMINAL PAIN                                                     3589
INTRACRANIAL HEMORRHAGE                                            3374
SUBARACHNOID HEMORRHAGE                                            3369
CHEST PAIN                                                         3250
HYPOTENSION                                                        3207
GASTROINTESTINAL BLEED                                             2848
RESPIRATORY FAILURE                                                2543
PANCREATITIS                                                    

In [10]:
# Dump to file
df.to_csv('creatinine_measurements.csv')
print(df)

        subject_id  icustay_id  creatinine     creatinine_time  \
2                6      228232        10.0 2175-05-31 01:48:00   
3                6      228232         7.4 2175-06-01 03:00:00   
4                6      228232         6.2 2175-06-02 02:06:00   
5                6      228232         5.1 2175-06-03 00:58:00   
6                9      220597         1.4 2149-11-10 03:00:00   
9                9      220597         2.0 2149-11-11 03:00:00   
10               9      220597         2.0 2149-11-12 03:00:00   
11               9      220597         1.6 2149-11-13 03:20:00   
12               9      220597         1.7 2149-11-13 16:04:00   
13               9      220597         2.0 2149-11-14 06:00:00   
14              13      263738         0.6 2167-01-10 03:27:00   
15              13      263738         0.6 2167-01-11 01:44:00   
16              13      263738         0.4 2167-01-12 03:27:00   
17              13      263738         0.6 2167-01-08 20:28:00   
18        

# Retrieve interesting lab events for patients that match inclusion criteria

In [None]:
# TODO : ADD EXCLUSION CRITERIA FOR ESRD AND KIDNEY TRANSPLANT

In [126]:
features_info = pd.DataFrame(columns=['name','table','variable','item_id','item_id_2'],
                             data=[['Urea','labevents','valuenum',51006,None],
                                   ['Sodium','labevents','valuenum',50824,None]])
print(features_info)

     name      table  variable  item_id item_id_2
0    Urea  labevents  valuenum    51006      None
1  Sodium  labevents  valuenum    50824      None


In [154]:
features_info = pd.read_csv('features_info.csv')
features_list = ['Urea','SCr']
# Cut the table for test purposes
features_info = features_info.loc[features_info['name'].apply(lambda x:x in features_list),:]

In [155]:
print(features_info)

   name      table  variable  item_id  item_id_2  item_id_3  item_id_4  \
7   SCr  labevents  valuenum    50912    51081.0        NaN        NaN   
8  Urea  labevents  valuenum    51006        NaN        NaN        NaN   

   item_id_5  item_id_6  
7        NaN        NaN  
8        NaN        NaN  


In [156]:
# Creatinine measurements
# + Features from labevents

# Inclusion flags :
# patient age > 15
# length of stay > 48hours


query_inc = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join labevents le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
),
cr_inc as
(
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr),  
  """
    
query_feat = """
cr_feat as
(
select
cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
    cr_inc.creatinine, cr_inc.creatinine_time,
    cr_inc.age, cr_inc.inclusion_age,
    cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
    cr_inc.rrt, cr_inc.inclusion_rrt
    """
    
for i,row in features_info.iterrows():
    query_feat = query_feat  + """,
    le"""+str(i)+""".valuenum as """+row['name']+""", EXTRACT('epoch' from cr_inc.creatinine_time - le"""+str(i)+""".charttime) as """+row['name']+"""_delay, le"""+str(i)+""".charttime as """+row['name']+"""_time"""
    
query_feat = query_feat +"""
    from cr_inc"""
    
for i,row in features_info.iterrows():
    query_feat = query_feat + """
  left join labevents le"""+str(i)+"""
    on cr_inc.subject_id = le"""+str(i)+""".subject_id
    and le"""+str(i)+""".itemid = """+str(row['item_id'])+"""
    -- and le"""+str(i)+""".valuenum is not null
    and le"""+str(i)+""".charttime between cr_inc.intime and cr_inc.creatinine_time
    """

query_feat = query_feat + """
)
"""
    
query_select = """
select 
a.subject_id, a.icustay_id, a.intime, a.outtime,
    a.creatinine, a.creatinine_time,
    a.age, a.inclusion_age,
    a.length_of_stay, a.inclusion_length_of_stay,
    a.rrt, a.inclusion_rrt
    """
    
for i,row in features_info.iterrows():
    query_select = query_select  + """,
    a."""+row['name']+""", a."""+row['name']+"""_delay, a."""+row['name']+"""_time
    """
    
query_end = """
from cr_feat as a 
"""

for i,row in features_info.iterrows():
    query_end = query_end + """
    inner join (
        -- Get the min seenID for each personID
        select creatinine_time, min("""+row['name']+"""_delay) as """+row['name']+"""_delay
        from cr_feat
        group by creatinine_time
    ) as b"""+str(i)+""" on a.creatinine_time = b"""+str(i)+""".creatinine_time
and a."""+row['name']+"""_delay = b"""+str(i)+"""."""+row['name']+"""_delay
"""

query = query_inc + query_feat + query_select + query_end
print(query)

set search_path to mimiciii;
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join labevents le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
),
cr_inc as
(
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 t

In [157]:
df = pd.read_sql_query(query, con)
print(df.head())
df = apply_inclusion_criteria(df)
print(df.head())

   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         2.4   
1          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.1   
2          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.3   
3          13      263738 2167-01-08 18:44:25 2167-01-12 10:43:31         0.5   
4          21      216859 2135-01-30 20:53:34 2135-02-08 05:38:46         4.7   

      creatinine_time        age  inclusion_age  length_of_stay  \
0 2101-10-20 19:26:00  76.526792              1      145.549444   
1 2104-08-13 03:51:00  72.374177              1      183.235556   
2 2104-08-08 04:15:00  72.374177              1      183.235556   
3 2167-01-09 17:09:00  39.866118              1       87.985000   
4 2135-02-01 02:59:00  87.826347              1      200.753333   

   inclusion_length_of_stay    rrt  inclusion_rrt  scr  scr_delay  \
0                         1  False       

In [113]:
# REMOVE COLUMNS THAT WERE USED FOR INCLUSION CRITERIA BUT THAT ARE NOT AVAILABLE FEATURES FOR THE PREDICTION
# REMOVE ICUSTAY_IDS FOR WHICH THERE IS ONLY ONE MEASUREMENT OF CREATININE

print('Number of lines : ' + str(df.shape[0]))
to_remove = ['intime','outtime','length_of_stay','rrt']
for c in to_remove:
    if (c in df.columns.values): df = df.drop(c,axis=1)

# Remove icustay_ids for which there's only one measurement of creatinine
count_mes = df['icustay_id'].value_counts()
to_remove = count_mes.index.values[count_mes==1]
for i in to_remove:
    df = df.loc[df['icustay_id']!=i,:]

print('After dropping unique measurements of creatinine : ' + str(df.shape[0]))
print(df.head())

Number of lines : 109295
After dropping unique measurements of creatinine : 109047
    subject_id  icustay_id  creatinine     creatinine_time        age  urea  \
0            3      211552         1.7 2101-10-22 21:15:00  76.526792  32.0   
5            3      211552         1.6 2101-10-23 03:45:00  76.526792  32.0   
11           3      211552         1.5 2101-10-24 03:15:00  76.526792  32.0   
18           3      211552         1.9 2101-10-22 04:00:00  76.526792  41.0   
22           3      211552         2.5 2101-10-21 03:00:00  76.526792  41.0   

    urea_delay           urea_time  sodium  sodium_delay         sodium_time  
0          0.0 2101-10-22 21:15:00   139.0      170640.0 2101-10-20 21:51:00  
5      23400.0 2101-10-22 21:15:00   139.0      194040.0 2101-10-20 21:51:00  
11    108000.0 2101-10-22 21:15:00   139.0      278640.0 2101-10-20 21:51:00  
18    117240.0 2101-10-20 19:26:00   139.0      108540.0 2101-10-20 21:51:00  
22     27240.0 2101-10-20 19:26:00   139.0     

In [51]:
# Dump to file
df.to_csv('creatinine_measurements.csv')
print(df)

        icustay_id              intime             outtime  creatinine  \
1           228232 2175-05-30 21:30:54 2175-06-03 13:39:54        10.0   
2           228232 2175-05-30 21:30:54 2175-06-03 13:39:54         5.1   
3           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.4   
6           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
7           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
8           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
9           220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.7   
10          220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
11          263738 2167-01-08 18:44:25 2167-01-12 10:43:31         0.6   
12          263738 2167-01-08 18:44:25 2167-01-12 10:43:31         0.6   
13          263738 2167-01-08 18:44:25 2167-01-12 10:43:31         0.4   
14          263738 2167-01-08 18:44:25 2167-01-12 10:43:31         0.6   
15          263738 2167-01-08 18:44:25

# Alternative : retrieve independently each feature and merge tables afterwards

In [12]:
features_info = pd.read_csv('features_info.csv')
#features_list = ['Urea','Temperature']
# Cut the table for test purposes
#features_info = features_info.loc[features_info['name'].apply(lambda x:x in features_list),:].reset_index(drop=True)
print(features_info)

                           name        table  variable  item_id  item_id_2  \
0    Arterial_pressure_systolic  chartevents  valuenum        6       51.0   
1   Arterial_pressure_diastolic  chartevents  valuenum     8364     8368.0   
2                    Heart_rate  chartevents  valuenum      211   220045.0   
3                  Weight_daily  chartevents  valuenum      763   224639.0   
4                   Temperature  chartevents  valuenum   223761      678.0   
5                  Urine_output  chartevents  valuenum    43966    44706.0   
6              day_urine_output  chartevents  valuenum    43372        NaN   
7                           SCr    labevents  valuenum    50912    51081.0   
8                          Urea    labevents  valuenum    51006        NaN   
9                        Sodium    labevents  valuenum    50824    50983.0   
10                    Potassium    labevents  valuenum    50822    50971.0   
11                      Calcium    labevents  valuenum    50808 

In [13]:
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join """+features_info.loc[0,'table']+""" le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le."""+features_info.loc[0,'variable']+""" is not null
    and le.charttime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
),
cr_inc as
(
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr),
cr_feat as
(
select
cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
    cr_inc.creatinine, cr_inc.creatinine_time,
    cr_inc.age, cr_inc.inclusion_age,
    cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
    cr_inc.rrt, cr_inc.inclusion_rrt,
    le.valuenum as """+features_info.loc[0,'name']+""", EXTRACT('epoch' from cr_inc.creatinine_time - le.charttime) as """+features_info.loc[0,'name']+"""_delay, le.charttime as """+features_info.loc[0,'name']+"""_time
  from cr_inc
  inner join """+features_info.loc[0,'table']+""" le
    on cr_inc.subject_id = le.subject_id
    and le.itemid = """+str(features_info.loc[0,'item_id'])+"""
    and le."""+features_info.loc[0,'variable']+""" is not null
    and le.charttime between cr_inc.intime and cr_inc.creatinine_time
)
select 
a.subject_id, a.icustay_id, a.intime, a.outtime,
    a.creatinine, a.creatinine_time,
    a.age, a.inclusion_age,
    a.length_of_stay, a.inclusion_length_of_stay,
    a.rrt, a.inclusion_rrt,
    a."""+features_info.loc[0,'name']+""", a."""+features_info.loc[0,'name']+"""_delay, a."""+features_info.loc[0,'name']+"""_time
from cr_feat as a
    join (
        -- Get the min seenID for each personID
        select creatinine_time, min("""+features_info.loc[0,'name']+"""_delay) as """+features_info.loc[0,'name']+"""_delay
        from cr_feat
        group by creatinine_time
    ) as b on a.creatinine_time = b.creatinine_time
where a."""+features_info.loc[0,'name']+"""_delay = b."""+features_info.loc[0,'name']+"""_delay
"""
last_df = pd.read_sql_query(query, con)
print(last_df.head())
last_df = apply_inclusion_criteria(last_df)
print(last_df.head())
    
for i,row in features_info.loc[1:,:].iterrows():
    print('------------------------------------')
    print('--- Processing feature : ' + row['name'])

    query = query_schema + """
    with cr as
    (
    select
        icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
        le.valuenum as creatinine, le.charttime as creatinine_time,
        EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
        (rrt.icustay_id is null) as rrt
      from icustays icu
      inner join """+row['table']+""" le
        on icu.subject_id = le.subject_id
        and le.itemid = 50912
        and le."""+row['variable']+""" is not null
        and le.charttime between icu.intime and icu.outtime
      inner join patients pat
        ON icu.subject_id = pat.subject_id
      left outer join rrt 
        on icu.icustay_id = rrt.icustay_id
    ),
    cr_inc as
    (
    select
    cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
        cr.creatinine, cr.creatinine_time,
        cr.age, CASE
                    WHEN cr.age >= 15 then 1
                ELSE 0 END
                as inclusion_age,
      cr.length_of_stay, CASE
                            WHEN cr.length_of_stay >= 48 then 1
                         ELSE 0 END
                         as inclusion_length_of_stay,
      cr.rrt, CASE
                WHEN cr.rrt = False then 1
              ELSE 0 END
              as inclusion_rrt
      from cr),
    cr_feat as
    (
    select
    cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
        cr_inc.creatinine, cr_inc.creatinine_time,
        cr_inc.age, cr_inc.inclusion_age,
        cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
        cr_inc.rrt, cr_inc.inclusion_rrt,
        le.valuenum as """+row['name']+""", EXTRACT('epoch' from cr_inc.creatinine_time - le.charttime) as """+row['name']+"""_delay, le.charttime as """+row['name']+"""_time
      from cr_inc
      inner join """+row['table']+""" le
        on cr_inc.subject_id = le.subject_id
        and le.itemid = """+str(row['item_id'])+"""
        and le."""+row['variable']+""" is not null
        and le.charttime between cr_inc.intime and cr_inc.creatinine_time
    )
    select 
    a.subject_id, a.icustay_id, a.intime, a.outtime,
        a.creatinine, a.creatinine_time,
        a.age, a.inclusion_age,
        a.length_of_stay, a.inclusion_length_of_stay,
        a.rrt, a.inclusion_rrt,
        a."""+row['name']+""", a."""+row['name']+"""_delay, a."""+row['name']+"""_time
    from cr_feat as a
        join (
            -- Get the min seenID for each personID
            select creatinine_time, min("""+row['name']+"""_delay) as """+row['name']+"""_delay
            from cr_feat
            group by creatinine_time
        ) as b on a.creatinine_time = b.creatinine_time
    where a."""+row['name']+"""_delay = b."""+row['name']+"""_delay
    """
    df = pd.read_sql_query(query, con)
    df = apply_inclusion_criteria(df)
    print(df.head())
    last_df = pd.merge(last_df,df,on=['subject_id', 'icustay_id', 'intime', 'outtime', 'creatinine',
       'creatinine_time', 'age', 'length_of_stay', 'rrt'],how='outer')
    print('---------------------------------')
    print('Merged table size : ' + str(last_df.shape[0]))
    print(last_df.head())

Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, inclusion_age, length_of_stay, inclusion_length_of_stay, rrt, inclusion_rrt, arterial_pressure_systolic, arterial_pressure_systolic_delay, arterial_pressure_systolic_time]
Index: []
Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, length_of_stay, rrt, arterial_pressure_systolic, arterial_pressure_systolic_delay, arterial_pressure_systolic_time]
Index: []
------------------------------------
--- Processing feature : Arterial_pressure_diastolic
Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, length_of_stay, rrt, arterial_pressure_diastolic, arterial_pressure_diastolic_delay, arterial_pressure_diastoli

  stride //= shape[i]


Initial size of table : 327367
After dropping duplicates : 326653
After applying inclusion criteria : 279205
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         2.4   
1          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.1   
2          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.3   
3          13      263738 2167-01-08 18:44:25 2167-01-12 10:43:31         0.5   
4          21      216859 2135-01-30 20:53:34 2135-02-08 05:38:46         4.7   

      creatinine_time        age  length_of_stay    rrt  scr  scr_delay  \
0 2101-10-20 19:26:00  76.526792      145.549444  False  2.4        0.0   
1 2104-08-13 03:51:00  72.374177      183.235556  False  1.1        0.0   
2 2104-08-08 04:15:00  72.374177      183.235556  False  1.3        0.0   
3 2167-01-09 17:09:00  39.866118       87.985000  False  0.5        0.0   
4 2135-02-01 02:59:00  87.826

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

Initial size of table : 305586
After dropping duplicates : 304878
After applying inclusion criteria : 266501
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         2.4   
1          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.3   
2          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.1   
3          21      216859 2135-01-30 20:53:34 2135-02-08 05:38:46         4.7   
4          21      217847 2134-09-11 20:50:04 2134-09-17 18:28:32         2.3   

      creatinine_time        age  length_of_stay    rrt  phosphor  \
0 2101-10-20 19:26:00  76.526792      145.549444  False       4.7   
1 2104-08-08 04:15:00  72.374177      183.235556  False       6.6   
2 2104-08-13 03:51:00  72.374177      183.235556  False       2.6   
3 2135-02-01 02:59:00  87.826347      200.753333  False       6.1   
4 2134-09-14 03:54:00  87.440295      141.641111  False    

Initial size of table : 125671
After dropping duplicates : 125508
After applying inclusion criteria : 117688
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.7   
1           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.9   
2           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
3           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
4           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.4   

      creatinine_time        age  length_of_stay    rrt  hemoglobine  \
0 2101-10-22 21:15:00  76.526792      145.549444  False         10.1   
1 2101-10-22 04:00:00  76.526792      145.549444  False          8.8   
2 2101-10-25 03:00:00  76.526792      145.549444  False         10.5   
3 2149-11-12 03:00:00  41.790228      127.753333  False         14.0   
4 2149-11-10 03:00:00  41.790228      127.75

Initial size of table : 212890
After dropping duplicates : 212479
After applying inclusion criteria : 199761
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
1           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.9   
2           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.7   
3           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
4           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   

      creatinine_time        age  length_of_stay    rrt  lactate  \
0 2101-10-25 03:00:00  76.526792      145.549444  False      1.0   
1 2101-10-22 04:00:00  76.526792      145.549444  False      1.3   
2 2101-10-22 21:15:00  76.526792      145.549444  False      1.3   
3 2149-11-13 03:20:00  41.790228      127.753333  False      2.6   
4 2149-11-11 03:00:00  41.790228      127.753333  False      2.3

Initial size of table : 92799
After dropping duplicates : 92619
After applying inclusion criteria : 87463
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         2.5   
1           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.9   
2           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
3           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.7   
4           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   

      creatinine_time        age  length_of_stay    rrt  chloride  \
0 2101-10-20 19:59:00  76.526792      145.549444  False     114.0   
1 2101-10-22 04:00:00  76.526792      145.549444  False     111.0   
2 2101-10-26 03:00:00  76.526792      145.549444  False     111.0   
3 2101-10-22 21:15:00  76.526792      145.549444  False     111.0   
4 2101-10-25 03:00:00  76.526792      145.549444  False     11

Initial size of table : 157656
After dropping duplicates : 157332
After applying inclusion criteria : 144999
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
1          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.1   
2          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.1   
3          21      216859 2135-01-30 20:53:34 2135-02-08 05:38:46         4.7   
4          21      217847 2134-09-11 20:50:04 2134-09-17 18:28:32         3.8   

      creatinine_time        age  length_of_stay    rrt  lymphocytes  \
0 2101-10-25 03:00:00  76.526792      145.549444  False          9.4   
1 2104-08-13 03:51:00  72.374177      183.235556  False          4.0   
2 2104-08-14 03:09:00  72.374177      183.235556  False          8.0   
3 2135-02-01 02:59:00  87.826347      200.753333  False          3.0   
4 2134-09-17 03:14:00  87.440295      141.64

Initial size of table : 250384
After dropping duplicates : 249882
After applying inclusion criteria : 230895
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
1           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
2           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.9   
3           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
4          21      216859 2135-01-30 20:53:34 2135-02-08 05:38:46         2.5   

      creatinine_time        age  length_of_stay    rrt  ph_blood  \
0 2101-10-26 03:00:00  76.526792      145.549444  False      7.39   
1 2101-10-25 03:00:00  76.526792      145.549444  False      7.39   
2 2101-10-22 04:00:00  76.526792      145.549444  False      7.42   
3 2149-11-11 03:00:00  41.790228      127.753333  False      7.39   
4 2135-02-02 05:03:00  87.826347      200.753333  False    

Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, length_of_stay, rrt, monocytes, monocytes_delay, monocytes_time]
Index: []
---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                 

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

In [25]:
print(last_df.shape[0])
print(last_df.loc[4,:])

279036
arterial_pressure_systolic                           NaN
arterial_pressure_systolic_delay                     NaN
arterial_pressure_systolic_time                      NaN
arterial_pressure_diastolic                          NaN
arterial_pressure_diastolic_delay                    NaN
arterial_pressure_diastolic_time                     NaN
heart_rate                                           NaN
heart_rate_delay                                     NaN
heart_rate_time                                      NaN
weight_daily                                         NaN
weight_daily_delay                                   NaN
weight_daily_time                                    NaN
temperature                                          NaN
temperature_delay                                    NaN
temperature_time                                     NaN
urine_output                                         NaN
urine_output_delay                                   NaN
urine_output_time       

# TODO (some is implemented below)
- figure out why we find no variables from the chartevents ?
- remove columns that are not features (remark : the delays between creatinine and other measurements might be interesting features!!)
- drop duplicates of creatinine measurements
- dump into .csv file and put it on the drive or slack


- take more exclusion criteria into account : the ones based on icd-9 codes
- add other (static) features !!

In [20]:
print(last_df.columns.values)

['arterial_pressure_systolic' 'arterial_pressure_systolic_delay'
 'arterial_pressure_systolic_time' 'arterial_pressure_diastolic'
 'arterial_pressure_diastolic_delay' 'arterial_pressure_diastolic_time'
 'heart_rate' 'heart_rate_delay' 'heart_rate_time' 'weight_daily'
 'weight_daily_delay' 'weight_daily_time' 'temperature' 'temperature_delay'
 'temperature_time' 'urine_output' 'urine_output_delay' 'urine_output_time'
 'day_urine_output' 'day_urine_output_delay' 'day_urine_output_time'
 'subject_id' 'icustay_id' 'intime' 'outtime' 'creatinine'
 'creatinine_time' 'age' 'length_of_stay' 'rrt' 'scr' 'scr_delay'
 'scr_time' 'urea' 'urea_delay' 'urea_time' 'sodium' 'sodium_delay'
 'sodium_time' 'potassium' 'potassium_delay' 'potassium_time' 'calcium'
 'calcium_delay' 'calcium_time' 'phosphor' 'phosphor_delay' 'phosphor_time'
 'leukocytes' 'leukocytes_delay' 'leukocytes_time' 'hemoglobine'
 'hemoglobine_delay' 'hemoglobine_time' 'bicarbonate' 'bicarbonate_delay'
 'bicarbonate_time' 'lactate' '

In [21]:
# REMOVE COLUMNS THAT WERE USED FOR INCLUSION CRITERIA BUT THAT ARE NOT AVAILABLE FEATURES FOR THE PREDICTION
# REMOVE ICUSTAY_IDS FOR WHICH THERE IS ONLY ONE MEASUREMENT OF CREATININE

print('Number of lines : ' + str(last_df.shape[0]))
to_remove = ['intime','outtime','length_of_stay','rrt']
for c in to_remove:
    if (c in last_df.columns.values): last_df = last_df.drop(c,axis=1)

# Remove icustay_ids for which there's only one measurement of creatinine
count_mes = last_df['icustay_id'].value_counts()
to_remove = count_mes.index.values[count_mes==1]
for i in to_remove:
    last_df = last_df.loc[last_df['icustay_id']!=i,:]

print('After dropping unique measurements of creatinine : ' + str(last_df.shape[0]))
last_df.head()

Number of lines : 279206
After dropping unique measurements of creatinine : 279036
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                            

In [22]:
# Dump to file
last_df.to_csv('creatinine_measurements.csv')
print(last_df)

       arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                             NaN                              NaN   
1                             NaN                              NaN   
2                             NaN                              NaN   
3                             NaN                              NaN   
4                             NaN                              NaN   
5                             NaN                              NaN   
6                             NaN                              NaN   
7                             NaN                              NaN   
8                             NaN                              NaN   
9                             NaN                              NaN   
10                            NaN                              NaN   
11                            NaN                              NaN   
12                            NaN                              NaN   
13                  

# Backup query only to retrieve urea as a feature

In [None]:
# Creatinine measurements
# + Features from labevents

# Inclusion flags :
# patient age > 15
# length of stay > 48hours


# TODO READ FEATURE_INFOS FILE

query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join labevents le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
),
cr_inc as
(
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr),
cr_feat as
(
select
cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
    cr_inc.creatinine, cr_inc.creatinine_time,
    cr_inc.age, cr_inc.inclusion_age,
    cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
    cr_inc.rrt, cr_inc.inclusion_rrt,
    le.valuenum as urea, EXTRACT('epoch' from cr_inc.creatinine_time - le.charttime) as urea_delay, le.charttime as urea_time
  from cr_inc
  inner join labevents le
    on cr_inc.subject_id = le.subject_id
    and le.itemid = 51006
    and le.valuenum is not null
    and le.charttime between cr_inc.intime and cr_inc.creatinine_time
)
select 
a.subject_id, a.icustay_id, a.intime, a.outtime,
    a.creatinine, a.creatinine_time,
    a.age, a.inclusion_age,
    a.length_of_stay, a.inclusion_length_of_stay,
    a.rrt, a.inclusion_rrt,
    a.urea, a.urea_delay, a.urea_time
from cr_feat as a
    join (
        -- Get the min seenID for each personID
        select creatinine_time, min(urea_delay) as urea_delay
        from cr_feat
        group by creatinine_time
    ) as b on a.creatinine_time = b.creatinine_time
where a.urea_delay = b.urea_delay
"""
df = pd.read_sql_query(query, con)
print(df.head())
df = apply_inclusion_criteria(df)
print(df.head())