In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2

# below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
plt.style.use('ggplot')

In [2]:
# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'

# Connect to postgres with a copy of the MIMIC-III database
con = psycopg2.connect(dbname=dbname, user=sqluser)

# the below statement is prepended to queries to ensure they select from the right schema
query_schema = 'set search_path to ' + schema_name + ';'

In [3]:
def apply_inclusion_criteria(df):
    print('Initial size of table : ' + str(df.shape[0]))
    df = df.drop_duplicates(['icustay_id','creatinine_time'])
    print('After dropping duplicates : ' + str(df.shape[0]))
    criteria_list = [i for i in df.columns.values if 'inclusion' in i]
    for c in criteria_list:
        df = df.loc[df[c]==1,:].drop(c,axis=1)    
    print('After applying inclusion criteria : ' + str(df.shape[0]))
    return df

# Retrieve interesting lab events for patients that match inclusion criteria

In [4]:
features_info = pd.read_csv('features_info.csv')
#features_list = ['Urea','Temperature']
# Cut the table for test purposes
#features_info = features_info.loc[features_info['name'].apply(lambda x:x in features_list),:].reset_index(drop=True)
print(features_info)

                           name        table  variable  item_id  item_id_2  \
0    Arterial_pressure_systolic  chartevents  valuenum        6       51.0   
1   Arterial_pressure_diastolic  chartevents  valuenum     8364     8368.0   
2                    Heart_rate  chartevents  valuenum      211   220045.0   
3                  Weight_daily  chartevents  valuenum      763   224639.0   
4                   Temperature  chartevents  valuenum   223761      678.0   
5                  Urine_output  chartevents  valuenum    43966    44706.0   
6              day_urine_output  chartevents  valuenum    43372        NaN   
7                           SCr    labevents  valuenum    50912    51081.0   
8                          Urea    labevents  valuenum    51006        NaN   
9                        Sodium    labevents  valuenum    50824    50983.0   
10                    Potassium    labevents  valuenum    50822    50971.0   
11                      Calcium    labevents  valuenum    50808 

In [5]:
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join """+features_info.loc[0,'table']+""" le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le."""+features_info.loc[0,'variable']+""" is not null
    and le.charttime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
),
cr_inc as
(
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr),
cr_feat as
(
select
cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
    cr_inc.creatinine, cr_inc.creatinine_time,
    cr_inc.age, cr_inc.inclusion_age,
    cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
    cr_inc.rrt, cr_inc.inclusion_rrt,
    le.valuenum as """+features_info.loc[0,'name']+""", EXTRACT('epoch' from cr_inc.creatinine_time - le.charttime) as """+features_info.loc[0,'name']+"""_delay, le.charttime as """+features_info.loc[0,'name']+"""_time
  from cr_inc
  inner join """+features_info.loc[0,'table']+""" le
    on cr_inc.subject_id = le.subject_id
    and le.itemid = """+str(features_info.loc[0,'item_id'])+"""
    and le."""+features_info.loc[0,'variable']+""" is not null
    and le.charttime between cr_inc.intime and cr_inc.creatinine_time
)
select 
a.subject_id, a.icustay_id, a.intime, a.outtime,
    a.creatinine, a.creatinine_time,
    a.age, a.inclusion_age,
    a.length_of_stay, a.inclusion_length_of_stay,
    a.rrt, a.inclusion_rrt,
    a."""+features_info.loc[0,'name']+""", a."""+features_info.loc[0,'name']+"""_delay, a."""+features_info.loc[0,'name']+"""_time
from cr_feat as a
    join (
        -- Get the min seenID for each personID
        select creatinine_time, min("""+features_info.loc[0,'name']+"""_delay) as """+features_info.loc[0,'name']+"""_delay
        from cr_feat
        group by creatinine_time
    ) as b on a.creatinine_time = b.creatinine_time
where a."""+features_info.loc[0,'name']+"""_delay = b."""+features_info.loc[0,'name']+"""_delay
"""
df_labevents = pd.read_sql_query(query, con)
print(df_labevents.head())
df_labevents = apply_inclusion_criteria(df_labevents)
print(df_labevents.head())
    
for i,row in features_info.loc[1:,:].iterrows():
    print('------------------------------------')
    print('--- Processing feature : ' + row['name'])

    query = query_schema + """
    with cr as
    (
    select
        icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
        le.valuenum as creatinine, le.charttime as creatinine_time,
        EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
        (rrt.icustay_id is null) as rrt
      from icustays icu
      inner join """+row['table']+""" le
        on icu.subject_id = le.subject_id
        and le.itemid = 50912
        and le."""+row['variable']+""" is not null
        and le.charttime between icu.intime and icu.outtime
      inner join patients pat
        ON icu.subject_id = pat.subject_id
      left outer join rrt 
        on icu.icustay_id = rrt.icustay_id
    ),
    cr_inc as
    (
    select
    cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
        cr.creatinine, cr.creatinine_time,
        cr.age, CASE
                    WHEN cr.age >= 15 then 1
                ELSE 0 END
                as inclusion_age,
      cr.length_of_stay, CASE
                            WHEN cr.length_of_stay >= 48 then 1
                         ELSE 0 END
                         as inclusion_length_of_stay,
      cr.rrt, CASE
                WHEN cr.rrt = False then 1
              ELSE 0 END
              as inclusion_rrt
      from cr),
    cr_feat as
    (
    select
    cr_inc.subject_id, cr_inc.icustay_id, cr_inc.intime, cr_inc.outtime,
        cr_inc.creatinine, cr_inc.creatinine_time,
        cr_inc.age, cr_inc.inclusion_age,
        cr_inc.length_of_stay, cr_inc.inclusion_length_of_stay,
        cr_inc.rrt, cr_inc.inclusion_rrt,
        le.valuenum as """+row['name']+""", EXTRACT('epoch' from cr_inc.creatinine_time - le.charttime) as """+row['name']+"""_delay, le.charttime as """+row['name']+"""_time
      from cr_inc
      inner join """+row['table']+""" le
        on cr_inc.subject_id = le.subject_id
        and le.itemid = """+str(row['item_id'])+"""
        and le."""+row['variable']+""" is not null
        and le.charttime between cr_inc.intime and cr_inc.creatinine_time
    )
    select 
    a.subject_id, a.icustay_id, a.intime, a.outtime,
        a.creatinine, a.creatinine_time,
        a.age, a.inclusion_age,
        a.length_of_stay, a.inclusion_length_of_stay,
        a.rrt, a.inclusion_rrt,
        a."""+row['name']+""", a."""+row['name']+"""_delay, a."""+row['name']+"""_time
    from cr_feat as a
        join (
            -- Get the min seenID for each personID
            select creatinine_time, min("""+row['name']+"""_delay) as """+row['name']+"""_delay
            from cr_feat
            group by creatinine_time
        ) as b on a.creatinine_time = b.creatinine_time
    where a."""+row['name']+"""_delay = b."""+row['name']+"""_delay
    """
    df = pd.read_sql_query(query, con)
    df = apply_inclusion_criteria(df)
    print(df.head())
    df_labevents = pd.merge(df_labevents,df,on=['subject_id', 'icustay_id', 'intime', 'outtime', 'creatinine',
       'creatinine_time', 'age', 'length_of_stay', 'rrt'],how='outer')
    print('---------------------------------')
    print('Merged table size : ' + str(df_labevents.shape[0]))
    print(df_labevents.head())

Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, inclusion_age, length_of_stay, inclusion_length_of_stay, rrt, inclusion_rrt, arterial_pressure_systolic, arterial_pressure_systolic_delay, arterial_pressure_systolic_time]
Index: []
Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, length_of_stay, rrt, arterial_pressure_systolic, arterial_pressure_systolic_delay, arterial_pressure_systolic_time]
Index: []
------------------------------------
--- Processing feature : Arterial_pressure_diastolic
Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, length_of_stay, rrt, arterial_pressure_diastolic, arterial_pressure_diastolic_delay, arterial_pressure_diastoli

  stride //= shape[i]


Initial size of table : 0
After dropping duplicates : 0
After applying inclusion criteria : 0
Empty DataFrame
Columns: [subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, length_of_stay, rrt, day_urine_output, day_urine_output_delay, day_urine_output_time]
Index: []
---------------------------------
Merged table size : 0
Empty DataFrame
Columns: [arterial_pressure_systolic, arterial_pressure_systolic_delay, arterial_pressure_systolic_time, arterial_pressure_diastolic, arterial_pressure_diastolic_delay, arterial_pressure_diastolic_time, heart_rate, heart_rate_delay, heart_rate_time, weight_daily, weight_daily_delay, weight_daily_time, temperature, temperature_delay, temperature_time, urine_output, urine_output_delay, urine_output_time, subject_id, icustay_id, intime, outtime, creatinine, creatinine_time, age, length_of_stay, rrt, day_urine_output, day_urine_output_delay, day_urine_output_time]
Index: []

[0 rows x 30 columns]
------------------------------------


Initial size of table : 116607
After dropping duplicates : 116427
After applying inclusion criteria : 108790
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         2.5   
1           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.9   
2           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.7   
3           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
4           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   

      creatinine_time        age  length_of_stay    rrt  sodium  sodium_delay  \
0 2101-10-21 03:00:00  76.526792      145.549444  False   139.0       18540.0   
1 2101-10-22 04:00:00  76.526792      145.549444  False   138.0       16380.0   
2 2101-10-22 21:15:00  76.526792      145.549444  False   138.0       78480.0   
3 2101-10-26 03:00:00  76.526792      145.549444  False   138.0      358380.0   

---------------------------------
Merged table size : 279205
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

---------------------------------
Merged table size : 279206
  arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                        NaN                              NaN   
1                        NaN                              NaN   
2                        NaN                              NaN   
3                        NaN                              NaN   
4                        NaN                              NaN   

  arterial_pressure_systolic_time arterial_pressure_diastolic  \
0                             NaN                         NaN   
1                             NaN                         NaN   
2                             NaN                         NaN   
3                             NaN                         NaN   
4                             NaN                         NaN   

  arterial_pressure_diastolic_delay arterial_pressure_diastolic_time  \
0                               NaN                              NaN   
1            

Initial size of table : 85961
After dropping duplicates : 85687
After applying inclusion criteria : 82464
   subject_id  icustay_id              intime             outtime  creatinine  \
1          31      254478 2108-08-22 23:28:42 2108-08-30 21:59:20         0.9   
2          31      254478 2108-08-22 23:28:42 2108-08-30 21:59:20         1.5   
4          62      216609 2113-02-15 00:20:44 2113-02-17 20:09:46         0.7   
5          62      216609 2113-02-15 00:20:44 2113-02-17 20:09:46         0.7   
6          62      216609 2113-02-15 00:20:44 2113-02-17 20:09:46         0.6   

      creatinine_time        age  length_of_stay    rrt  urinary_sodium  \
1 2108-08-29 03:00:00  72.267095      190.510556  False            21.0   
2 2108-08-30 01:45:00  72.267095      190.510556  False            21.0   
4 2113-02-15 14:32:00  68.773620       67.817222  False            70.0   
5 2113-02-16 17:30:00  68.773620       67.817222  False            70.0   
6 2113-02-17 05:23:00  68.773620

Initial size of table : 56779
After dropping duplicates : 56668
After applying inclusion criteria : 54771
   subject_id  icustay_id              intime             outtime  creatinine  \
0           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
1           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
2           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.7   
3           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         2.0   
4          62      216609 2113-02-15 00:20:44 2113-02-17 20:09:46         0.7   

      creatinine_time        age  length_of_stay    rrt  urine_urea  \
0 2149-11-12 03:00:00  41.790228      127.753333  False       636.0   
1 2149-11-13 03:20:00  41.790228      127.753333  False       636.0   
2 2149-11-13 16:04:00  41.790228      127.753333  False       636.0   
3 2149-11-14 06:00:00  41.790228      127.753333  False       636.0   
4 2113-02-15 14:32:00  68.773620       67.817222  Fa

Initial size of table : 189131
After dropping duplicates : 188784
After applying inclusion criteria : 174336
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
1           6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54         5.1   
2           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
3          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.2   
4          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.1   

      creatinine_time        age  length_of_stay    rrt  alkaline_phospatase  \
0 2101-10-25 03:00:00  76.526792      145.549444  False                 89.0   
1 2175-06-03 00:58:00  65.942297       88.150000  False                 55.0   
2 2149-11-13 03:20:00  41.790228      127.753333  False                 67.0   
3 2104-08-10 01:00:00  72.374177      183.235556  False                132.0   
4 21

Initial size of table : 176225
After dropping duplicates : 175862
After applying inclusion criteria : 164713
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.5   
1           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
2           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
3          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.2   
4          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.1   

      creatinine_time        age  length_of_stay    rrt  albumin  \
0 2101-10-24 03:15:00  76.526792      145.549444  False      1.8   
1 2101-10-26 03:00:00  76.526792      145.549444  False      1.8   
2 2149-11-13 03:20:00  41.790228      127.753333  False      3.0   
3 2104-08-10 01:00:00  72.374177      183.235556  False      2.4   
4 2104-08-11 03:00:00  72.374177      183.235556  False      2.4

Initial size of table : 193432
After dropping duplicates : 193018
After applying inclusion criteria : 177120
   subject_id  icustay_id              intime             outtime  creatinine  \
0           3      211552 2101-10-20 19:10:11 2101-10-26 20:43:09         1.3   
1           6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54         5.1   
2           9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14         1.6   
3          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.2   
4          12      232669 2104-08-08 02:08:17 2104-08-15 17:22:25         1.1   

      creatinine_time        age  length_of_stay    rrt  bilirubin  \
0 2101-10-25 03:00:00  76.526792      145.549444  False        0.8   
1 2175-06-03 00:58:00  65.942297       88.150000  False        0.3   
2 2149-11-13 03:20:00  41.790228      127.753333  False        0.4   
3 2104-08-10 01:00:00  72.374177      183.235556  False        9.9   
4 2104-08-11 03:00:00  72.374177      183.235556  Fals

In [None]:
# REMOVE COLUMNS THAT WERE USED FOR INCLUSION CRITERIA BUT THAT ARE NOT AVAILABLE FEATURES FOR THE PREDICTION
# REMOVE ICUSTAY_IDS FOR WHICH THERE IS ONLY ONE MEASUREMENT OF CREATININE

#print('Number of lines : ' + str(last_df.shape[0]))
#to_remove = ['intime','outtime','length_of_stay','rrt']
#for c in to_remove:
#    if (c in last_df.columns.values): last_df = last_df.drop(c,axis=1)

# Remove icustay_ids for which there's only one measurement of creatinine
#count_mes = last_df['icustay_id'].value_counts()
#to_remove = count_mes.index.values[count_mes==1]
#for i in to_remove:
#    last_df = last_df.loc[last_df['icustay_id']!=i,:]

#print('After dropping unique measurements of creatinine : ' + str(last_df.shape[0]))
#last_df.head()

In [7]:
# Dump to file
df_labevents.to_csv('creatinine_measurements_1.csv')
print(df_labevents)

       arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                             NaN                              NaN   
1                             NaN                              NaN   
2                             NaN                              NaN   
3                             NaN                              NaN   
4                             NaN                              NaN   
5                             NaN                              NaN   
6                             NaN                              NaN   
7                             NaN                              NaN   
8                             NaN                              NaN   
9                             NaN                              NaN   
10                            NaN                              NaN   
11                            NaN                              NaN   
12                            NaN                              NaN   
13                  

# Retrieve missing static information

In [8]:
# Creatinine measurements, AKI flags + stage for all stays
# Inclusion flags :
# patient age > 15
# length of stay > 48hours
query = query_schema + """
with cr as
(
select
    icu.subject_id, icu.icustay_id, icu.intime, icu.outtime, EXTRACT(EPOCH FROM icu.outtime - icu.intime)/60.0/60.0 as length_of_stay,
    le.valuenum as creatinine, le.charttime as creatinine_time,
    adm.ethnicity, adm.diagnosis,
    pat.gender as gender,
    EXTRACT('epoch' from icu.intime - pat.dob) / 60.0 / 60.0 / 24.0 / 365.242 AS age,
    (rrt.icustay_id is null) as rrt
  from icustays icu
  inner join labevents le
    on icu.subject_id = le.subject_id
    and le.itemid = 50912
    and le.valuenum is not null
    and le.charttime between icu.intime and icu.outtime
  inner join patients pat
    ON icu.subject_id = pat.subject_id
  inner join admissions adm
    on icu.subject_id = adm.subject_id
  left outer join rrt 
    on icu.icustay_id = rrt.icustay_id
)
select
cr.subject_id, cr.icustay_id, cr.intime, cr.outtime,
    cr.creatinine, cr.creatinine_time,
    cr.ethnicity, cr.diagnosis,
    cr.gender,
    cr.age, CASE
                WHEN cr.age >= 15 then 1
            ELSE 0 END
            as inclusion_age,
  cr.length_of_stay, CASE
                        WHEN cr.length_of_stay >= 48 then 1
                     ELSE 0 END
                     as inclusion_length_of_stay,
  cr.rrt, CASE
            WHEN cr.rrt = False then 1
          ELSE 0 END
          as inclusion_rrt
  from cr
"""
df_static = pd.read_sql_query(query, con)
print(df_static.head())
df_static = apply_inclusion_criteria(df_static)
print(df_static.head())

   subject_id  icustay_id              intime             outtime  creatinine  \
0           4      294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.5   
1           4      294638 2191-03-16 00:29:31 2191-03-17 16:46:31         0.4   
2           6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54        10.0   
3           6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54         7.4   
4           6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54         6.2   

      creatinine_time ethnicity                            diagnosis gender  \
0 2191-03-16 05:42:00     WHITE  FEVER,DEHYDRATION,FAILURE TO THRIVE      F   
1 2191-03-17 06:00:00     WHITE  FEVER,DEHYDRATION,FAILURE TO THRIVE      F   
2 2175-05-31 01:48:00     WHITE            CHRONIC RENAL FAILURE/SDA      F   
3 2175-06-01 03:00:00     WHITE            CHRONIC RENAL FAILURE/SDA      F   
4 2175-06-02 02:06:00     WHITE            CHRONIC RENAL FAILURE/SDA      F   

         age  inclusion_age  length_of

In [None]:
# REMOVE COLUMNS THAT WERE USED FOR INCLUSION CRITERIA BUT THAT ARE NOT AVAILABLE FEATURES FOR THE PREDICTION
# REMOVE ICUSTAY_IDS FOR WHICH THERE IS ONLY ONE MEASUREMENT OF CREATININE

#print('Number of lines : ' + str(df.shape[0]))
#to_remove = ['intime','outtime','length_of_stay','rrt']
#for c in to_remove:
#    if (c in df.columns.values): df = df.drop(c,axis=1)

# Remove icustay_ids for which there's only one measurement of creatinine
#count_mes = df['icustay_id'].value_counts()
#to_remove = count_mes.index.values[count_mes==1]
#for i in to_remove:
#    df = df.loc[df['icustay_id']!=i,:]

#print('After dropping unique measurements of creatinine : ' + str(df.shape[0]))
#print(df.head())

In [9]:
# Dump to file
df_static.to_csv('creatinine_measurements_2.csv')
print(df_static)

        subject_id  icustay_id              intime             outtime  \
2                6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54   
3                6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54   
4                6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54   
5                6      228232 2175-05-30 21:30:54 2175-06-03 13:39:54   
6                9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14   
9                9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14   
10               9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14   
11               9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14   
12               9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14   
13               9      220597 2149-11-09 13:07:02 2149-11-14 20:52:14   
14              13      263738 2167-01-08 18:44:25 2167-01-12 10:43:31   
15              13      263738 2167-01-08 18:44:25 2167-01-12 10:43:31   
16              13      263738 2167-01

# Merge tables into one

In [10]:
print('Labevents :')
print(df_labevents.shape)
print(df_labevents.columns.values)
print('')

print('Static:')
print(df_static.shape)
print(df_static.columns.values)

Labevents :
(279206, 117)
['arterial_pressure_systolic' 'arterial_pressure_systolic_delay'
 'arterial_pressure_systolic_time' 'arterial_pressure_diastolic'
 'arterial_pressure_diastolic_delay' 'arterial_pressure_diastolic_time'
 'heart_rate' 'heart_rate_delay' 'heart_rate_time' 'weight_daily'
 'weight_daily_delay' 'weight_daily_time' 'temperature' 'temperature_delay'
 'temperature_time' 'urine_output' 'urine_output_delay' 'urine_output_time'
 'day_urine_output' 'day_urine_output_delay' 'day_urine_output_time'
 'subject_id' 'icustay_id' 'intime' 'outtime' 'creatinine'
 'creatinine_time' 'age' 'length_of_stay' 'rrt' 'scr' 'scr_delay'
 'scr_time' 'urea' 'urea_delay' 'urea_time' 'sodium' 'sodium_delay'
 'sodium_time' 'potassium' 'potassium_delay' 'potassium_time' 'calcium'
 'calcium_delay' 'calcium_time' 'phosphor' 'phosphor_delay' 'phosphor_time'
 'leukocytes' 'leukocytes_delay' 'leukocytes_time' 'hemoglobine'
 'hemoglobine_delay' 'hemoglobine_time' 'bicarbonate' 'bicarbonate_delay'
 'bic

In [11]:
merged_df = pd.merge(df_labevents,df_static,on=['subject_id', 'icustay_id', 'intime', 'outtime', 'creatinine',
       'creatinine_time', 'age', 'length_of_stay', 'rrt'],how='outer')

# Remove columns that are not features (except icustay_id and patient_id that are needed to build the table with labels)

In [21]:
# REMOVE COLUMNS THAT WERE USED FOR INCLUSION CRITERIA BUT THAT ARE NOT AVAILABLE FEATURES FOR THE PREDICTION
# REMOVE ICUSTAY_IDS FOR WHICH THERE IS ONLY ONE MEASUREMENT OF CREATININE

print('Number of lines : ' + str(merged_df.shape[0]))
to_remove = ['intime','outtime','length_of_stay','rrt']
for c in to_remove:
    if (c in merged_df.columns.values): merged_df = merged_df.drop(c,axis=1)
        
# Remove columns with _time suffix
to_remove = [i for i in merged_df.columns.values if '_time' in i]
for c in to_remove:
    if (c in merged_df.columns.values): merged_df = merged_df.drop(c,axis=1)

# Remove icustay_ids for which there's only one measurement of creatinine
count_mes = merged_df['icustay_id'].value_counts()
to_remove = count_mes.index.values[count_mes==1]
for i in to_remove:
    merged_df = merged_df.loc[merged_df['icustay_id']!=i,:]

print('After dropping unique measurements of creatinine : ' + str(merged_df.shape[0]))
merged_df.head()

Number of lines : 279036
After dropping unique measurements of creatinine : 279036


Unnamed: 0,arterial_pressure_systolic,arterial_pressure_systolic_delay,arterial_pressure_diastolic,arterial_pressure_diastolic_delay,heart_rate,heart_rate_delay,weight_daily,weight_daily_delay,temperature,temperature_delay,...,albumin_delay,total_protein_urine,total_protein_urine_delay,bilirubin,bilirubin_delay,c_reactive_protein,c_reactive_protein_delay,ethnicity,diagnosis,gender
0,,,,,,,,,,,...,,,,,,,,WHITE,HYPOTENSION,M
1,,,,,,,,,,,...,0.0,,,8.2,0.0,,,WHITE,PANCREATIC CANCER/SDA,M
2,,,,,,,,,,,...,,,,,,,,WHITE,PANCREATIC CANCER/SDA,M
3,,,,,,,,,,,...,74460.0,,,0.4,74460.0,,,WHITE,CORONARY ARTERY DISEASE,F
4,,,,,,,,,,,...,,,,0.4,170640.0,,,WHITE,SEPSIS,M


In [22]:
# Dump to file
merged_df.to_csv('creatinine_measurements_merged.csv')
print(merged_df)

       arterial_pressure_systolic arterial_pressure_systolic_delay  \
0                             NaN                              NaN   
1                             NaN                              NaN   
2                             NaN                              NaN   
3                             NaN                              NaN   
4                             NaN                              NaN   
5                             NaN                              NaN   
6                             NaN                              NaN   
7                             NaN                              NaN   
8                             NaN                              NaN   
9                             NaN                              NaN   
10                            NaN                              NaN   
11                            NaN                              NaN   
12                            NaN                              NaN   
13                  