In [None]:
# TensorFlow is used in this notebook for tf.io.gfile.glob, which locates the chartevents shard files
%pip install tensorflow

In [None]:
# NOTE(review): no visible cell imports a `Timestamp` module (all date handling below goes through pandas);
# confirm this package is actually required before installing it
%pip install Timestamp

In [None]:
import pandas as pd


In [None]:
dataset = 'mimiciv' # Folder under data/ that holds the input files (data/mimiciv/...); also part of the output file name
version = 'iv' # 'iv' or 'iii'; selects the version-specific age filter applied to ventdurations

In [None]:
# Load the input tables of the selected dataset from data/<dataset>/
ventdurations = pd.read_csv(f'data/{dataset}/ventdurations.csv')
d_items = pd.read_csv(f'data/{dataset}/d_items.csv.gz')
patients = pd.read_csv(f'data/{dataset}/patients.csv.gz')
icustays = pd.read_csv(f'data/{dataset}/icustays.csv.gz')

# Alternative to the sharded read: use this when chartevents is stored in a single file
#chartevents = pd.read_csv(f'data/{dataset}/chartevents.csv') 

In [None]:
import tensorflow as tf
from tensorflow.python.lib.io import file_io

def read_csv_file(filename):
    """Load one CSV file into a DataFrame.

    Parameters
    ----------
    filename
        Path or file-like object; handed to ``pandas.read_csv`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(filename)

def read_csv_files(filename_pattern):
    """Read every CSV file matching a glob pattern into a single DataFrame.

    Parameters
    ----------
    filename_pattern : str
        Glob pattern passed to ``tf.io.gfile.glob``.

    Returns
    -------
    pandas.DataFrame
        The rows of all matching files, concatenated in sorted file-name
        order and re-indexed 0..n-1.

    Raises
    ------
    FileNotFoundError
        If no file matches ``filename_pattern``.
    """
    # Sort so the row order does not depend on the order the glob happens to return
    filenames = sorted(tf.io.gfile.glob(filename_pattern))
    if not filenames:
        # pd.concat([]) would otherwise fail with the unhelpful "No objects to concatenate"
        raise FileNotFoundError(f'No files match pattern: {filename_pattern!r}')
    dataframes = [read_csv_file(filename) for filename in filenames]
    # Every shard starts its own index at 0; ignore_index avoids duplicate row labels
    return pd.concat(dataframes, ignore_index=True)
# Sharded read: load every file whose name starts with "chartevents" and concatenate them
chartevents = read_csv_files(f'data/{dataset}/chartevents*') 


In [None]:
# Per-column memory usage in bytes before the optimizations
# (deep=True also measures the contents of object columns)
chartevents.memory_usage(deep=True)

In [None]:
# Reduce memory usage

# Normalise the column names first: every statement in this cell addresses columns by
# lowercase name, so a file with uppercase headers would otherwise fail with a KeyError.
chartevents_columns = [name.lower() for name in chartevents.columns]
# The ICU stay key is called stay_id in the version 'iv' icustays table; accept that name
# here as well when no icustay_id column is present.
if 'icustay_id' not in chartevents_columns:
    chartevents_columns = ['icustay_id' if name == 'stay_id' else name for name in chartevents_columns]
chartevents.columns = chartevents_columns

# Drop unused columns and unusable rows before converting dtypes, so the conversions touch less data
chartevents = chartevents.drop(columns = ['subject_id','row_id','hadm_id','cgid', 'storetime', 'value', 'warning', 'error', 'resultstatus', 'stopped'],  errors='ignore')
# Keep only rows that have both a numeric value and an ICU stay id
chartevents = chartevents[chartevents['valuenum'].notna() & chartevents['icustay_id'].notna()]

chartevents["valueuom"] = chartevents["valueuom"].astype("category")
chartevents['charttime'] = pd.to_datetime(chartevents['charttime'])
# Safe to cast: rows with a missing icustay_id were removed above
chartevents['icustay_id'] = chartevents['icustay_id'].astype(int)
chartevents.memory_usage(deep=True)

In [None]:
# Normalise the column names of every table to lowercase
for _table in (patients, icustays, d_items, ventdurations, chartevents):
    _table.columns = [name.lower() for name in _table.columns]
# Do not keep an extra reference to the last (large) table alive
del _table

In [None]:
# Build the cohort: one row per ICU stay with at least one day of ventilation in total,
# restricted to patients aged 18 or older. Only the index (icustay_id) and the first
# ventilation start time are kept.

# Fail early, before any table is modified, if the configured version is not supported
if version not in ('iii', 'iv'):
    raise ValueError(f"Unsupported version {version!r}; expected 'iii' or 'iv'")

# Parse the timestamps and compute the length of every ventilation episode
ventdurations['endtime'] = pd.to_datetime(ventdurations['endtime'])
ventdurations['starttime'] = pd.to_datetime(ventdurations['starttime'])
ventdurations['duration'] = ventdurations['endtime'] - ventdurations['starttime']

# Discard rows without an icustay_id, then store the id as an integer
ventdurations = ventdurations[ventdurations['icustay_id'].notna()]
ventdurations['icustay_id'] = ventdurations['icustay_id'].astype(int)

# Collapse to one row per ICU stay: summed duration, earliest start, latest end
ventdurations = ventdurations.groupby(by = 'icustay_id').agg(
    duration=pd.NamedAgg(column="duration", aggfunc="sum"),
    starttime=pd.NamedAgg(column="starttime", aggfunc="min"),
    endtime=pd.NamedAgg(column="endtime", aggfunc="max"))

# Keep stays whose summed ventilation time is at least one day
ventdurations = ventdurations[ventdurations['duration'] >= pd.to_timedelta('1 day')]

if version == 'iii':
    # Map every ICU stay to the date of birth of its patient
    patients = patients[['subject_id','dob']].set_index('subject_id')
    icustays = icustays[['subject_id','icustay_id']]
    icustay_to_dob = icustays.join(patients, on = 'subject_id').drop(columns = 'subject_id').set_index('icustay_id')
    icustay_to_dob['dob'] = pd.to_datetime(icustay_to_dob['dob'])

    # Keep patients that are at least 18 years of age when ventilation starts
    ventdurations = ventdurations.join(icustay_to_dob)
    start = ventdurations['starttime']
    dob = ventdurations['dob']
    # Age in completed calendar years. Calendar components are compared instead of
    # subtracting the timestamps, so no Timedelta (limited to roughly 292 years) is built,
    # and leap days are handled exactly rather than approximating 18 years as 18*365 days.
    birthday_pending = (start.dt.month < dob.dt.month) | (
        (start.dt.month == dob.dt.month) & (start.dt.day < dob.dt.day))
    age_years = start.dt.year - dob.dt.year - birthday_pending.astype(int)
    # Stays without a known date of birth give NaN here and are therefore dropped
    ventdurations = ventdurations[age_years >= 18]

    ventdurations = ventdurations.drop(columns = ['duration','startdate','endtime','dob'], errors='ignore')

else:
    # Version iv: the patients table carries anchor_age instead of a date of birth,
    # and the ICU stay key is named stay_id
    patients = patients[patients['anchor_age']>=18]
    patients = patients[['subject_id', 'anchor_age']]
    icustays = icustays[['subject_id', 'stay_id']]
    icustays = icustays.set_index('subject_id')
    patients = patients.set_index('subject_id')

    icustays['icustay_id'] = icustays['stay_id'] 
    # Inner join keeps only the ICU stays of patients that passed the age filter
    icu_patients = icustays.join(patients, how = 'inner')
    icu_patients = icu_patients.set_index('icustay_id')

    # Inner join on the icustay_id index restricts ventdurations to those stays
    ventdurations = ventdurations.join(icu_patients, how = 'inner')
    ventdurations = ventdurations.drop(columns = ['duration','endtime','stay_id','anchor_age'], errors='ignore')


ventdurations
    

In [None]:
# Restrict chartevents to the ICU stays listed in ventdurations (inner join on icustay_id)
# and keep only the measurements charted from the start of ventilation up to one day later
# (both ends inclusive).
chartevents = chartevents.join(ventdurations, on = 'icustay_id', how = 'inner')
rowfilter = chartevents['charttime'].between(
    chartevents['starttime'],
    chartevents['starttime'] + pd.to_timedelta('1 day'),
)

# The timestamps are not needed once the window has been applied
chartevents = chartevents[rowfilter].drop(columns = ['charttime','starttime'], errors='ignore')
chartevents

In [None]:
# Reduce d_items to a lookup table: itemid (index) -> label
d_items = d_items.set_index('itemid')[['label']]
d_items

In [None]:
# Replace the itemid column with the matching label from d_items.
# This matters because several itemids can map to the same label.
chartevents = chartevents.join(d_items, on = 'itemid').drop(columns = 'itemid')

In [None]:
# Sanity check: every label should be recorded in one unit of measurement only
units_of_measurement = (
    chartevents.loc[chartevents['valueuom'].notna()]
    .groupby(by = ['label'])
    .agg(valueuom=pd.NamedAgg(column="valueuom", aggfunc="unique"))
    .reset_index()
)
has_several_units = units_of_measurement['valueuom'].map(len) > 1
units_of_measurement[has_several_units]
# The displayed table should be empty, meaning no label is measured in more than one unit.
# For mimiciv "Absolute count" has 2 different units of measurement, but it will be discarded anyway.

In [None]:
# Median value of every label within every ICU stay
medianevents = (
    chartevents
    .groupby(['icustay_id', 'label'])
    .agg(valuenum=('valuenum', 'median'))
)

medianevents

In [None]:
# Pivot to one row per ICU stay and one column per label.
# medianevents is already indexed by (icustay_id, label), so the label level is unstacked directly.
measurements_by_person = medianevents.unstack()

# Drop the columns in which more than 30% of the stays have no value
nan_share = measurements_by_person.isnull().mean()
measurements_by_person = measurements_by_person.loc[:, nan_share <= 0.3]

# Fill the remaining gaps with the most frequent value of each column
column_modes = measurements_by_person.mode().iloc[0]
measurements_by_person = measurements_by_person.fillna(column_modes)

# Flatten the ('valuenum', label) column pairs to the plain label
measurements_by_person.columns = measurements_by_person.columns.map(lambda pair: pair[1])
measurements_by_person = measurements_by_person.reset_index()
measurements_by_person

In [None]:
import os

# DataFrame.to_csv does not create missing directories, so make sure output/ exists first
os.makedirs('output', exist_ok=True)
measurements_by_person.to_csv(f'output/intermediate-{dataset}.csv', index = False)