In [5]:
import pandas as pd
import numpy as np
import psycopg2
import getpass
import gc
from tqdm import tqdm

In [6]:
# PostgreSQL connection settings.
dbname = 'mimiciv'
user = 'postgres'
# Database server address; use '127.0.0.1' instead when connecting to a local instance.
host = '10.10.116.166'
port = 8088
# Schema list; NOTE(review): no visible cell reads this value - confirm whether
# a `SET search_path` step was intended.
schema = 'public, mimiciv_derived, mimiciv_hosp, mimiciv_icu'

In [8]:
# Connect to the database. The password is prompted for interactively via
# getpass, so it is never stored in the notebook.
con = psycopg2.connect(
    dbname=dbname,
    user=user,
    password=getpass.getpass(),
    host=host,
    port=port,
)
cur = con.cursor()


········


In [9]:
# Path of the tab-separated file written at the end of the notebook.
OUTPUT_FILENAME = './data_seq_ori.tsv'

# Source queries, one per input table.
KDIGO_STAGES = 'select * from kdigo_stages_0721'
LABS = 'select * from labs_all_icu_mean_0721'
VITALS = 'select * from vital_all_icu_0721'

# vitals

In [10]:
# Load the whole vitals table into memory over the open connection.
vitals = pd.read_sql_query(sql=VITALS, con=con)

In [11]:
# Discard the subject_id column, then preview the first rows.
vitals = vitals.drop(columns=['subject_id'])
vitals.head()

Unnamed: 0,hadm_id,stay_id,charttime,heartrate_mean,sysbp_mean,diasbp_mean,meanbp_mean,resprate_mean,tempc_mean,spo2_mean,glucose_mean
0,26184834,37510196,2131-01-11 04:22:00,38.0,,,,,,,
1,26184834,37510196,2131-01-11 04:25:00,,180.0,12.0,46.0,,,,
2,26184834,37510196,2131-01-11 05:00:00,60.0,,,,10.0,,98.0,
3,26184834,37510196,2131-01-11 05:01:00,,167.0,49.0,70.0,,,,
4,26184834,37510196,2131-01-11 06:00:00,72.0,,,,20.0,,100.0,


In [12]:
# Sanity check: (rows, columns) and the dtype of each column after loading.
vitals.shape, vitals.dtypes

((6443813, 11),
 hadm_id                    int64
 stay_id                    int64
 charttime         datetime64[ns]
 heartrate_mean           float64
 sysbp_mean               float64
 diasbp_mean              float64
 meanbp_mean              float64
 resprate_mean            float64
 tempc_mean               float64
 spo2_mean                float64
 glucose_mean             float64
 dtype: object)

In [13]:
# Number of distinct hadm_id and stay_id values in the raw vitals.
vitals['hadm_id'].nunique(), vitals['stay_id'].nunique()

(27499, 28658)

In [14]:
# Positional rename: the three key columns keep their names, the measurement
# columns lose their `_mean` suffix.
vitals.columns = [
    'hadm_id', 'stay_id', 'charttime',
    'heartrate', 'sysbp', 'diasbp', 'meanbp',
    'resprate', 'tempc', 'spo2', 'glucose',
]
# Drop rows in which every measurement (column 4 onward) is missing, then
# order the rows by stay and chart time.
vitals = (
    vitals
    .dropna(subset=vitals.columns[3:], how='all')
    .sort_values(by=['stay_id', 'charttime'])
)

In [15]:
# Shape after cleaning, and which columns still contain missing values.
vitals.shape, vitals.isna().any()

((6439069, 11),
 hadm_id      False
 stay_id      False
 charttime    False
 heartrate     True
 sysbp         True
 diasbp        True
 meanbp        True
 resprate      True
 tempc         True
 spo2          True
 glucose       True
 dtype: bool)

In [16]:
# Re-count distinct ids to see whether the row filter removed any admission or stay.
vitals['hadm_id'].nunique(), vitals['stay_id'].nunique()

(27499, 28658)

# labs

In [17]:
# Load the whole labs table into memory over the open connection.
labs = pd.read_sql_query(sql=LABS, con=con)

In [18]:
# Discard the subject_id column, then preview the first rows.
labs = labs.drop(columns=['subject_id'])
labs.head()

Unnamed: 0,hadm_id,stay_id,charttime,aniongap_mean,albumin_mean,bands_mean,bicarbonate_mean,bilirubin_mean,creatinine_mean,chloride_mean,...,hemoglobin_mean,lactate_mean,platelet_mean,potassium_mean,ptt_mean,inr_mean,pt_mean,sodium_mean,bun_mean,wbc_mean
0,26184834,37510196,2131-01-11 06:31:00,11.0,3.5,1.0,33.0,0.4,1.1,96.0,...,11.4,,171.0,4.0,21.3,1.4,15.3,136.0,30.0,18.4
1,26184834,37510196,2131-01-11 06:37:00,,,,,,,90.0,...,12.1,1.5,,3.9,,,,134.0,,
2,26184834,37510196,2131-01-11 11:33:00,,,,,,,,...,,1.1,,,,,,,,
3,26184834,37510196,2131-01-11 16:06:00,,,,,,,,...,,,,,106.5,,,,,
4,26184834,37510196,2131-01-11 22:00:00,,,,,,,,...,,,,,36.5,,,,,


In [19]:
# Sanity check: (rows, columns) and the dtype of each column after loading.
labs.shape, labs.dtypes

((594731, 22),
 hadm_id                      int64
 stay_id                      int64
 charttime           datetime64[ns]
 aniongap_mean              float64
 albumin_mean               float64
 bands_mean                 float64
 bicarbonate_mean           float64
 bilirubin_mean             float64
 creatinine_mean            float64
 chloride_mean              float64
 glucose_mean               float64
 hematocrit_mean            float64
 hemoglobin_mean            float64
 lactate_mean               float64
 platelet_mean              float64
 potassium_mean             float64
 ptt_mean                   float64
 inr_mean                   float64
 pt_mean                    float64
 sodium_mean                float64
 bun_mean                   float64
 wbc_mean                   float64
 dtype: object)

In [20]:
# Number of distinct hadm_id and stay_id values in the raw labs.
labs['hadm_id'].nunique(), labs['stay_id'].nunique()

(27499, 28658)

In [21]:
# Positional rename: the three key columns keep their names, the measurement
# columns lose their `_mean` suffix.
labs.columns = [
    'hadm_id', 'stay_id', 'charttime',
    'aniongap', 'albumin', 'bands', 'bicarbonate', 'bilirubin', 'creatinine',
    'chloride', 'glucose', 'hematocrit', 'hemoglobin', 'lactate', 'platelet',
    'potassium', 'ptt', 'inr', 'pt', 'sodium', 'bun', 'wbc',
]
# Drop rows in which every measurement (column 4 onward) is missing, then
# order the rows by stay and chart time.
labs = (
    labs
    .dropna(subset=labs.columns[3:], how='all')
    .sort_values(by=['stay_id', 'charttime'])
)

In [22]:
# Shape after cleaning, and which columns still contain missing values.
labs.shape, labs.isna().any()

((594662, 22),
 hadm_id        False
 stay_id        False
 charttime      False
 aniongap        True
 albumin         True
 bands           True
 bicarbonate     True
 bilirubin       True
 creatinine      True
 chloride        True
 glucose         True
 hematocrit      True
 hemoglobin      True
 lactate         True
 platelet        True
 potassium       True
 ptt             True
 inr             True
 pt              True
 sodium          True
 bun             True
 wbc             True
 dtype: bool)

In [23]:
# Re-count distinct ids to see whether the row filter removed any admission or stay.
labs['hadm_id'].nunique(), labs['stay_id'].nunique()

(27439, 28590)

# kdigo stages

In [24]:
# Load the whole KDIGO stages table into memory over the open connection.
kdigo_stages = pd.read_sql_query(sql=KDIGO_STAGES, con=con)

In [25]:
# Discard the subject id and the ICU in/out timestamps, then preview the first rows.
kdigo_stages = kdigo_stages.drop(columns=['subject_id', 'icu_intime', 'icu_outtime'])
kdigo_stages.head()

Unnamed: 0,hadm_id,stay_id,charttime,creat,uo_rt_6hr,uo_rt_12hr,uo_rt_24hr,aki_stage
0,20007905,34104518,2189-08-01 18:20:00,,1.6216,1.0628,0.9226,0
1,20007905,34104518,2189-08-01 16:00:00,,1.3023,0.8424,0.7597,0
2,20007905,34104518,2189-08-01 08:21:00,,0.4662,0.5204,0.4544,1
3,20007905,34104518,2189-08-01 05:21:00,1.1,,,,0
4,20007905,34104518,2189-07-31 23:00:00,,0.5837,0.4475,0.5895,2


In [26]:
# Sanity check: (rows, columns) and the dtype of each column after loading.
kdigo_stages.shape, kdigo_stages.dtypes

((2669937, 8),
 hadm_id                int64
 stay_id                int64
 charttime     datetime64[ns]
 creat                float64
 uo_rt_6hr            float64
 uo_rt_12hr           float64
 uo_rt_24hr           float64
 aki_stage              int64
 dtype: object)

In [27]:
# Drop rows in which every column from the fourth onward is missing.
# NOTE(review): aki_stage is part of that subset; if it is never null this
# filter removes nothing - confirm that is acceptable.
kdigo_stages = kdigo_stages.dropna(subset=kdigo_stages.columns[3:], how='all')

In [28]:
# Shape after the filter, and which columns still contain missing values.
kdigo_stages.shape, kdigo_stages.isna().any()

((2669937, 8),
 hadm_id       False
 stay_id       False
 charttime     False
 creat          True
 uo_rt_6hr      True
 uo_rt_12hr     True
 uo_rt_24hr     True
 aki_stage     False
 dtype: bool)

In [29]:
# Number of distinct hadm_id and stay_id values in the KDIGO stages.
kdigo_stages['hadm_id'].nunique(), kdigo_stages['stay_id'].nunique()

(27499, 28658)

# Merge glucose from vitals and labs, and serum creatinine (scr) from kdigo_stages and labs

In [30]:
# Pull the non-missing glucose readings, with their key columns, out of each
# source as independent copies.
glucose_cols = ['hadm_id', 'stay_id', 'charttime', 'glucose']
glucose = vitals.loc[vitals['glucose'].notna(), glucose_cols].copy()
glucose_lab = labs.loc[labs['glucose'].notna(), glucose_cols].copy()
del glucose_cols

In [31]:
# Row counts of the vitals-derived and labs-derived glucose readings.
glucose.shape, glucose_lab.shape

((874512, 4), (351350, 4))

In [32]:
# Stack the vitals and labs glucose readings and drop exact duplicate rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
# NOTE(review): only fully identical rows are removed, so two readings that
# share (hadm_id, stay_id, charttime) but differ in value are both kept -
# confirm this is intended before the key-based merge.
glucose = (
    pd.concat([glucose, glucose_lab], ignore_index=True)
    .drop_duplicates(keep='first')
)
glucose.shape

(880650, 4)

In [33]:
# Glucose now lives in its own frame: remove it from both sources and drop any
# row that is left without a single measurement.
vitals = (
    vitals
    .drop(columns=['glucose'])
    .pipe(lambda frame: frame.dropna(subset=frame.columns[3:], how='all'))
)
labs = (
    labs
    .drop(columns=['glucose'])
    .pipe(lambda frame: frame.dropna(subset=frame.columns[3:], how='all'))
)

In [34]:
# Pull the non-missing creatinine readings, with their key columns, out of the
# KDIGO stages (`creat`) and the labs (`creatinine`) as independent copies.
scr = kdigo_stages.loc[
    kdigo_stages['creat'].notna(),
    ['hadm_id', 'stay_id', 'charttime', 'creat'],
].copy()
scr_lab = labs.loc[
    labs['creatinine'].notna(),
    ['hadm_id', 'stay_id', 'charttime', 'creatinine'],
].copy()

In [35]:
# Row counts of the KDIGO-derived and labs-derived creatinine readings.
scr.shape, scr_lab.shape

((277151, 4), (276543, 4))

In [36]:
# Align the KDIGO column name with the labs one so the two frames stack into a
# single `creatinine` column.
scr = scr.rename(columns={'creat': 'creatinine'})
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; exact duplicate rows are then dropped, keeping the
# first occurrence.
scr = (
    pd.concat([scr, scr_lab], ignore_index=True)
    .drop_duplicates(keep='first')
)
scr.shape

(277152, 4)

In [37]:
# Creatinine now lives in its own frame: remove it from both sources and drop
# any row in which every column from the fourth onward is missing.
kdigo_stages = (
    kdigo_stages
    .drop(columns=['creat'])
    .pipe(lambda frame: frame.dropna(subset=frame.columns[3:], how='all'))
)
labs = (
    labs
    .drop(columns=['creatinine'])
    .pipe(lambda frame: frame.dropna(subset=frame.columns[3:], how='all'))
)

# Merge vitals, labs, glucose, scr, and kdigo_stages on (hadm_id, stay_id, charttime)

In [38]:
# Outer-join all five frames on the shared key so that every timestamp seen in
# any of them is kept.
merge_axis = ['hadm_id', 'stay_id', 'charttime']
data = (
    vitals
    .merge(labs, on=merge_axis, how='outer')
    .merge(glucose, on=merge_axis, how='outer')
    .merge(scr, on=merge_axis, how='outer')
    .merge(kdigo_stages, on=merge_axis, how='outer')
)
# Release the inputs, which are no longer needed once merged.
del vitals, labs, glucose, glucose_lab, scr, scr_lab, kdigo_stages
gc.collect()

0

In [39]:
# Column layout of the merged frame.
data.columns

Index(['hadm_id', 'stay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp',
       'meanbp', 'resprate', 'tempc', 'spo2', 'aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'chloride', 'hematocrit', 'hemoglobin',
       'lactate', 'platelet', 'potassium', 'ptt', 'inr', 'pt', 'sodium', 'bun',
       'wbc', 'glucose', 'creatinine', 'uo_rt_6hr', 'uo_rt_12hr', 'uo_rt_24hr',
       'aki_stage'],
      dtype='object')

In [40]:
# (rows, columns) of the merged frame.
data.shape

(6955511, 33)

In [41]:
# Number of distinct hadm_id and stay_id values in the merged frame.
data['hadm_id'].nunique(), data['stay_id'].nunique()

(27499, 28658)

In [42]:
# Check whether any of the first three (key) columns contains a missing value.
data.iloc[:, :3].isna().any()

hadm_id      False
stay_id      False
charttime    False
dtype: bool

In [43]:
# Order rows by admission, stay and chart time, and renumber the index from 0.
data = data.sort_values(by=['hadm_id', 'stay_id', 'charttime'], ignore_index=True)

In [45]:
# Write the merged table as a tab-separated file, without the index column.
data.to_csv(path_or_buf=OUTPUT_FILENAME, index=False, sep='\t')

In [46]:
# Release the database handles: the cursor first, then the connection.
# NOTE(review): no visible cell uses `cur`; the loads go through `con` directly.
cur.close()
con.close()