In [1]:
import pandas as pd
import numpy as np
import os
import timeit
from tqdm.notebook import trange, tqdm
import re
import matplotlib.pyplot as plt

In [2]:
# Selects which directory tree the path constants point at:
# True -> 'data_demo/...', False -> 'data/...'.
DEMO = False
# When True, the entries collected in `failed` by the processing loop are
# written to STEP_FAIL_FILE at the end of the notebook.
WRITE_ERRORS_TO_FILE = True

In [3]:

# Root of the per-ICU-stay working directories: demo subset or full dataset.
DATA_ROOT = 'data_demo/' if DEMO else 'data/'

# Per-stay directories. Every constant is derived from DATA_ROOT so that the
# same set of names exists in both modes; previously the demo branch did not
# define DEMO_BY_ICUSTAY_ID / ICD9_BY_ICUSTAY_ID, which made the ICD-9 output
# step raise NameError for every stay when DEMO was True.
CHARTEVENTS_BY_ICUSTAY_ID = DATA_ROOT + 'parquet/'
REDUCED_CE_BY_ICUSTAY_ID = DATA_ROOT + 'samples/'
NUMERICAL_BY_ICUSTAY_ID = DATA_ROOT + 'samples_numerical/'
STATB_BY_ICUSTAY_ID = DATA_ROOT + 'samples_statB/'
STATB_CSV_BY_ICUSTAY_ID = DATA_ROOT + 'samples_statB_csv/'
DEMO_BY_ICUSTAY_ID = DATA_ROOT + 'samples_demographics/'  # demographics, not the DEMO flag
ICD9_BY_ICUSTAY_ID = DATA_ROOT + 'samples_icd9/'


# Input tables and resources (independent of DEMO).
# NOTE(review): READMISSION_FILENAME lives under 'data/' even in demo mode --
# confirm that this is intended.
CHARTEVENTS_FILENAME = 'mimic-iii/CHARTEVENTS.csv'
DIAGNOSES_FILENAME = 'mimic-iii/DIAGNOSES_ICD.csv'
READMISSION_FILENAME = 'data/readmission.csv'
EMBEDDED_FILENAME = 'resources/embedded.parquet'

PARQUET_EXT = '.parquet'
# File that receives the list of failed stays when WRITE_ERRORS_TO_FILE is set.
STEP_FAIL_FILE = 'Step8_failed.txt'

In [4]:
# Columns to load from READMISSION_FILENAME (passed as `usecols`).
readmission_columns = ['ICUSTAY_ID', 'HADM_ID']
# Columns to load from DIAGNOSES_FILENAME (passed as `usecols`).
diagnoses_columns = ['HADM_ID', 'ICD9_CODE']

In [5]:
# Load the ICUSTAY_ID -> HADM_ID mapping, ordered by ICU stay.
readmission_sorted = (
    pd.read_csv(READMISSION_FILENAME, usecols=readmission_columns)
    .sort_values(['ICUSTAY_ID'])
    .reset_index(drop=True)
)
print(readmission_sorted.dtypes)
# Index by ICUSTAY_ID so a stay's HADM_ID can be fetched with a label lookup.
readmission = readmission_sorted.set_index('ICUSTAY_ID')
readmission.head(1)

HADM_ID       int64
ICUSTAY_ID    int64
dtype: object


Unnamed: 0_level_0,HADM_ID
ICUSTAY_ID,Unnamed: 1_level_1
200001,152234


In [6]:
# Spot check: label-based lookup of the HADM_ID recorded for ICU stay 200001.
readmission.loc[200001, 'HADM_ID']

152234

In [7]:
# Load the (HADM_ID, ICD9_CODE) pairs, ordered by admission.
# ICD9_CODE is read explicitly as text: the codes are identifiers, and letting
# read_csv infer the dtype risks all-digit codes being parsed as integers
# (dropping leading zeros or yielding a mixed-type column), which would break
# the index join against the embedding table.
diagnoses = (
    pd.read_csv(
        DIAGNOSES_FILENAME,
        usecols=diagnoses_columns,
        dtype={'ICD9_CODE': str},
    )
    .sort_values(['HADM_ID'])
    .reset_index(drop=True)
)
print(diagnoses.dtypes)
diagnoses.head(3)

HADM_ID       int64
ICD9_CODE    object
dtype: object


Unnamed: 0,HADM_ID,ICD9_CODE
0,100001,V1351
1,100001,25013
2,100001,3371


In [8]:
# Load the embedding table. The processing loop joins it on its index against
# ICD9_CODE values, so the index is expected to hold ICD-9 codes.
embedded_df = pd.read_parquet(EMBEDDED_FILENAME)
embedded_df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
4011,0.201225,0.03023,0.051014,0.016721,0.012499,0.031637,-0.075032,-0.055207,-0.135877,-0.044585,...,-0.022324,0.011847,-0.033494,-0.010834,-0.172425,0.066614,-0.144115,0.102333,-0.00311,0.064532


In [9]:
# Single-row, all-zero embedding written for stays whose admission has no
# diagnosis rows. Its width, column labels and float dtype are taken from
# `embedded_df` (instead of a hard-coded 300 integer zeros) so these files
# share the layout of the summed embeddings produced for stays with diagnoses.
empty_embedded_df = pd.DataFrame(
    np.zeros((1, embedded_df.shape[1])),
    columns=embedded_df.columns,
)


In [10]:
# One entry per file in the numerical-samples directory; each entry name is
# later parsed for its ICUSTAY_ID and reused as the output file name.
chartevents_dir_list = os.listdir(NUMERICAL_BY_ICUSTAY_ID)
len(chartevents_dir_list)

48075

In [11]:
# For every ICU stay: look up its admission, collect the admission's distinct
# ICD-9 codes, sum their embedding vectors and write the one-row result to
# ICD9_BY_ICUSTAY_ID under the stay's original file name.

# Create the output directory up front; if it is missing, every to_parquet
# call fails and all stays end up in `failed`.
os.makedirs(ICD9_BY_ICUSTAY_ID, exist_ok=True)

# Group the diagnoses once (HADM_ID -> row positions) so each stay costs a
# dictionary lookup instead of a boolean scan over the whole table.
diagnosis_rows_by_hadm = diagnoses.groupby('HADM_ID').indices

# Failures are recorded as two consecutive entries: file name, then message.
failed = []
for stay in tqdm(chartevents_dir_list, total=len(chartevents_dir_list)):
    try:
        # The ICUSTAY_ID is the digits contained in the file name.
        icustay_id = int(re.sub(r'[^0-9]*','',stay))
        hadm_id = readmission.at[icustay_id,'HADM_ID']

        row_positions = diagnosis_rows_by_hadm.get(hadm_id)
        if row_positions is not None:
            icd_df = diagnoses.iloc[row_positions]
            icd_df = icd_df.drop_duplicates(subset='ICD9_CODE')
            icd_df = icd_df.set_index('ICD9_CODE')
            # Left join: codes without an embedding become NaN rows, which
            # sum() skips.
            embedded_icd9_codes = pd.merge(icd_df,embedded_df,'left',left_index=True,right_index=True)
            embedded_icd9_codes = embedded_icd9_codes.drop('HADM_ID',axis=1)
            embedded_icd9_codes_summed = embedded_icd9_codes.sum(axis=0).to_frame().T.reset_index(drop=True)
        else:
            # No diagnoses for this admission: fall back to the zero vector.
            embedded_icd9_codes_summed = empty_embedded_df

        embedded_icd9_codes_summed.to_parquet(ICD9_BY_ICUSTAY_ID+stay)

    except Exception as e:
        # Best effort: remember the failing stay and its error, keep going.
        failed.append(stay)
        failed.append(str(e))

  0%|          | 0/48075 [00:00<?, ?it/s]

In [12]:
# Show the collected failures; entries alternate between a stay's file name
# and the message of the exception raised for it.
failed

[]

In [13]:
# Persist the failure log, one entry per line, when enabled in the config.
if WRITE_ERRORS_TO_FILE:
    with open(STEP_FAIL_FILE, 'w') as report:
        report.writelines(entry + '\n' for entry in failed)