In [55]:
import pandas as pd
import numpy as np
import os
import timeit
from tqdm.notebook import trange, tqdm
import re
import matplotlib.pyplot as plt

In [56]:
DEMO = False
WRITE_ERRORS_TO_FILE = True

In [57]:

if DEMO:
    CHARTEVENTS_BY_ICUSTAY_ID = 'data_demo/parquet/'
    REDUCED_CE_BY_ICUSTAY_ID = 'data_demo/samples/'
    NUMERICAL_BY_ICUSTAY_ID = 'data_demo/samples_numerical/'
    STATB_BY_ICUSTAY_ID = 'data_demo/samples_statB/'
    STATB_CSV_BY_ICUSTAY_ID = 'data_demo/samples_statB_csv/'
else:
    CHARTEVENTS_BY_ICUSTAY_ID = 'data/parquet/'
    REDUCED_CE_BY_ICUSTAY_ID = 'data/samples/'
    NUMERICAL_BY_ICUSTAY_ID = 'data/samples_numerical/'
    STATB_BY_ICUSTAY_ID = 'data/samples_statB/'
    DEMO_BY_ICUSTAY_ID = 'data/samples_demographics/'


CHARTEVENTS_FILENAME = 'mimic-iii/CHARTEVENTS.csv'
READMISSION_FILENAME = 'data/readmission.csv'
PARQUET_EXT = '.parquet'
STEP_FAIL_FILE = 'Step7_failed.txt'

In [58]:
gender_map = {'__missing__': np.NaN, 'F': '0', 'M': '1'}
insurance_map = {'__missing__': np.NaN,'Medicare':'0', 'Private':'1', 'Medicaid':'2', 'Government':'3', 'Self Pay':'4'}
race_map = {'__missing__': np.NaN, 'AMERICAN INDIAN/ALASKA NATIVE': '4', 'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE': '4', 'ASIAN': '0', 'ASIAN - ASIAN INDIAN': '0', 'ASIAN - CAMBODIAN': '0', 'ASIAN - CHINESE': '0', 'ASIAN - FILIPINO': '0', 'ASIAN - JAPANESE': '0', 'ASIAN - KOREAN': '0', 'ASIAN - OTHER': '0', 'ASIAN - THAI': '0', 'ASIAN - VIETNAMESE': '0', 'BLACK/AFRICAN': '1', 'BLACK/AFRICAN AMERICAN': '1', 'BLACK/CAPE VERDEAN': '1', 'BLACK/HAITIAN': '1', 'CARIBBEAN ISLAND': '4', 'HISPANIC OR LATINO': '2', 'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)': '2', 'HISPANIC/LATINO - COLOMBIAN': '2', 'HISPANIC/LATINO - CUBAN': '2', 'HISPANIC/LATINO - DOMINICAN': '2', 'HISPANIC/LATINO - GUATEMALAN': '2', 'HISPANIC/LATINO - HONDURAN': '2', 'HISPANIC/LATINO - MEXICAN': '2', 'HISPANIC/LATINO - PUERTO RICAN': '2', 'HISPANIC/LATINO - SALVADORAN': '2', 'MIDDLE EASTERN': '4', 'MULTI RACE ETHNICITY': '4', 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': '4', 'OTHER': '4', 'PATIENT DECLINED TO ANSWER': '5', 'PORTUGUESE': '4', 'SOUTH AMERICAN': '4', 'UNABLE TO OBTAIN': '5', 'UNKNOWN/NOT SPECIFIED': '5', 'WHITE': '3', 'WHITE - BRAZILIAN': '3', 'WHITE - EASTERN EUROPEAN': '3', 'WHITE - OTHER EUROPEAN': '3', 'WHITE - RUSSIAN': '3'}

In [59]:
columns = ['ICUSTAY_ID', 'GENDER', 'AGE', 'ETHNICITY', 'INSURANCE']

In [60]:
readmission = pd.read_csv(READMISSION_FILENAME,usecols=columns).sort_values(['ICUSTAY_ID']).reset_index(drop=True)
print(readmission.dtypes)
readmission = readmission.set_index('ICUSTAY_ID')
readmission.head(1)

ICUSTAY_ID     int64
INSURANCE     object
ETHNICITY     object
GENDER        object
AGE            int64
dtype: object


Unnamed: 0_level_0,INSURANCE,ETHNICITY,GENDER,AGE
ICUSTAY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
200001,Medicare,ASIAN - ASIAN INDIAN,F,61


In [61]:
chartevents_dir_list = os.listdir(NUMERICAL_BY_ICUSTAY_ID)
len(chartevents_dir_list)

48075

In [62]:
failed = []
for stay in tqdm(chartevents_dir_list, total=len(chartevents_dir_list)):
    try:
        icustay_id = int(re.sub(r'[^0-9]*','',stay))
        demo_df = readmission.loc[[icustay_id]]
        #print(demo_df.head())
        demo_df['GENDER'] = pd.to_numeric(demo_df['GENDER'].map(gender_map),errors='coerce')
        demo_df['INSURANCE'] = pd.to_numeric(demo_df['INSURANCE'].map(insurance_map),errors='coerce')
        demo_df['ETHNICITY'] = pd.to_numeric(demo_df['ETHNICITY'].map(race_map),errors='coerce')
        demo_df['AGE'] = pd.to_numeric(demo_df['AGE'],errors='coerce')
        # demo_df = pd.to_numeric(demo_df,errors='coerce')

        demo_df.to_parquet(DEMO_BY_ICUSTAY_ID+stay)
        # break
    except Exception as e:
        failed.append(stay)
        failed.append(str(e))
        continue

  0%|          | 0/48075 [00:00<?, ?it/s]

In [63]:
failed

[]

In [64]:
if WRITE_ERRORS_TO_FILE:
    with open(STEP_FAIL_FILE, 'w') as f:
        for line in failed:
            f.write(line)
            f.write('\n')

In [65]:
demo_df.head()

Unnamed: 0_level_0,INSURANCE,ETHNICITY,GENDER,AGE
ICUSTAY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
254384,0,3,1,72
