In [1]:
import csv
import numpy as np
import pandas as pd
import random

# Define data path
DATA_PATH = './mimic-iv-2.2/hosp/unzipped'

### Work with MIMIC data as Pandas Dataframes

In [2]:
d_icd_diagnoses_df = pd.read_csv(DATA_PATH + '/d_icd_diagnoses.csv', sep=',', header='infer')
print(d_icd_diagnoses_df.head())

  icd_code  icd_version                             long_title
0     0010            9         Cholera due to vibrio cholerae
1     0011            9  Cholera due to vibrio cholerae el tor
2     0019            9                   Cholera, unspecified
3     0020            9                          Typhoid fever
4     0021            9                    Paratyphoid fever A


In [3]:
# Find values with 'heart failure' in title
hf_df = d_icd_diagnoses_df[d_icd_diagnoses_df['long_title'].str.contains("heart failure")]

# Filter only values with icd_version 9
hf_df = hf_df.loc[hf_df['icd_version'] == 9]

# Find values in remaining list that contain 'without heart failure'
wohf_df = hf_df[hf_df['long_title'].str.contains("without heart failure")]

# Remove values w/ 'without heart failure'
res_df = pd.concat([hf_df,wohf_df]).drop_duplicates(keep=False)

print(res_df.shape, res_df.head())

hf_icd_codes = res_df['icd_code'].to_list()

print(hf_icd_codes)

(24, 3)      icd_code  icd_version                                         long_title
4647    39891            9               Rheumatic heart failure (congestive)
4653    40201            9  Malignant hypertensive heart disease with hear...
4655    40211            9  Benign hypertensive heart disease with heart f...
4657    40291            9  Unspecified hypertensive heart disease with he...
4665    40401            9  Hypertensive heart and chronic kidney disease,...
['39891', '40201', '40211', '40291', '40401', '40403', '40411', '40413', '40491', '40493', '4280', '4281', '42820', '42821', '42822', '42823', '42830', '42831', '42832', '42833', '42840', '42841', '42842', '42843']


### Filter patient_df and diagnoses_df into list data used for RNN

where
- `pids`: contains the patient ids
- `vids`: contains a list of visit ids for each patient
- `hfs`: contains the heart failure label (0: normal, 1: heart failure) for each patient
- `seqs`: contains a list of visit (in ICD9 codes) for each patient
- `types`: contains the map from ICD9 codes to ICD-9 labels
- `rtypes`: contains the map from ICD9 labels to ICD9 codes

In [4]:
diagnoses_df = pd.read_csv(DATA_PATH + '/diagnoses_icd.csv', sep=',', header='infer')
print(diagnoses_df.head())
print(diagnoses_df.shape)

   subject_id   hadm_id  seq_num icd_code  icd_version
0    10000032  22595853        1     5723            9
1    10000032  22595853        2    78959            9
2    10000032  22595853        3     5715            9
3    10000032  22595853        4    07070            9
4    10000032  22595853        5      496            9
(4756326, 5)


In [5]:
# Filter only diagnoses coded in ICD-9
diagnoses_df = diagnoses_df.loc[diagnoses_df['icd_version'] == 9]
print(diagnoses_df.head(5))
print(diagnoses_df.shape)

   subject_id   hadm_id  seq_num icd_code  icd_version
0    10000032  22595853        1     5723            9
1    10000032  22595853        2    78959            9
2    10000032  22595853        3     5715            9
3    10000032  22595853        4    07070            9
4    10000032  22595853        5      496            9
(2766877, 5)


#### hfs

In [6]:
hfs_pos = diagnoses_df[diagnoses_df['icd_code'].isin(hf_icd_codes)]
print(hfs_pos.head(10))

     subject_id   hadm_id  seq_num icd_code  icd_version
252    10000980  24947999        1    42823            9
259    10000980  24947999        8     4280            9
279    10000980  25242409        7    42832            9
280    10000980  25242409        8     4280            9
330    10000980  26913865        2    42823            9
334    10000980  26913865        6     4280            9
344    10000980  29654838        1    42833            9
353    10000980  29654838       10     4280            9
631    10001877  21320596        1    42833            9
632    10001877  21320596        2     4280            9


In [7]:
hfs_pos_ids = hfs_pos['subject_id'].drop_duplicates().to_list()

print(f'Number of heart failure patients: {len(hfs_pos_ids)}')

Number of heart failure patients: 14790


In [8]:
ratio_hfs_patients = 0.4

num_hfs_neg_patients = int(len(hfs_pos_ids) / ratio_hfs_patients * (1 - ratio_hfs_patients))

hfs_neg_ids = diagnoses_df[~diagnoses_df['subject_id'].isin(hfs_pos_ids)]
hfs_neg_ids = hfs_neg_ids['subject_id'].drop_duplicates().to_list()

print(f'Number of non heart failure patients (ICD-9) in database: {len(hfs_neg_ids)}')

hfs_neg_ids = random.sample(hfs_neg_ids, num_hfs_neg_patients)

print(f'Subsampled patient size: {len(hfs_neg_ids)}')

Number of non heart failure patients (ICD-9) in database: 109760
Subsampled patient size: 22185


#### pids

In [9]:
pids = hfs_pos_ids + hfs_neg_ids
pids.sort()

In [10]:
print(pids[0])

10000635


In [11]:
hfs = [0 for _ in range(len(pids))]

hfs_pos_set = set(hfs_pos_ids)

for i, pid in enumerate(pids):
    if pid in hfs_pos_set:
        hfs[i] = 1

In [12]:
# Print number of patients stored as having heart rate failure to validate
print(f'Number of heart failure patients: {sum(hfs)} (out of {len(hfs)} total)')

Number of heart failure patients: 14790 (out of 36975 total)


In [13]:
pid_index = {}

for i, pid in enumerate(pids):
    pid_index[pid] = i

#### Filter non-sampled patients from diagnoses_df

In [14]:
diagnoses_df = diagnoses_df[diagnoses_df['subject_id'].isin(pids)]

print(diagnoses_df.shape)

(1274367, 5)


#### vids

In [15]:
vids = [[] for _ in range(len(pids))]

for index, row in diagnoses_df.iterrows():
    vid_index = pid_index[row['subject_id']]
    if row['hadm_id'] not in vids[vid_index]:
        vids[vid_index].append(row['hadm_id'])

print(f'vids[0]: {vids[0]}')
print(f'vids[1]: {vids[1]}')
print(f'vids[2]: {vids[2]}')

vids[0]: [26134563]
vids[1]: [20032235, 21086876, 28289260]
vids[2]: [24947999, 25242409, 26913865, 29654838]


In [16]:
print(diagnoses_df.head(10))

     subject_id   hadm_id  seq_num icd_code  icd_version
86     10000635  26134563        1    42789            9
87     10000635  26134563        2    25000            9
88     10000635  26134563        3     4019            9
113    10000826  20032235        1     5712            9
114    10000826  20032235        2      486            9
115    10000826  20032235        3    78959            9
116    10000826  20032235        4     5723            9
117    10000826  20032235        5     5990            9
118    10000826  20032235        6     2639            9
119    10000826  20032235        7     2761            9


#### types/rtypes
We must create the types/rtypes map(s) first to convert icd_9 values into integers to store in seqs

In [17]:
# Create list of all unique icd_code values.

icd_codes = sorted(diagnoses_df.icd_code.drop_duplicates().to_list())

types = {}
rtypes = {}

for i, code in enumerate(icd_codes):
    types[code] = i
    rtypes[i] = code

#### seqs

In [18]:
seqs = [[[] for _ in range(len(vids[i]))] for i in range(len(pids))]

In [19]:
for pid in range(len(seqs)):
    for visit_idx in range(len(seqs[pid])):
        visit_id = vids[pid][visit_idx]
        single_visit_df = diagnoses_df.loc[diagnoses_df['hadm_id'] == visit_id]
        icd_codes = single_visit_df.icd_code.to_list()
        
        icd_labels = [types[code] for code in icd_codes]
        
        seqs[pid][visit_idx] = icd_labels

In [20]:
print(seqs[1][0])

pat1_visit1_codes = [rtypes[icd_label] for icd_label in seqs[1][0]]
print(pat1_visit1_codes)

[3168, 2751, 4921, 3180, 3347, 1027, 1095, 2806, 3167, 1106, 1436, 3111, 1160, 1389]
['5712', '486', '78959', '5723', '5990', '2639', '2761', '51189', '5711', '2768', '30391', '56409', '2819', '30000']


#### Export Dataset to Pickle Files

In [21]:
import pickle

In [22]:
with open('./mimic-iv-2.2/hosp/high_hfs_pickle_files/pids.pickle', 'wb') as handle:
    pickle.dump(pids, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./mimic-iv-2.2/hosp/high_hfs_pickle_files/vids.pickle', 'wb') as handle:
    pickle.dump(vids, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./mimic-iv-2.2/hosp/high_hfs_pickle_files/hfs.pickle', 'wb') as handle:
    pickle.dump(hfs, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./mimic-iv-2.2/hosp/high_hfs_pickle_files/seqs.pickle', 'wb') as handle:
    pickle.dump(seqs, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./mimic-iv-2.2/hosp/high_hfs_pickle_files/types.pickle', 'wb') as handle:
    pickle.dump(types, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./mimic-iv-2.2/hosp/high_hfs_pickle_files/rtypes.pickle', 'wb') as handle:
    pickle.dump(rtypes, handle, protocol=pickle.HIGHEST_PROTOCOL)