In [1]:
import dill
import numpy as np
import pandas as pd

## Common Acronymns Defined

- "DDI" -> Drug-Drug Interaction
- "NDC" -> National Drug Code [FDA link](https://fda.report/NDC)
- "ATC" -> Anatomical Therapeutic Chemical [WHO link](https://www.whocc.no/atc/structure_and_principles/)
- "ICD9" -> International Classification of Diseases [CDC Link](https://www.cdc.gov/nchs/icd/icd9cm.htm)

### Column Names
- "PRO_CODE" -> Process ICD9 code from mimiciii PROCEDURES_ICD.csv
- "HADM" -> Hospital Admission ID 

In [2]:
data_dir = '../../data/'
data_path = data_dir + 'records_final.pkl'
data = dill.load(open(data_path, 'rb'))
print('Number of rows:', len(data))

Number of rows: 6349


In [3]:
data[0]


[[[0, 1, 2, 3, 4, 5, 6, 7],
  [0, 1, 2],
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]],
 [[8, 9, 10, 7],
  [3, 4, 1],
  [0, 1, 2, 3, 5, 4, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17]]]

## Sample Patient Record

In [4]:
patient_records = data[-1]
print('Number of visits: ', len(patient_records))
print('First Visit:')
display(patient_records[0])

Number of visits:  3
First Visit:


[[15, 464, 13, 54, 95, 18, 1359, 69, 309, 61, 178, 172, 296],
 [209],
 [1, 4, 7, 12, 6, 3, 2, 21, 61, 26, 13, 11, 103, 83]]

In [5]:
print('Encoded Visit Diagnosis Codes:')
display(patient_records[0][0])
print('Encoded Visit Procedure Codes:')
display(patient_records[0][1])
print('Encoded Visit Medical Codes:')
display(patient_records[0][2])

Encoded Visit Diagnosis Codes:


[15, 464, 13, 54, 95, 18, 1359, 69, 309, 61, 178, 172, 296]

Encoded Visit Procedure Codes:


[209]

Encoded Visit Medical Codes:


[1, 4, 7, 12, 6, 3, 2, 21, 61, 26, 13, 11, 103, 83]

In [6]:
# I received an error when loading the original `voc_final.pkl` file with dill. So I created separate csv files
# for each voc mapping.
diag_voc = pd.read_csv(data_dir + 'diag_voc.csv', index_col='idx').word.to_dict()
pro_voc = pd.read_csv(data_dir +'pro_voc.csv', index_col='idx').word.to_dict()
med_voc = pd.read_csv(data_dir +'med_voc.csv', index_col='idx').word.to_dict()


print('Original Visit Diagnosis Codes:')
display([diag_voc[i] for i in patient_records[0][0]])
print('Original Visit Procedure Codes:')
display([pro_voc[i] for i in patient_records[0][1]])
print('Original Visit Medical Codes:')
display([med_voc[i] for i in patient_records[0][2]])

Original Visit Diagnosis Codes:


['4280',
 '42823',
 '5849',
 '4254',
 '2763',
 '42731',
 '78729',
 '53081',
 'V422',
 'V5861',
 '4168',
 '56400',
 '2768']

Original Visit Procedure Codes:


[3721]

Original Visit Medical Codes:


['A01A',
 'B05C',
 'C01C',
 'C03C',
 'A12C',
 'A06A',
 'A02B',
 'B01A',
 'C01A',
 'C01B',
 'A12B',
 'C07A',
 'C03B',
 'C09C']

In [7]:
ddi_adj_path = data_dir + 'ddi_A_final.pkl'
ddi_adj = dill.load(open(ddi_adj_path, 'rb'))
ddi_adj

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
data_final_path = data_dir + '/data_final.pkl'
data_final = dill.load(open(data_final_path, 'rb')) 
data_final[data_final.SUBJECT_ID == data_final.SUBJECT_ID.unique()[-1]]

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE,NDC,PRO_CODE,NDC_Len
14992,99982,112748,"[4280, 42823, 5849, 4254, 2763, 42731, 78729, ...","[A01A, B05C, C01C, C03C, A12C, A06A, A02B, B01...",[3721],14
14993,99982,151454,"[42823, 4254, 2875, 42731, 3970, 5303, 4280, V...","[N02B, A01A, A06A, B05C, A12A, A12C, C01C, N01...","[3527, 3961]",20
14994,99982,183791,"[5849, 42731, 4280, 2875, 53081, 56400, 78720,...","[A01A, A06A, B05C, N02B, C03C, A03B, B01A, A12...","[3721, 3897, 8964]",21


In [9]:
data_final.sort_values(['SUBJECT_ID', 'HADM_ID'])

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE,NDC,PRO_CODE,NDC_Len
0,17,161087,"[4239, 5119, 78551, 4589, 311, 7220, 71946, 2724]","[N02B, A01A, A02B, A06A, B05C, A12A, A12C, C01...","[3731, 8872, 3893]",15
1,17,194023,"[7455, 45829, V1259, 2724]","[N02B, A01A, A02B, A06A, A12A, B05C, A12C, C01...","[3571, 3961, 8872]",16
2,21,109451,"[41071, 78551, 5781, 5849, 40391, 4280, 4592, ...","[A06A, B05C, C07A, A12B, C03C, A12A, A02A, J01...","[0066, 3761, 3950, 3606, 0042, 0047, 3895, 399...",23
3,21,111970,"[0388, 78552, 40391, 42731, 70709, 5119, 6823,...","[N02B, A06A, B05C, A12C, A07A, A02A, B01A, N06...","[3995, 8961, 0014]",19
4,23,124321,"[2252, 3485, 78039, 4241, 4019, 2720, 2724, V4...","[B05C, A07A, C07A, A06A, N02B, C02D, B01A, A02...",[0151],17
...,...,...,...,...,...,...
14990,99923,164914,"[45829, 4532, 2761, 5723, 4561, 45621, 5849, 7...","[N02B, A02A, B01A, A06A, J01M, H01C, A07A, C01C]","[5491, 4513]",8
14991,99923,192053,"[5712, 5856, 5724, 40391, 9974, 5601, 30393, V...","[A06A, A12A, A12C, N01A, C07A, C03C, B01A, A02...","[5059, 504, 5569, 0093]",24
14992,99982,112748,"[4280, 42823, 5849, 4254, 2763, 42731, 78729, ...","[A01A, B05C, C01C, C03C, A12C, A06A, A02B, B01...",[3721],14
14993,99982,151454,"[42823, 4254, 2875, 42731, 3970, 5303, 4280, V...","[N02B, A01A, A06A, B05C, A12A, A12C, C01C, N01...","[3527, 3961]",20


In [10]:
len(data_final.SUBJECT_ID.unique())

6349

In [11]:
print('Patients with multiple visits:', (data_final.groupby('SUBJECT_ID').HADM_ID.nunique() > 1).sum())

Patients with multiple visits: 5421


In [15]:
data

[[[[0, 1, 2, 3, 4, 5, 6, 7],
   [0, 1, 2],
   [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]],
  [[8, 9, 10, 7],
   [3, 4, 1],
   [0, 1, 2, 3, 5, 4, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17]]],
 [[[11, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27],
   [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
   [3,
    4,
    11,
    13,
    12,
    5,
    18,
    19,
    20,
    21,
    22,
    23,
    24,
    17,
    1,
    25,
    2,
    15,
    26,
    27,
    6,
    28,
    29]],
  [[28,
    29,
    14,
    18,
    30,
    1,
    31,
    32,
    33,
    34,
    35,
    36,
    21,
    37,
    27,
    25,
    20,
    38,
    39,
    40,
    41],
   [12, 17, 18],
   [0, 3, 4, 6, 8, 18, 21, 17, 1, 2, 26, 25, 30, 31, 32, 7, 33, 34, 16]]],
 [[[42, 43, 44, 45, 46, 24, 7, 47, 48, 49],
   [19],
   [4, 8, 11, 3, 0, 15, 21, 2, 6, 30, 35, 1, 36, 28, 25, 37, 13]],
  [[20, 50, 45, 51, 7, 46, 52, 53],
   [20, 21, 4, 1, 15, 22, 23],
   [0, 1, 2, 3, 5, 4, 6, 7, 10, 11, 12, 13, 14, 15, 26, 3