In [1]:
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import copy

import itertools
import collections
import time
import random


# from src.utils import PickleUtils
# from stellargraph import IndexedArray, StellarGraph

In [2]:
# Raw patient table, one row per unit stay; kept untouched so later cells can
# re-derive their working copies from it.
pat_table_orig = pd.read_csv(filepath_or_buffer='../saved_data/patient.csv')

In [3]:
# Raw admission-diagnosis table as exported to CSV.
admissionDx_table_orig = pd.read_csv(filepath_or_buffer='../saved_data/admissionDx.csv')

In [4]:
# Raw diagnosis table as exported to CSV.
diag_table_orig = pd.read_csv(filepath_or_buffer='../saved_data/diagnosis.csv')

In [5]:
# Raw treatment table as exported to CSV.
treatment_table_orig = pd.read_csv(filepath_or_buffer='../saved_data/treatment.csv')

In [6]:
# Working patient table: keep only the identifier, demographic and
# stay-level columns used downstream, then drop exact duplicate rows.
pat_table = pat_table_orig.loc[
    :,
    [
        'uniquepid',
        'patientunitstayid',
        'patienthealthsystemstayid',
        'gender',
        'age',
        'ethnicity',
        'hospitaladmitoffset',
        'unitvisitnumber',
        'unitdischargestatus',
    ],
].drop_duplicates()

In [7]:
# Admission diagnoses, renamed to the shared event schema
# (patientunitstayid, offset, 'PPD name') used for every event source.
admissionDx_table = (
    admissionDx_table_orig
    .loc[:, ['patientunitstayid', 'admitdxenteredoffset', 'admitdxpath']]
    .rename(columns={'admitdxenteredoffset': 'offset', 'admitdxpath': 'PPD name'})
)

# Seed the per-stay event table ("medical journey") with the admission
# diagnoses of stays that are present in pat_table, ordered by stay id.
med_jny = (
    admissionDx_table
    .loc[admissionDx_table['patientunitstayid'].isin(pat_table['patientunitstayid'].to_list())]
    .sort_values(by='patientunitstayid')
)

In [8]:
# Diagnoses, renamed to the shared event schema.
diag_table = (
    diag_table_orig
    .loc[:, ['patientunitstayid', 'diagnosisoffset', 'diagnosisstring']]
    .rename(columns={'diagnosisoffset': 'offset', 'diagnosisstring': 'PPD name'})
)

# Append the diagnoses to the journey and order events by stay, then offset.
med_jny = pd.concat([med_jny, diag_table]).sort_values(by=['patientunitstayid', 'offset'])

In [9]:
# Treatments, renamed to the shared event schema.
treatment_table = (
    treatment_table_orig
    .loc[:, ['patientunitstayid', 'treatmentoffset', 'treatmentstring']]
    .rename(columns={'treatmentoffset': 'offset', 'treatmentstring': 'PPD name'})
)

# Append the treatments to the journey and re-order events by stay, then offset.
med_jny = pd.concat([med_jny, treatment_table]).sort_values(by=['patientunitstayid', 'offset'])

In [10]:
# Readmission label: within each health-system stay, the unit stay with the
# highest unitvisitnumber gets readmission = 0; every other unit stay of that
# health-system stay is left unmatched by the merge and is filled with 1.
last_units = (
    pat_table
    .groupby('patienthealthsystemstayid', as_index=False)['unitvisitnumber']
    .max()
    .assign(readmission=0)
)
pat_table = pat_table.merge(
    last_units,
    how='outer',
    on=['patienthealthsystemstayid', 'unitvisitnumber'],
)
pat_table['readmission'] = pat_table['readmission'].fillna(1)

In [11]:
def remove_adj_duplicate(x):
    """Collapse runs of consecutive rows that share the same 'PPD name'.

    Only the first row of each run is kept; a name that reappears after a
    different name is kept again. Row order and the original index labels
    are preserved.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a 'PPD name' column, already in the desired row order.

    Returns
    -------
    pandas.DataFrame
        The rows of ``x`` whose 'PPD name' differs from the row directly above.
    """
    names = x['PPD name'].to_list()
    keep = []
    for pos, name in enumerate(names):
        # The first row is always kept; later rows only when the name changes.
        if pos == 0 or name != names[pos - 1]:
            keep.append(pos)
    return x.iloc[keep, :]

In [12]:
# Within each unit stay, drop events that repeat the immediately preceding
# event name, then flatten back to a plain positional index.
med_jny_dedup = (
    med_jny
    .groupby('patientunitstayid')
    .apply(remove_adj_duplicate)
    .reset_index(drop=True)
)

In [13]:
# Number of de-duplicated events per unit stay. The rename covers pandas
# versions in which reset_index() labels the key column 'index'.
pat_rec_cnt = (
    med_jny_dedup['patientunitstayid']
    .value_counts()
    .reset_index(name='count')
    .rename(columns={'index': 'patientunitstayid'})
)

# Keep only stays with at least 5 events, in both the journey and the
# patient table, and order the patient table by hospital stay and visit number.
pat_rec_cnt = pat_rec_cnt.loc[pat_rec_cnt['count'].ge(5)]
med_jny_dedup = med_jny_dedup.loc[
    med_jny_dedup['patientunitstayid'].isin(pat_rec_cnt['patientunitstayid'].to_list())
]
pat_table = (
    pat_table
    .loc[pat_table['patientunitstayid'].isin(pat_rec_cnt['patientunitstayid'].to_list())]
    .sort_values(by=['patienthealthsystemstayid', 'unitvisitnumber'])
)

In [14]:
# Frequency table of event names. value_counts() orders it from most to
# least frequent, so svc_id 0 is the most common event name.
svc_cnts = (
    med_jny_dedup['PPD name']
    .value_counts()
    .reset_index(name='count')
    .rename(columns={'index': 'PPD name'})
)
svc_cnts['svc_id'] = list(range(len(svc_cnts)))

# Attach the integer svc_id to every event; the count column is not needed
# on the event rows.
med_jny_dedup = (
    med_jny_dedup
    .merge(svc_cnts, how='inner', on=['PPD name'])
    .drop(columns='count')
)

In [15]:
# Persist the preprocessing outputs: the name -> svc_id dictionary (with
# counts), the filtered patient table, and the de-duplicated journey.
svc_cnts.to_csv(path_or_buf='../saved_data/svc_dict.csv', index=False)
pat_table.to_parquet('../saved_data/pat_table.parquet', index=False)
med_jny_dedup.to_parquet('../saved_data/med_jny_dedup.parquet', index=False)

In [16]:
# Reload the de-duplicated journey and order it by stay, offset and event
# name; the slicing done further down relies on rows being grouped by stay.
data = (
    pd.read_parquet('../saved_data/med_jny_dedup.parquet')
    .sort_values(by=['patientunitstayid', 'offset', 'PPD name'])
)


In [17]:
# Reload the event-name dictionary (PPD name, count, svc_id) and the
# filtered patient table written by the preprocessing cells.
svc_dict = pd.read_csv(filepath_or_buffer='../saved_data/svc_dict.csv')
pat_table = pd.read_parquet('../saved_data/pat_table.parquet')

In [18]:
# "Lean" subset: keep event names seen at least 100 times, the events that
# use them, and the patients that still have at least one such event.
svc_dict_lean = svc_dict.loc[svc_dict['count'].ge(100)]
data_lean = data.loc[data['svc_id'].isin(svc_dict_lean['svc_id'].to_list())]
pat_table_lean = pat_table.loc[
    pat_table['patientunitstayid'].isin(data_lean['patientunitstayid'].unique())
]

# Persist both lean tables.
data_lean.to_parquet('../saved_data/med_jny_dedup_lean.parquet', index=False)
pat_table_lean.to_parquet('../saved_data/pat_table_lean.parquet', index=False)

In [19]:
# Per-stay row boundaries into data_lean_np: rows
# pat_ind_split[i]:pat_ind_split[i + 1] hold the events of the i-th stay.
# This only works if data_lean is already grouped by stay in ascending id
# order (groupby sorts its keys), so check that up front.
assert data_lean['patientunitstayid'].is_monotonic_increasing, \
    'data_lean must be sorted by patientunitstayid before slicing by cumulative counts'

# size() counts every row of a stay. count() on the 'offset' column would
# skip rows whose offset is NaN, silently shifting every later boundary.
pat_enc_cnts = data_lean.groupby('patientunitstayid').size().reset_index(name='count')
pat_ind_split = np.cumsum(pat_enc_cnts['count'].to_numpy())
pat_ind_split = np.concatenate(([0], pat_ind_split))
assert pat_ind_split[-1] == len(data_lean), \
    'per-stay row counts do not cover every row of data_lean'

# Columns: 0 = patientunitstayid, 1 = offset, 2 = svc_id.
data_lean_np = data_lean[['patientunitstayid','offset','svc_id']].to_numpy()

In [20]:
# Width of a co-occurrence window, in the same units as the 'offset' column
# (noted by the original author as one hour).
win_len = 60

# Square co-occurrence matrix with one row and one column per lean event
# name; filled in by the accumulation loop in the next cell.
adj_mat = np.zeros(shape=(len(svc_dict_lean),) * 2, dtype=int)

In [21]:
# NOTE: this import is kept in this cell because the edgelist-export cell
# further down refers to the bare name `combinations` as well.
from itertools import combinations


# Accumulate symmetric co-occurrence counts: for every stay, events are
# bucketed into windows of `win_len` offset units, and every unordered pair of
# events that falls into the same window adds 1 to both adj_mat[a, b] and
# adj_mat[b, a].
for i in tqdm(range(len(pat_enc_cnts))):

    # Rows of the i-th stay (columns: patientunitstayid, offset, svc_id).
    pat_journey = data_lean_np[pat_ind_split[i]:pat_ind_split[i + 1]]
    ii = np.floor_divide(pat_journey[:, 1], win_len)
    ii_uni = np.unique(ii)
    for k in ii_uni:
        serv_win = pat_journey[ii == k, 2]
        # Fewer than two events cannot form a pair. An empty selection would
        # also make the 2-row indexing below fail, so skip it as well.
        if len(serv_win) < 2:
            continue
        indx = np.array(list(combinations(serv_win, 2)), dtype=int).T
        # np.add.at is unbuffered: when the same (row, col) pair appears more
        # than once in a window, every occurrence is counted. Plain
        # `adj_mat[rows, cols] += 1` applies only one increment per distinct
        # index pair, which made the counts depend on event order
        # (e.g. [a, b, a] counted 2 but [a, b, b] counted 1).
        np.add.at(adj_mat, (indx[0, :], indx[1, :]), 1)
        np.add.at(adj_mat, (indx[1, :], indx[0, :]), 1)

# Self co-occurrences (an event name paired with itself) are not edges.
adj_mat[np.diag_indices(len(svc_dict_lean))] = 0

100%|██████████| 2106/2106 [00:00<00:00, 5376.61it/s]


In [22]:
# Export the co-occurrence graph as a weighted edge list, one
# "<node> <node> <weight>" line per pair with a positive count.
# Only x < y pairs are visited, so each undirected edge is written once;
# node ids are svc_id + 1 (1-based).
# The `with` block guarantees the file is flushed and closed even if a write
# fails part-way through.
with open('../node2vec/graph/ppd_eICU.edgelist', 'w') as fh:
    for x, y in itertools.combinations(range(len(adj_mat)), 2):
        if adj_mat[x, y] > 0:
            fh.write(f'{x + 1} {y + 1} {adj_mat[x, y]}\n')