In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# ! pip install pgmpy

In [3]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Directory the table CSVs are read from. File names are appended to this
# string directly (e.g. date_set_path + "patients.csv"), so the trailing
# slash is required.
date_set_path = "../temp_sets/"

## Time data discretisation

In [6]:
def date_discretisation(date):
    """Turn a date into a coarse ``"<quarter>&<dayofweek>"`` label.

    Args:
        date: A value exposing ``quarter`` and ``dayofweek`` attributes
            (e.g. ``pandas.Timestamp``), or a missing value.

    Returns:
        str: The quarter and day-of-week joined by ``"&"``, or ``"Na"`` when
        ``pd.isna(date)`` is true.
    """
    if pd.isna(date):
        return "Na"

    # "&" is used on purpose: a "-" separator could make pandas re-parse the
    # label as a date or time when the data is read back.
    parts = (date.quarter, date.dayofweek)
    return "&".join(str(part) for part in parts)

In [7]:
def datetime_discretisation(date):
    """Turn a timestamp into a ``"<quarter>&<dayofweek>&<hour>"`` label.

    Args:
        date: A value exposing ``quarter``, ``dayofweek`` and ``hour``
            attributes (e.g. ``pandas.Timestamp``), or a missing value.

    Returns:
        str: The three components joined by ``"&"``, or ``"Na"`` when
        ``pd.isna(date)`` is true.
    """
    if pd.isna(date):
        return "Na"

    # "&" is used on purpose: a "-" separator could make pandas re-parse the
    # label as a date or time when the data is read back.
    parts = (date.quarter, date.dayofweek, date.hour)
    return "&".join(str(part) for part in parts)

## Read temp datasets

In [8]:
# Read a CSV file as a DataFrame without its row-index columns.
def read_csv_no_rowid(file_path):
    """Load a CSV and strip the columns that only carry row numbering.

    Two columns are removed when present: ``"Unnamed: 0"`` (the name pandas
    gives to an unlabeled first column, such as a saved DataFrame index) and
    ``"row_id"``.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The loaded table without the row-index columns.
    """
    df = pd.read_csv(file_path)

    # errors="ignore" keeps the helper usable on files that lack one or both
    # of these columns (e.g. a CSV saved with index=False) instead of raising
    # KeyError.
    return df.drop(columns=["Unnamed: 0", "row_id"], errors="ignore")

In [9]:
# check NaN value

def nan_count(df):
    """Print the shape of ``df`` followed by its per-column missing-value counts.

    Args:
        df: The ``pandas.DataFrame`` to summarise.

    Returns:
        None. The summary is written to standard output.
    """
    n_rows, n_cols = df.shape
    header = [
        f"Total columns: {n_cols}",
        f"Total rows: {n_rows}",
        "--------------",
    ]
    print("\n".join(header))
    # Series indexed by column name holding the number of null cells.
    print(df.isna().sum())

### Patients

In [10]:
random_patients_df = read_csv_no_rowid(date_set_path+"patients.csv")

In [11]:
# Drop the columns that are not used downstream.
random_patients_df = random_patients_df.drop(columns=['dod', 'expire_flag'])

In [12]:
# Parse the date columns into datetime dtype.
# ('dod' is not converted: it was removed from the frame in the drop cell.)
patient_date_cols = ['dob', 'dod_hosp', 'dod_ssn']
for date_col in patient_date_cols:
    random_patients_df[date_col] = pd.to_datetime(random_patients_df[date_col])

In [13]:
nan_count(random_patients_df)

Total columns: 5
Total rows: 1000
--------------
subject_id      0
gender          0
dob             0
dod_hosp      774
dod_ssn       702
dtype: int64


In [14]:
# random_patients_df.loc[:, 'dob'] = random_patients_df.loc[:, 'dob'].apply(date_discretisation)
# random_patients_df.loc[:, 'dod_hosp'] = random_patients_df.loc[:, 'dod_hosp'].apply(date_discretisation)
# random_patients_df.loc[:, 'dod_ssn'] = random_patients_df.loc[:, 'dod_ssn'].apply(date_discretisation)

In [15]:
# random_patients_df['dob'].fillna(value=nan_time, inplace=True)
# random_patients_df['dod_hosp'].fillna(value=nan_time, inplace=True)
# random_patients_df['dod_ssn'].fillna(value=nan_time, inplace=True)

In [16]:
nan_count(random_patients_df)

Total columns: 5
Total rows: 1000
--------------
subject_id      0
gender          0
dob             0
dod_hosp      774
dod_ssn       702
dtype: int64


### Admissions

Note: the `deathtime` column in the admissions table must not be dropped, because it records the exact time of death.

In [17]:
admissions_sample_df = read_csv_no_rowid(date_set_path+"admissions.csv")

In [18]:
# Remove the columns that are not used downstream.
admissions_sample_df = admissions_sample_df.drop(columns=['diagnosis', 'hospital_expire_flag'])

In [19]:
# Parse every timestamp column of the admissions table into datetime dtype.
admission_time_cols = ['admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime']
for time_col in admission_time_cols:
    admissions_sample_df[time_col] = pd.to_datetime(admissions_sample_df[time_col])

In [20]:
admissions_sample_df.dtypes

subject_id                       int64
hadm_id                          int64
admittime               datetime64[ns]
dischtime               datetime64[ns]
deathtime               datetime64[ns]
admission_type                  object
admission_location              object
discharge_location              object
insurance                       object
language                        object
religion                        object
marital_status                  object
ethnicity                       object
edregtime               datetime64[ns]
edouttime               datetime64[ns]
has_chartevents_data             int64
dtype: object

In [21]:
# admissions_sample_df.loc[:, 'admittime'] = admissions_sample_df.loc[:, 'admittime'].apply(datetime_discretisation)
# admissions_sample_df.loc[:, 'dischtime'] = admissions_sample_df.loc[:, 'dischtime'].apply(datetime_discretisation)
# admissions_sample_df.loc[:, 'deathtime'] = admissions_sample_df.loc[:, 'deathtime'].apply(datetime_discretisation)
# admissions_sample_df.loc[:, 'edregtime'] = admissions_sample_df.loc[:, 'edregtime'].apply(datetime_discretisation)
# admissions_sample_df.loc[:, 'edouttime'] = admissions_sample_df.loc[:, 'edouttime'].apply(datetime_discretisation)

In [22]:
# Replace missing values in these text columns with an explicit placeholder.
# The result is assigned back instead of calling `fillna(inplace=True)` on a
# column selection: that chained in-place form emits a FutureWarning in recent
# pandas releases and does not update the parent frame under Copy-on-Write.
admissions_sample_df = admissions_sample_df.fillna(
    value={'language': 'unknow', 'marital_status': 'unknow', 'religion': 'unknow'}
)
# admissions_sample_df['deathtime'].fillna(value=nan_time, inplace=True)
# admissions_sample_df['edregtime'].fillna(value=nan_time, inplace=True)
# admissions_sample_df['edouttime'].fillna(value=nan_time, inplace=True)

In [23]:
nan_count(admissions_sample_df)

Total columns: 16
Total rows: 1289
--------------
subject_id                 0
hadm_id                    0
admittime                  0
dischtime                  0
deathtime               1159
admission_type             0
admission_location         0
discharge_location         0
insurance                  0
language                   0
religion                   0
marital_status             0
ethnicity                  0
edregtime                604
edouttime                604
has_chartevents_data       0
dtype: int64


### Callout

In [24]:
callout_sample_df = read_csv_no_rowid(date_set_path+"callout.csv")

In [25]:
# Remove the columns that are not used downstream.
callout_unused_cols = ['submit_careunit', 'firstreservationtime', 'currentreservationtime']
callout_sample_df = callout_sample_df.drop(columns=callout_unused_cols)

In [26]:
# Parse every timestamp column of the callout table into datetime dtype.
callout_time_cols = ['createtime', 'updatetime', 'acknowledgetime', 'outcometime']
for time_col in callout_time_cols:
    callout_sample_df[time_col] = pd.to_datetime(callout_sample_df[time_col])

In [27]:
# callout_sample_df.loc[:, 'createtime'] = callout_sample_df.loc[:, 'createtime'].apply(datetime_discretisation)
# callout_sample_df.loc[:, 'updatetime'] = callout_sample_df.loc[:, 'updatetime'].apply(datetime_discretisation)
# callout_sample_df.loc[:, 'acknowledgetime'] = callout_sample_df.loc[:, 'acknowledgetime'].apply(datetime_discretisation)
# callout_sample_df.loc[:, 'outcometime'] = callout_sample_df.loc[:, 'outcometime'].apply(datetime_discretisation)

In [28]:
# Fill missing discharge ward ids with the placeholder value 100.
# NOTE(review): 100 is presumably a ward id that never occurs in the real
# data — confirm before relying on it as a "missing" marker.
# The result is assigned back instead of calling `fillna(inplace=True)` on a
# column selection: that chained in-place form emits a FutureWarning in recent
# pandas releases and does not update the parent frame under Copy-on-Write.
callout_sample_df['discharge_wardid'] = callout_sample_df['discharge_wardid'].fillna(value=100)
# callout_sample_df['acknowledgetime'].fillna(value=nan_time, inplace=True)

In [29]:
nan_count(callout_sample_df)

Total columns: 20
Total rows: 807
--------------
subject_id             0
hadm_id                0
submit_wardid          0
curr_wardid            0
curr_careunit          0
callout_wardid         0
callout_service        0
request_tele           0
request_resp           0
request_cdiff          0
request_mrsa           0
request_vre            0
callout_status         0
callout_outcome        0
discharge_wardid       0
acknowledge_status     0
createtime             0
updatetime             0
acknowledgetime       35
outcometime            0
dtype: int64


### ICUstays

In [30]:
icustays_sample_df = read_csv_no_rowid(date_set_path+"icustays.csv")

In [31]:
# Remove the 'los' column, which is not used downstream.
icustays_sample_df = icustays_sample_df.drop(columns=['los'])

In [32]:
# Parse the ICU stay start and end timestamps into datetime dtype.
for time_col in ('intime', 'outtime'):
    icustays_sample_df[time_col] = pd.to_datetime(icustays_sample_df[time_col])

In [33]:
# icustays_sample_df.loc[:, 'intime'] = icustays_sample_df.loc[:, 'intime'].apply(datetime_discretisation)
# icustays_sample_df.loc[:, 'outtime'] = icustays_sample_df.loc[:, 'outtime'].apply(datetime_discretisation)

In [34]:
nan_count(icustays_sample_df)

Total columns: 10
Total rows: 1342
--------------
subject_id        0
hadm_id           0
icustay_id        0
dbsource          0
first_careunit    0
last_careunit     0
first_wardid      0
last_wardid       0
intime            0
outtime           0
dtype: int64


---

## Combine samples

In [35]:
# patients_df = pd.merge(random_patients_df, admissions_sample_df, on=['subject_id'], how='inner')
# patients_df = pd.merge(patients_df, callout_sample_df, on=['subject_id', 'hadm_id'], how='inner')
# patients_df = pd.merge(patients_df, icustays_sample_df, on=['subject_id', 'hadm_id'], how='inner')

In [36]:
# patients_df.columns

In [37]:
# nan_count(patients_df)

In [38]:
# patients_df = pd.merge(patients_df, services_sample_df, on=['subject_id', 'hadm_id'], how='outer')
# patients_df = pd.merge(patients_df, transfers_sample_df.drop(['icustay_id', 'dbsource', 'curr_careunit', 'curr_wardid', \
#                                                 'intime', 'outtime', 'los'], axis=1), on=['subject_id', 'hadm_id'], how='outer')
# patients_df.fillna(value="Na", inplace=True)

In [39]:
# patients_df.to_csv("patients_info.csv")

In [40]:
# patients_df.drop(['subject_id', 'hadm_id'], axis=1, inplace=True)

---

##  Build Network

In [41]:
from sdv import Metadata
from sdv.relational import HMA1

from sdv.constraints import GreaterThan

In [42]:
# Map each table name (as registered in the metadata) to its DataFrame.
patients_tables = dict(
    random_patients_df=random_patients_df,
    admissions_sample_df=admissions_sample_df,
    callout_sample_df=callout_sample_df,
    icustays_sample_df=icustays_sample_df,
)

In [43]:
# contains in patients


In [44]:
# Constraints for the admissions table: each one names a `low` and a `high`
# timestamp column and uses the 'reject_sampling' handling strategy.
admittime_dischtime = GreaterThan(low='admittime', high='dischtime', handling_strategy='reject_sampling')

edregtime_edouttime = GreaterThan(low='edregtime', high='edouttime', handling_strategy='reject_sampling')

In [45]:
# Constraints for the callout table: each one names a `low` and a `high`
# timestamp column and uses the 'reject_sampling' handling strategy.
createtime_updatetime = GreaterThan(low='createtime', high='updatetime', handling_strategy='reject_sampling')

createtime_acknowledgetime = GreaterThan(low='createtime', high='acknowledgetime', handling_strategy='reject_sampling')

updatetime_outcometime = GreaterThan(low='updatetime', high='outcometime', handling_strategy='reject_sampling')

acknowledgetime_outcometime = GreaterThan(low='acknowledgetime', high='outcometime', handling_strategy='reject_sampling')

In [46]:
# Constraint for the ICU stays table, with `intime` as low and `outtime` as high.
# NOTE(review): this is the only constraint using the 'transform' strategy;
# the admissions and callout constraints use 'reject_sampling' — confirm the
# difference is intentional.
intime_outtime = GreaterThan(low='intime', high='outtime', handling_strategy='transform')

In [47]:
metadata = Metadata()

In [48]:
# Register each table with the metadata object: its data, its primary key,
# an optional parent / foreign-key link, and its GreaterThan constraints.

# Patients table: no parent, keyed by subject_id.
metadata.add_table(name='random_patients_df', data=patients_tables['random_patients_df'], primary_key='subject_id')

# Admissions: child of patients through subject_id.
# NOTE(review): only edregtime_edouttime is attached here; admittime_dischtime
# is defined in an earlier cell but is not passed to any add_table call in
# this cell — confirm whether leaving it out is intentional.
metadata.add_table(name='admissions_sample_df', data=patients_tables['admissions_sample_df'], primary_key='hadm_id', 
    parent='random_patients_df', foreign_key='subject_id', constraints=[edregtime_edouttime])

# Callout: child of patients through subject_id, with all four callout constraints.
# NOTE(review): hadm_id is declared as the primary key of this table — verify
# that it is unique per callout row in the data.
metadata.add_table(name='callout_sample_df', data=patients_tables['callout_sample_df'], primary_key='hadm_id', 
    parent='random_patients_df', foreign_key='subject_id', 
    constraints=[createtime_updatetime, createtime_acknowledgetime, updatetime_outcometime, acknowledgetime_outcometime])
    
# ICU stays: declared as a child of callout (not admissions) through hadm_id.
# NOTE(review): this presumably requires every icustays hadm_id to appear in
# the callout table — confirm against the data.
metadata.add_table(name='icustays_sample_df', data=patients_tables['icustays_sample_df'], primary_key='icustay_id', 
    parent='callout_sample_df', foreign_key='hadm_id', constraints=[intime_outtime])

In [49]:
nan_count(icustays_sample_df)

Total columns: 10
Total rows: 1342
--------------
subject_id        0
hadm_id           0
icustay_id        0
dbsource          0
first_careunit    0
last_careunit     0
first_wardid      0
last_wardid       0
intime            0
outtime           0
dtype: int64


In [50]:
# metadata.add_table(name='random_patients_df', data=patients_tables['random_patients_df'], primary_key='subject_id')

# metadata.add_table(name='admissions_sample_df', data=patients_tables['admissions_sample_df'], primary_key='hadm_id', 
#     parent='random_patients_df', foreign_key='subject_id', constraints=[admittime_dischtime, edregtime_edouttime])

# metadata.add_table(name='callout_sample_df', data=patients_tables['callout_sample_df'], primary_key='hadm_id', 
#     parent='random_patients_df', foreign_key='subject_id', 
#     constraints=[createtime_updatetime, createtime_acknowledgetime, updatetime_outcometime, acknowledgetime_outcometime])
    
# metadata.add_table(name='icustays_sample_df', data=patients_tables['icustays_sample_df'], primary_key='icustay_id', 
#     parent='callout_sample_df', foreign_key='hadm_id', constraints=[intime_outtime])

In [51]:
# metadata.visualize()

In [52]:
patients_info_model = HMA1(metadata)

In [53]:
patients_info_model.fit(patients_tables)

In [90]:
patients_info_model.save("../temp_sets/models/patients_info_model.pkl")

In [91]:
# Draw synthetic data from the fitted model. The result is indexed by table
# name (it is read below as new_data_sample['callout_sample_df']).
# NOTE(review): no random seed is set in the visible cells, so the sampled
# rows and the scores computed from them may differ between runs — confirm.
new_data_sample = patients_info_model.sample(num_rows = 1000)

---

In [92]:
from sdv.evaluation import evaluate

In [99]:
new_data_sample['callout_sample_df'].head(5)

Unnamed: 0,subject_id,hadm_id,submit_wardid,curr_wardid,curr_careunit,callout_wardid,callout_service,request_tele,request_resp,request_cdiff,request_mrsa,request_vre,callout_status,callout_outcome,discharge_wardid,acknowledge_status,createtime,updatetime,acknowledgetime,outcometime
0,518,79,22,38,SICU,21,CSURG,1,0,0,0,0,Inactive,Discharged,28.0,Acknowledged,2132-01-12 16:55:44.359519232,2160-01-02 23:40:34.586804224,2171-10-13 11:08:13.608137728,2203-07-28 07:37:02.321100800
1,534,80,22,34,SICU,18,OMED,1,0,0,0,0,Inactive,Discharged,26.0,Acknowledged,2105-12-27 11:42:32.577035776,2127-11-30 15:35:06.207374336,2126-07-04 23:33:10.479185920,2142-11-04 01:46:03.801082880
2,534,81,25,47,SICU,35,CMED,1,0,0,0,0,Inactive,Cancelled,73.0,Acknowledged,2106-06-05 13:13:49.802624000,2127-12-23 01:00:49.191844864,NaT,2142-10-26 10:51:12.483483648
3,555,82,25,36,SICU,12,CSURG,1,0,0,0,0,Inactive,Discharged,0.0,Acknowledged,1969-11-22 23:55:13.635221884,1977-09-10 03:01:57.780327200,2013-10-15 01:23:29.150006784,2019-09-28 03:48:14.343142400
4,565,83,48,35,MICU,21,CCU,1,0,0,0,0,Inactive,Discharged,0.0,Acknowledged,2000-07-21 14:28:27.981632896,2015-08-15 11:10:46.274389248,2003-07-27 15:41:50.534866176,2041-10-03 19:45:40.428942336


In [102]:
# Compare synthetic and real callout data on a fixed subset of columns,
# passing the synthetic sample first and the real table second.
callout_eval_cols = ['curr_careunit', 'callout_service', 'callout_status', 'callout_outcome', 'acknowledge_status']
evaluate(
    new_data_sample['callout_sample_df'].loc[:, callout_eval_cols],
    callout_sample_df.loc[:, callout_eval_cols],
    metrics=['DiscreteKLDivergence'],
)

0.7062228509428266

In [116]:
from sdv.metrics.relational import KSTestExtended

In [122]:
KSTestExtended.compute(patients_tables, new_data_sample)

0.6968312980239912