In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# ! pip install pgmpy

In [1]:
# Ignore warnings
# NOTE(review): with no category or module argument this silences every warning
# for the rest of the notebook, including pandas deprecation/FutureWarnings —
# consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

import common

In [3]:
# Directory holding the sample CSV files read below. The trailing slash is
# required because file names are appended directly to this string.
date_set_path = "../temp_sets_100/"

## Patients

### Read samples

In [4]:
# Load the patients sample through the project helper in `common`.
patients_df = common.read_csv_no_rowid(f"{date_set_path}patients.csv")

### Data preprocessing

In [5]:
# Drop the columns this notebook does not use
patients_df = patients_df.drop(columns=['dod', 'expire_flag'])

#### Handle null values

In [7]:
# Check null values in the table: the helper reports the table size and the
# number of nulls per column.
common.nan_count(patients_df)

Total columns: 5
Total rows: 100
--------------
subject_id     0
gender         0
dob            0
dod_hosp      78
dod_ssn       63
dtype: int64


In [8]:
# Sentinel that replaces missing timestamps: pd.to_datetime(0) is the Unix
# epoch (1970-01-01 00:00:00).
nan_datetime=pd.to_datetime(0)

In [9]:
# Replace missing dates with the sentinel timestamp.
#
# The result is assigned back to the frame instead of calling
# `fillna(inplace=True)` on a column selection: under pandas Copy-on-Write the
# selection is a temporary object, so the in-place form leaves `patients_df`
# unchanged. Parsing to datetime first keeps the column dtype datetime64 even
# when this cell runs before the later type-conversion cell.
for date_col in ['dob', 'dod_hosp', 'dod_ssn']:
    patients_df[date_col] = pd.to_datetime(patients_df[date_col]).fillna(nan_datetime)

In [10]:
# Re-check after filling: every column should now report zero nulls.
common.nan_count(patients_df)

Total columns: 5
Total rows: 100
--------------
subject_id    0
gender        0
dob           0
dod_hosp      0
dod_ssn       0
dtype: int64


#### Set the column types

In [11]:
# Inspect the column dtypes; the date columns are expected to be datetime64[ns].
patients_df.dtypes

subject_id             int64
gender                object
dob           datetime64[ns]
dod_hosp      datetime64[ns]
dod_ssn       datetime64[ns]
dtype: object

In [6]:
# Convert the patient date columns to datetime
for date_col in ('dob', 'dod_hosp', 'dod_ssn'):
    patients_df[date_col] = pd.to_datetime(patients_df[date_col])

In [None]:
# Confirm the dtypes after the datetime conversion.
patients_df.dtypes

### Admissions

Note: the `deathtime` column in the admissions table must not be dropped, because it contains the exact time.

In [None]:
# Load the admissions sample through the project helper in `common`.
admissions_sample_df = common.read_csv_no_rowid(f"{date_set_path}admissions.csv")

In [None]:
# Drop the admission columns this notebook does not use
admissions_sample_df = admissions_sample_df.drop(columns=['diagnosis', 'hospital_expire_flag'])

In [None]:
# Convert every admission timestamp column to datetime
for time_col in ('admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime'):
    admissions_sample_df[time_col] = pd.to_datetime(admissions_sample_df[time_col])

In [None]:
# Confirm the dtypes after the datetime conversion.
admissions_sample_df.dtypes

In [None]:
# admissions_sample_df.loc[:, 'admittime'] = admissions_sample_df.loc[:, 'admittime'].apply(datetime_discretisation)
# admissions_sample_df.loc[:, 'dischtime'] = admissions_sample_df.loc[:, 'dischtime'].apply(datetime_discretisation)
# admissions_sample_df.loc[:, 'deathtime'] = admissions_sample_df.loc[:, 'deathtime'].apply(datetime_discretisation)
# admissions_sample_df.loc[:, 'edregtime'] = admissions_sample_df.loc[:, 'edregtime'].apply(datetime_discretisation)
# admissions_sample_df.loc[:, 'edouttime'] = admissions_sample_df.loc[:, 'edouttime'].apply(datetime_discretisation)

In [None]:
# Replace missing categorical values with a placeholder label.
#
# The result is assigned back to the frame instead of calling
# `fillna(inplace=True)` on a column selection: under pandas Copy-on-Write the
# selection is a temporary object, so the in-place form leaves the frame
# unchanged. The placeholder spelling 'unknow' is kept as-is so the values
# match any data or models produced earlier.
for category_col in ['language', 'marital_status', 'religion']:
    admissions_sample_df[category_col] = admissions_sample_df[category_col].fillna('unknow')

# The time columns (deathtime, edregtime, edouttime) are not filled by this cell.

In [None]:
# Report the remaining null counts per column for the admissions table.
common.nan_count(admissions_sample_df)

### Callout

In [None]:
# Load the callout sample through the project helper in `common`.
callout_sample_df = common.read_csv_no_rowid(f"{date_set_path}callout.csv")

In [None]:
# Drop the callout columns this notebook does not use
callout_sample_df = callout_sample_df.drop(
    columns=['submit_careunit', 'firstreservationtime', 'currentreservationtime']
)

In [None]:
# Convert every callout timestamp column to datetime
for time_col in ('createtime', 'updatetime', 'acknowledgetime', 'outcometime'):
    callout_sample_df[time_col] = pd.to_datetime(callout_sample_df[time_col])

In [None]:
# callout_sample_df.loc[:, 'createtime'] = callout_sample_df.loc[:, 'createtime'].apply(datetime_discretisation)
# callout_sample_df.loc[:, 'updatetime'] = callout_sample_df.loc[:, 'updatetime'].apply(datetime_discretisation)
# callout_sample_df.loc[:, 'acknowledgetime'] = callout_sample_df.loc[:, 'acknowledgetime'].apply(datetime_discretisation)
# callout_sample_df.loc[:, 'outcometime'] = callout_sample_df.loc[:, 'outcometime'].apply(datetime_discretisation)

In [None]:
# Replace missing discharge ward ids with the placeholder value 100.
# NOTE(review): presumably 100 does not collide with a real ward id — confirm.
#
# The result is assigned back to the frame instead of calling
# `fillna(inplace=True)` on a column selection: under pandas Copy-on-Write the
# selection is a temporary object, so the in-place form leaves the frame
# unchanged.
callout_sample_df['discharge_wardid'] = callout_sample_df['discharge_wardid'].fillna(100)

# acknowledgetime is not filled by this cell.

In [None]:
# Report the remaining null counts per column for the callout table.
common.nan_count(callout_sample_df)

### ICUstays

In [None]:
# Load the ICU stays sample through the project helper in `common`.
icustays_sample_df = common.read_csv_no_rowid(f"{date_set_path}icustays.csv")

In [None]:
# Drop the `los` column, which this notebook does not use
icustays_sample_df = icustays_sample_df.drop(columns=['los'])

In [None]:
# Convert the ICU stay timestamp columns to datetime
for time_col in ('intime', 'outtime'):
    icustays_sample_df[time_col] = pd.to_datetime(icustays_sample_df[time_col])

In [None]:
# icustays_sample_df.loc[:, 'intime'] = icustays_sample_df.loc[:, 'intime'].apply(datetime_discretisation)
# icustays_sample_df.loc[:, 'outtime'] = icustays_sample_df.loc[:, 'outtime'].apply(datetime_discretisation)

In [None]:
# Report the null counts per column for the ICU stays table.
common.nan_count(icustays_sample_df)

---

## Combine samples

In [None]:
# patients_df = pd.merge(patients_df, admissions_sample_df, on=['subject_id'], how='inner')
# patients_df = pd.merge(patients_df, callout_sample_df, on=['subject_id', 'hadm_id'], how='inner')
# patients_df = pd.merge(patients_df, icustays_sample_df, on=['subject_id', 'hadm_id'], how='inner')

In [None]:
# patients_df.columns

In [None]:
# nan_count(patients_df)

In [None]:
# patients_df = pd.merge(patients_df, services_sample_df, on=['subject_id', 'hadm_id'], how='outer')
# patients_df = pd.merge(patients_df, transfers_sample_df.drop(['icustay_id', 'dbsource', 'curr_careunit', 'curr_wardid', \
#                                                 'intime', 'outtime', 'los'], axis=1), on=['subject_id', 'hadm_id'], how='outer')
# patients_df.fillna(value="Na", inplace=True)

In [None]:
# patients_df.to_csv("patients_info.csv")

In [None]:
# patients_df.drop(['subject_id', 'hadm_id'], axis=1, inplace=True)

---

##  Build Network

In [None]:
from sdv import Metadata
from sdv.relational import HMA1

from sdv.constraints import GreaterThan

In [None]:
# Map each table name (as registered in the SDV metadata) to its DataFrame.
patients_tables = dict(
    patients_df=patients_df,
    admissions_sample_df=admissions_sample_df,
    callout_sample_df=callout_sample_df,
    icustays_sample_df=icustays_sample_df,
)

In [None]:
# Constraints for patients (none defined)


In [None]:
# Constraints for admissions: each pair orders a `low` timestamp column
# against a `high` one, and both use the 'reject_sampling' strategy.
admittime_dischtime = GreaterThan(low='admittime', high='dischtime', handling_strategy='reject_sampling')
edregtime_edouttime = GreaterThan(low='edregtime', high='edouttime', handling_strategy='reject_sampling')

In [None]:
# Constraints for callout: each pair orders a `low` timestamp column against
# a `high` one, and all four use the 'reject_sampling' strategy.
createtime_updatetime = GreaterThan(low='createtime', high='updatetime', handling_strategy='reject_sampling')
createtime_acknowledgetime = GreaterThan(low='createtime', high='acknowledgetime', handling_strategy='reject_sampling')
updatetime_outcometime = GreaterThan(low='updatetime', high='outcometime', handling_strategy='reject_sampling')
acknowledgetime_outcometime = GreaterThan(low='acknowledgetime', high='outcometime', handling_strategy='reject_sampling')

In [None]:
# Constraints for icu_stays: `outtime` is ordered against `intime`.
# NOTE(review): this one uses the 'transform' strategy while the admissions
# and callout constraints use 'reject_sampling' — confirm that is intended.
intime_outtime = GreaterThan(low='intime', high='outtime', handling_strategy='transform')

In [None]:
# Start from an empty SDV metadata object; tables are registered in the next cell.
metadata = Metadata()

In [None]:
# Register the four tables and their parent/child links with the metadata.

# Root table, keyed by subject_id.
metadata.add_table(name='patients_df', data=patients_tables['patients_df'], primary_key='subject_id')

# Admissions are linked to patients through subject_id.
# NOTE(review): only edregtime_edouttime is attached here; the
# admittime_dischtime constraint defined in an earlier cell is not used —
# confirm that is intended.
metadata.add_table(name='admissions_sample_df', data=patients_tables['admissions_sample_df'], primary_key='hadm_id', 
    parent='patients_df', foreign_key='subject_id', constraints=[edregtime_edouttime])

# Callout rows are linked to patients through subject_id.
# NOTE(review): hadm_id is declared as the primary key — TODO confirm it is
# unique per callout row in this sample.
metadata.add_table(name='callout_sample_df', data=patients_tables['callout_sample_df'], primary_key='hadm_id', 
    parent='patients_df', foreign_key='subject_id', 
    constraints=[createtime_updatetime, createtime_acknowledgetime, updatetime_outcometime, acknowledgetime_outcometime])
    
# ICU stays are linked to the callout table (not admissions) through hadm_id.
# NOTE(review): verify every ICU stay's hadm_id is present in the callout table.
metadata.add_table(name='icustays_sample_df', data=patients_tables['icustays_sample_df'], primary_key='icustay_id', 
    parent='callout_sample_df', foreign_key='hadm_id', constraints=[intime_outtime])

In [None]:
# Re-check null counts for the ICU stays table before fitting.
# `nan_count` lives in the `common` module (imported as a module, not by
# name), so it must be called through the module like the other cells do.
common.nan_count(icustays_sample_df)

In [None]:
# metadata.add_table(name='patients_df', data=patients_tables['patients_df'], primary_key='subject_id')

# metadata.add_table(name='admissions_sample_df', data=patients_tables['admissions_sample_df'], primary_key='hadm_id', 
#     parent='patients_df', foreign_key='subject_id', constraints=[admittime_dischtime, edregtime_edouttime])

# metadata.add_table(name='callout_sample_df', data=patients_tables['callout_sample_df'], primary_key='hadm_id', 
#     parent='patients_df', foreign_key='subject_id', 
#     constraints=[createtime_updatetime, createtime_acknowledgetime, updatetime_outcometime, acknowledgetime_outcometime])
    
# metadata.add_table(name='icustays_sample_df', data=patients_tables['icustays_sample_df'], primary_key='icustay_id', 
#     parent='callout_sample_df', foreign_key='hadm_id', constraints=[intime_outtime])

In [None]:
# metadata.visualize()

In [None]:
# Build the SDV HMA1 relational model from the table metadata defined above.
patients_info_model = HMA1(metadata)

In [None]:
# Fit the model on the real tables (dict of table name -> DataFrame).
patients_info_model.fit(patients_tables)

In [None]:
# Persist the fitted model to disk.
# NOTE(review): this writes under "../temp_sets/" while the input data is read
# from "../temp_sets_100/" (date_set_path) — confirm the target directory is
# intended and exists.
patients_info_model.save("../temp_sets/models/patients_info_model.pkl")

In [None]:
# Draw synthetic data from the fitted model; the result is indexed by table
# name in the cells below.
# NOTE(review): no random seed is set anywhere in the visible notebook, so the
# sample presumably differs between runs — confirm whether that matters.
new_data_sample = patients_info_model.sample(num_rows = 1000)

---

In [None]:
from sdv.evaluation import evaluate

In [None]:
# Preview the first rows of the synthetic callout table.
new_data_sample['callout_sample_df'].head(5)

In [None]:
# Compare the categorical callout columns of the synthetic table against the
# real table using the DiscreteKLDivergence metric.
callout_categorical_cols = [
    'curr_careunit', 'callout_service', 'callout_status', 'callout_outcome', 'acknowledge_status'
]
synthetic_callout = new_data_sample['callout_sample_df'].loc[:, callout_categorical_cols]
real_callout = callout_sample_df.loc[:, callout_categorical_cols]

evaluate(synthetic_callout, real_callout, metrics=['DiscreteKLDivergence'])

In [None]:
from sdv.metrics.relational import KSTestExtended

In [None]:
# Score the synthetic tables against the real ones with the KSTestExtended
# metric; the real tables are passed first, the synthetic sample second.
KSTestExtended.compute(patients_tables, new_data_sample)