In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# ! pip install pgmpy

In [3]:
import pandas as pd
import numpy as np

## Time data discretisation

In [4]:
def date_discretisation(date):
    """Encode a date as the categorical label "<quarter>&<dayofweek>".

    Missing values (anything ``pd.isna`` reports as null, e.g. ``NaT``) are
    mapped to the placeholder label "Na".
    """
    if pd.isna(date):
        return "Na"
    # "&" is the separator on purpose: with "-" pandas might read the label
    # back as a date or time.
    return "&".join([str(date.quarter), str(date.dayofweek)])

In [5]:
def datetime_discretisation(date):
    """Encode a timestamp as the label "<quarter>&<dayofweek>&<hour>".

    Missing values (anything ``pd.isna`` reports as null, e.g. ``NaT``) are
    mapped to the placeholder label "Na".
    """
    if pd.isna(date):
        return "Na"
    # "&" is the separator on purpose: with "-" pandas might read the label
    # back as a date or time.
    parts = (date.quarter, date.dayofweek, date.hour)
    return "&".join(str(part) for part in parts)

In [6]:
date = pd.to_datetime('10/27/2044')

In [7]:
date_discretisation(date)

'4&3'

In [8]:
date.day_of_week

3

## Read temporary datasets

In [9]:
# read csv file as dataframe, and drop ROW_ID column
def read_csv_no_rowid(file_path):
    """Read a CSV file into a DataFrame without its row-identifier columns.

    Two columns are removed when present: "Unnamed: 0" (the name pandas gives
    to a header-less first column, such as an index written by ``to_csv``) and
    'row_id'. A file that lacks either column is simply returned without the
    missing one instead of raising ``KeyError``.

    Parameters
    ----------
    file_path : str, path object or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents minus the row-identifier columns.
    """
    df = pd.read_csv(file_path)
    # errors="ignore": drop only the labels that actually exist in this file.
    return df.drop(columns=["Unnamed: 0", 'row_id'], errors="ignore")

In [10]:
# check NaN value

def nan_count(df):
    """Print the number of columns, the number of rows and the null count per column."""
    n_columns = len(df.columns)
    n_rows = len(df)
    print(f"Total columns: {n_columns}")
    print(f"Total rows: {n_rows}")
    print("--------------")
    # isnull().sum() yields one null count per column
    nulls_per_column = df.isnull().sum()
    print(nulls_per_column)

### Patients

In [11]:
# Load the patients sample; read_csv_no_rowid strips the "Unnamed: 0" and 'row_id' columns.
random_patients_df = read_csv_no_rowid("../temp_sets/patients.csv")

In [12]:
# Remove the columns that are not used downstream
random_patients_df = random_patients_df.drop(columns=['dod', 'expire_flag'])

In [13]:
# Parse the date columns from text into pandas datetimes.
# ('dod' is not converted: that column was dropped in an earlier cell.)
for col in ('dob', 'dod_hosp', 'dod_ssn'):
    random_patients_df[col] = pd.to_datetime(random_patients_df[col])

In [14]:
nan_count(random_patients_df)

Total columns: 5
Total rows: 1000
--------------
subject_id      0
gender          0
dob             0
dod_hosp      774
dod_ssn       702
dtype: int64


In [15]:
# Replace each date column with its categorical "<quarter>&<dayofweek>" label
# (missing dates become "Na").
# Plain column assignment swaps in the new string column. `.loc[:, col] = ...`
# instead writes into the existing datetime64 column, so pandas has to coerce
# the labels to the old dtype or upcast it (deprecated for incompatible values).
for col in ('dob', 'dod_hosp', 'dod_ssn'):
    random_patients_df[col] = random_patients_df[col].apply(date_discretisation)

In [16]:
# random_patients_df['dob'].fillna(value=nan_time, inplace=True)
# random_patients_df['dod_hosp'].fillna(value=nan_time, inplace=True)
# random_patients_df['dod_ssn'].fillna(value=nan_time, inplace=True)

In [17]:
nan_count(random_patients_df)

Total columns: 5
Total rows: 1000
--------------
subject_id    0
gender        0
dob           0
dod_hosp      0
dod_ssn       0
dtype: int64


### Admissions

Note: the `deathtime` column in the admissions table must not be dropped, because it records the exact time (not just the date).

In [18]:
# Load the admissions sample; read_csv_no_rowid strips the "Unnamed: 0" and 'row_id' columns.
admissions_sample_df = read_csv_no_rowid("../temp_sets/admissions.csv")

In [19]:
# Remove the columns that are not used downstream
admissions_sample_df = admissions_sample_df.drop(columns=['diagnosis', 'hospital_expire_flag'])

In [20]:
# Parse every timestamp column from text into pandas datetimes
for col in ('admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime'):
    admissions_sample_df[col] = pd.to_datetime(admissions_sample_df[col])

In [21]:
admissions_sample_df.dtypes

subject_id                       int64
hadm_id                          int64
admittime               datetime64[ns]
dischtime               datetime64[ns]
deathtime               datetime64[ns]
admission_type                  object
admission_location              object
discharge_location              object
insurance                       object
language                        object
religion                        object
marital_status                  object
ethnicity                       object
edregtime               datetime64[ns]
edouttime               datetime64[ns]
has_chartevents_data             int64
dtype: object

In [22]:
# Replace each timestamp column with its categorical
# "<quarter>&<dayofweek>&<hour>" label (missing timestamps become "Na").
# Plain column assignment swaps in the new string column. `.loc[:, col] = ...`
# instead writes into the existing datetime64 column, so pandas has to coerce
# the labels to the old dtype or upcast it (deprecated for incompatible values).
for col in ('admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime'):
    admissions_sample_df[col] = admissions_sample_df[col].apply(datetime_discretisation)

In [23]:
# Fill missing categorical values with the placeholder category 'unknow'.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# acts on an intermediate Series (chained assignment), which is deprecated and
# does not update the DataFrame under copy-on-write.
for col in ('language', 'marital_status', 'religion'):
    admissions_sample_df[col] = admissions_sample_df[col].fillna(value='unknow')
# The time columns need no filling here: datetime_discretisation already maps
# missing timestamps to "Na".

In [24]:
nan_count(admissions_sample_df)

Total columns: 16
Total rows: 1289
--------------
subject_id              0
hadm_id                 0
admittime               0
dischtime               0
deathtime               0
admission_type          0
admission_location      0
discharge_location      0
insurance               0
language                0
religion                0
marital_status          0
ethnicity               0
edregtime               0
edouttime               0
has_chartevents_data    0
dtype: int64


### Callout

In [25]:
# Load the callout sample; read_csv_no_rowid strips the "Unnamed: 0" and 'row_id' columns.
callout_sample_df = read_csv_no_rowid("../temp_sets/callout.csv")

In [26]:
# Remove the columns that are not used downstream
callout_sample_df = callout_sample_df.drop(
    columns=['submit_careunit', 'firstreservationtime', 'currentreservationtime']
)

In [27]:
# Parse every timestamp column from text into pandas datetimes
for col in ('createtime', 'updatetime', 'acknowledgetime', 'outcometime'):
    callout_sample_df[col] = pd.to_datetime(callout_sample_df[col])

In [28]:
# Replace each timestamp column with its categorical
# "<quarter>&<dayofweek>&<hour>" label (missing timestamps become "Na").
# Plain column assignment swaps in the new string column. `.loc[:, col] = ...`
# instead writes into the existing datetime64 column, so pandas has to coerce
# the labels to the old dtype or upcast it (deprecated for incompatible values).
for col in ('createtime', 'updatetime', 'acknowledgetime', 'outcometime'):
    callout_sample_df[col] = callout_sample_df[col].apply(datetime_discretisation)

In [29]:
# Fill missing discharge ward ids with the stand-in value 100.
# NOTE(review): confirm that 100 cannot collide with a real ward id.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# acts on an intermediate Series (chained assignment), which is deprecated and
# does not update the DataFrame under copy-on-write.
callout_sample_df['discharge_wardid'] = callout_sample_df['discharge_wardid'].fillna(value=100)
# 'acknowledgetime' needs no filling here: datetime_discretisation already maps
# missing timestamps to "Na".

In [30]:
nan_count(callout_sample_df)

Total columns: 20
Total rows: 807
--------------
subject_id            0
hadm_id               0
submit_wardid         0
curr_wardid           0
curr_careunit         0
callout_wardid        0
callout_service       0
request_tele          0
request_resp          0
request_cdiff         0
request_mrsa          0
request_vre           0
callout_status        0
callout_outcome       0
discharge_wardid      0
acknowledge_status    0
createtime            0
updatetime            0
acknowledgetime       0
outcometime           0
dtype: int64


### ICUstays

In [31]:
# Load the ICU stays sample; read_csv_no_rowid strips the "Unnamed: 0" and 'row_id' columns.
icustays_sample_df = read_csv_no_rowid("../temp_sets/icustays.csv")

In [32]:
# Remove the 'los' column, which is not used downstream
icustays_sample_df = icustays_sample_df.drop(columns=['los'])

In [33]:
# Parse the ICU in/out timestamps from text into pandas datetimes
for col in ('intime', 'outtime'):
    icustays_sample_df[col] = pd.to_datetime(icustays_sample_df[col])

In [34]:
# Replace each timestamp column with its categorical
# "<quarter>&<dayofweek>&<hour>" label (missing timestamps become "Na").
# Plain column assignment swaps in the new string column. `.loc[:, col] = ...`
# instead writes into the existing datetime64 column, so pandas has to coerce
# the labels to the old dtype or upcast it (deprecated for incompatible values).
for col in ('intime', 'outtime'):
    icustays_sample_df[col] = icustays_sample_df[col].apply(datetime_discretisation)

In [35]:
nan_count(icustays_sample_df)

Total columns: 10
Total rows: 1342
--------------
subject_id        0
hadm_id           0
icustay_id        0
dbsource          0
first_careunit    0
last_careunit     0
first_wardid      0
last_wardid       0
intime            0
outtime           0
dtype: int64


---

## Combine samples

In [37]:
# Attach admissions to patients; the inner join keeps only subject_ids present in both tables
patients_df = random_patients_df.merge(admissions_sample_df, how='inner', on=['subject_id'])

In [38]:
# Add the callout records, matched on patient and admission; rows without a match on both keys are dropped (inner join)
patients_df = patients_df.merge(callout_sample_df, how='inner', on=['subject_id', 'hadm_id'])

In [39]:
# Add the ICU stays, matched on patient and admission; rows without a match on both keys are dropped (inner join)
patients_df = patients_df.merge(icustays_sample_df, how='inner', on=['subject_id', 'hadm_id'])

In [40]:
patients_df.columns

Index(['subject_id', 'gender', 'dob', 'dod_hosp', 'dod_ssn', 'hadm_id',
       'admittime', 'dischtime', 'deathtime', 'admission_type',
       'admission_location', 'discharge_location', 'insurance', 'language',
       'religion', 'marital_status', 'ethnicity', 'edregtime', 'edouttime',
       'has_chartevents_data', 'submit_wardid', 'curr_wardid', 'curr_careunit',
       'callout_wardid', 'callout_service', 'request_tele', 'request_resp',
       'request_cdiff', 'request_mrsa', 'request_vre', 'callout_status',
       'callout_outcome', 'discharge_wardid', 'acknowledge_status',
       'createtime', 'updatetime', 'acknowledgetime', 'outcometime',
       'icustay_id', 'dbsource', 'first_careunit', 'last_careunit',
       'first_wardid', 'last_wardid', 'intime', 'outtime'],
      dtype='object')

In [41]:
nan_count(patients_df)

Total columns: 46
Total rows: 992
--------------
subject_id              0
gender                  0
dob                     0
dod_hosp                0
dod_ssn                 0
hadm_id                 0
admittime               0
dischtime               0
deathtime               0
admission_type          0
admission_location      0
discharge_location      0
insurance               0
language                0
religion                0
marital_status          0
ethnicity               0
edregtime               0
edouttime               0
has_chartevents_data    0
submit_wardid           0
curr_wardid             0
curr_careunit           0
callout_wardid          0
callout_service         0
request_tele            0
request_resp            0
request_cdiff           0
request_mrsa            0
request_vre             0
callout_status          0
callout_outcome         0
discharge_wardid        0
acknowledge_status      0
createtime              0
updatetime              0
acknowledgetime

In [42]:
# patients_df = pd.merge(patients_df, services_sample_df, on=['subject_id', 'hadm_id'], how='outer')
# patients_df = pd.merge(patients_df, transfers_sample_df.drop(['icustay_id', 'dbsource', 'curr_careunit', 'curr_wardid', \
#                                                 'intime', 'outtime', 'los'], axis=1), on=['subject_id', 'hadm_id'], how='outer')
# patients_df.fillna(value="Na", inplace=True)

In [44]:
# Save the merged table while it still contains the ID columns (they are dropped in the next cell).
# to_csv also writes the index by default, as an unnamed first column.
patients_df.to_csv("patients_info.csv")

In [45]:
# Remove the identifier columns before the table is handed to structure learning
patients_df = patients_df.drop(columns=['subject_id', 'hadm_id'])

---

##  Build Network

In [46]:
from pgmpy.estimators import HillClimbSearch

# Learn the network structure with hill-climbing search on the rows labelled 0..50
# (`.loc` label slices include the end label).
hc = HillClimbSearch(patients_df.loc[0:50, :])
best_model = hc.estimate()
print(best_model.edges())
# Keep the learned edge list; a later cell builds the BayesianNetwork from it.
edges = best_model.edges()

  0%|          | 0/1000000 [00:00<?, ?it/s]

[('gender', 'religion'), ('dob', 'dischtime'), ('dob', 'gender'), ('dob', 'religion'), ('dod_hosp', 'discharge_location'), ('dod_hosp', 'insurance'), ('dod_hosp', 'edouttime'), ('dod_ssn', 'language'), ('dod_ssn', 'edouttime'), ('dod_ssn', 'gender'), ('admittime', 'intime'), ('dischtime', 'admittime'), ('deathtime', 'dod_ssn'), ('deathtime', 'dod_hosp'), ('deathtime', 'ethnicity'), ('deathtime', 'discharge_wardid'), ('deathtime', 'discharge_location'), ('deathtime', 'first_wardid'), ('deathtime', 'edouttime'), ('admission_type', 'admission_location'), ('admission_type', 'marital_status'), ('admission_type', 'discharge_location'), ('admission_type', 'dob'), ('admission_location', 'dbsource'), ('admission_location', 'edouttime'), ('discharge_location', 'dischtime'), ('insurance', 'marital_status'), ('insurance', 'request_mrsa'), ('language', 'dob'), ('language', 'insurance'), ('language', 'edouttime'), ('language', 'gender'), ('marital_status', 'dob'), ('marital_status', 'gender'), ('mar

[('subject_id', 'dischtime'), ('subject_id', 'gender'), ('subject_id', 'religion'), ('dob', 'subject_id'), ('dod_hosp', 'discharge_location'), ('dod_hosp', 'insurance'), ('dod_hosp', 'edouttime'), ('dod_hosp', 'subject_id'), ('dod_ssn', 'language'), ('dod_ssn', 'edouttime'), ('dod_ssn', 'subject_id'), ('hadm_id', 'icustay_id'), ('admittime', 'hadm_id'), ('dischtime', 'admittime'), ('deathtime', 'dod_ssn'), ('deathtime', 'dod_hosp'), ('deathtime', 'ethnicity'), ('deathtime', 'discharge_wardid'), ('deathtime', 'discharge_location'), ('deathtime', 'first_wardid'), ('deathtime', 'edouttime'), ('admission_type', 'admission_location'), ('admission_type', 'marital_status'), ('admission_type', 'discharge_location'), ('admission_type', 'dob'), ('admission_location', 'dbsource'), ('admission_location', 'edouttime'), ('discharge_location', 'dischtime'), ('insurance', 'marital_status'), ('insurance', 'request_mrsa'), ('language', 'dob'), ('language', 'insurance'), ('language', 'edouttime'), ('language', 'subject_id'), ('marital_status', 'dob'), ('ethnicity', 'insurance'), ('ethnicity', 'marital_status'), ('ethnicity', 'admission_location'), ('edregtime', 'admittime'), ('edouttime', 'edregtime'), ('edouttime', 'dischtime'), ('edouttime', 'hadm_id'), ('submit_wardid', 'discharge_wardid'), ('curr_careunit', 'submit_wardid'), ('callout_wardid', 'callout_service'), ('callout_wardid', 'admission_type'), ('callout_service', 'last_careunit'), ('request_tele', 'request_mrsa'), ('request_tele', 'admission_type'), ('request_tele', 'createtime'), ('request_tele', 'updatetime'), ('request_cdiff', 'ethnicity'), ('request_cdiff', 'discharge_location'), ('request_cdiff', 'dod_hosp'), ('request_cdiff', 'first_wardid'), ('request_cdiff', 'callout_wardid'), ('request_mrsa', 'admission_type'), ('request_vre', 'request_mrsa'), ('request_vre', 'discharge_location'), ('request_vre', 'insurance'), ('request_vre', 'ethnicity'), ('request_vre', 'dob'), ('request_vre', 'edouttime'), ('request_vre', 
'subject_id'), ('callout_outcome', 'request_mrsa'), ('callout_outcome', 'createtime'), ('discharge_wardid', 'curr_wardid'), ('discharge_wardid', 'callout_outcome'), ('acknowledge_status', 'deathtime'), ('acknowledge_status', 'request_tele'), ('acknowledge_status', 'last_careunit'), ('createtime', 'updatetime'), ('updatetime', 'acknowledgetime'), ('updatetime', 'outcometime'), ('icustay_id', 'intime'), ('first_careunit', 'first_wardid'), ('last_careunit', 'first_careunit'), ('last_careunit', 'curr_careunit'), ('last_careunit', 'last_wardid'), ('last_careunit', 'request_tele'), ('last_careunit', 'icustay_id'), ('first_wardid', 'last_wardid'), ('last_wardid', 'submit_wardid'), ('intime', 'outtime'), ('outtime', 'createtime')]

In [None]:
# from pgmpy.estimators import MmhcEstimator
# from pgmpy.estimators import BDeuScore
# from pgmpy.estimators import HillClimbSearch 
# from pgmpy.models import BayesianNetwork

# mmhc = MmhcEstimator(patients_df.loc[0:30, :])
# skeleton = mmhc.mmpc()
# print("Part 1) Skeleton: ", skeleton.edges())

# hc = HillClimbSearch(patients_df.loc[0:30, :])
# model = hc.estimate(tabu_length=10, white_list=skeleton.to_directed().edges())
# print("Part 2) Model:    ", model.edges())

In [None]:
# edges = [('subject_id', 'dischtime'), ('subject_id', 'gender'), ('subject_id', 'religion'), ('dob', 'subject_id'), ('dod_hosp', 'discharge_location'), ('dod_hosp', 'insurance'), ('dod_hosp', 'edouttime'), ('dod_hosp', 'subject_id'), ('dod_ssn', 'language'), ('dod_ssn', 'edouttime'), ('dod_ssn', 'subject_id'), ('hadm_id', 'icustay_id'), ('admittime', 'hadm_id'), ('dischtime', 'admittime'), ('deathtime', 'dod_ssn'), ('deathtime', 'dod_hosp'), ('deathtime', 'ethnicity'), ('deathtime', 'discharge_wardid'), ('deathtime', 'discharge_location'), ('deathtime', 'first_wardid'), ('deathtime', 'edouttime'), ('admission_type', 'admission_location'), ('admission_type', 'marital_status'), ('admission_type', 'discharge_location'), ('admission_type', 'dob'), ('admission_location', 'dbsource'), ('admission_location', 'edouttime'), ('discharge_location', 'dischtime'), ('insurance', 'marital_status'), ('insurance', 'request_mrsa'), ('language', 'dob'), ('language', 'insurance'), ('language', 'edouttime'), ('language', 'subject_id'), ('marital_status', 'dob'), ('ethnicity', 'insurance'), ('ethnicity', 'marital_status'), ('ethnicity', 'admission_location'), ('edregtime', 'admittime'), ('edouttime', 'edregtime'), ('edouttime', 'dischtime'), ('edouttime', 'hadm_id'), ('submit_wardid', 'discharge_wardid'), ('curr_careunit', 'submit_wardid'), ('callout_wardid', 'callout_service'), ('callout_wardid', 'admission_type'), ('callout_service', 'last_careunit'), ('request_tele', 'request_mrsa'), ('request_tele', 'admission_type'), ('request_tele', 'createtime'), ('request_tele', 'updatetime'), ('request_cdiff', 'ethnicity'), ('request_cdiff', 'discharge_location'), ('request_cdiff', 'dod_hosp'), ('request_cdiff', 'first_wardid'), ('request_cdiff', 'callout_wardid'), ('request_mrsa', 'admission_type'), ('request_vre', 'request_mrsa'), ('request_vre', 'discharge_location'), ('request_vre', 'insurance'), ('request_vre', 'ethnicity'), ('request_vre', 'dob'), ('request_vre', 'edouttime'), 
('request_vre', 'subject_id'), ('callout_outcome', 'request_mrsa'), ('callout_outcome', 'createtime'), ('discharge_wardid', 'curr_wardid'), ('discharge_wardid', 'callout_outcome'), ('acknowledge_status', 'deathtime'), ('acknowledge_status', 'request_tele'), ('acknowledge_status', 'last_careunit'), ('createtime', 'updatetime'), ('updatetime', 'acknowledgetime'), ('updatetime', 'outcometime'), ('icustay_id', 'intime'), ('first_careunit', 'first_wardid'), ('last_careunit', 'first_careunit'), ('last_careunit', 'curr_careunit'), ('last_careunit', 'last_wardid'), ('last_careunit', 'request_tele'), ('last_careunit', 'icustay_id'), ('first_wardid', 'last_wardid'), ('last_wardid', 'submit_wardid'), ('intime', 'outtime'), ('outtime', 'createtime')]

In [51]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator

# Build the network from the `edges` list; its parameters are learned in a later cell via fit().
patients_info_model = BayesianNetwork(edges)

In [52]:
len(edges)

85

In [53]:
# Fit the model parameters with a BDeu prior, using the rows labelled up to and including 500
# (`.loc` label slices include the end label).
patients_info_model.fit(patients_df.loc[:500, :], estimator=BayesianEstimator, prior_type="BDeu", n_jobs=8) # default equivalent_sample_size=5

## Generation test

In [54]:
patients_info_model.get_cpds()[0].values.shape

(2, 28, 27, 15, 7)

In [55]:
# patients_info_model.fit_update(patients_df.loc[101:200, list(patients_info_model.nodes)])

In [56]:
# patients_info_model.save(filename="patients_info_model.bif", filetype='bif')

In [57]:
# Draw 10 synthetic rows from the fitted network.
# NOTE(review): no seed is passed, so the sample presumably differs between runs — confirm, and pin a seed if reproducibility matters.
samples = patients_info_model.simulate(n_samples=10)

  0%|          | 0/41 [00:00<?, ?it/s]

  warn(


KeyboardInterrupt: 

In [None]:
samples

Unnamed: 0,ethnicity,curr_wardid,curr_careunit,admission_type,intime,dod_ssn,dbsource,subject_id,createtime,insurance,...,last_wardid,dischtime,callout_service,last_careunit,updatetime,callout_wardid,dob,icustay_id,edouttime,dod_hosp
0,WHITE,40,TSICU,ELECTIVE,4&0&2,Na,carevue,52990,1&4&10,Medicare,...,14,3&4&14,ORTHO,TSICU,1&4&11,31,1&2,284010,3&1&22,Na
1,WHITE,45,MICU,EMERGENCY,3&2&16,Na,metavision,3482,1&2&12,Medicare,...,23,4&4&18,OMED,MICU,1&2&12,48,2&5,232828,1&6&18,Na
2,WHITE,29,MICU,EMERGENCY,2&2&9,2&1,metavision,81096,2&5&15,Private,...,52,2&5&14,MED,MICU,2&5&16,1,1&4,283598,4&6&21,Na
3,WHITE,4,SICU,EMERGENCY,2&2&2,3&1,metavision,70641,2&2&14,Medicare,...,57,3&1&17,MED,SICU,2&4&12,1,3&0,208303,1&0&20,3&2
4,WHITE,18,MICU,EMERGENCY,4&3&2,Na,metavision,30020,4&3&14,Medicare,...,52,4&3&14,MED,MICU,4&3&14,1,2&2,252878,4&6&18,Na
5,WHITE,18,MICU,EMERGENCY,2&2&21,Na,carevue,9216,2&3&9,Medicare,...,52,2&1&15,MED,MICU,2&3&9,1,2&3,216906,2&2&22,Na
6,BLACK/AFRICAN AMERICAN,23,MICU,EMERGENCY,2&5&12,Na,metavision,65401,3&6&9,Private,...,52,2&4&15,MED,MICU,3&6&11,1,2&2,250355,Na,Na
7,WHITE,31,SICU,ELECTIVE,2&3&22,Na,metavision,47747,2&2&14,Medicare,...,33,1&0&17,MED,SICU,2&4&12,2,4&5,229034,Na,Na
8,BLACK/AFRICAN AMERICAN,8,CSRU,EMERGENCY,1&0&17,Na,carevue,17621,2&5&8,Private,...,15,2&4&12,CSURG,CCU,2&5&8,55,4&0,299479,2&0&17,Na
9,WHITE,8,MICU,EMERGENCY,3&2&16,3&1,carevue,10424,4&3&14,Medicare,...,57,3&5&8,MED,SICU,4&3&14,1,3&2,232828,3&4&17,3&5


In [None]:
# my_model.fit(callout_sample_df, estimator=BayesianEstimator, prior_type="BDeu") # default equivalent_sample_size=5
# for cpd in patients_info_model.get_cpds():
#     print(cpd)

In [None]:
# patients_info_model.edges