In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# ! pip install pgmpy

In [1]:
import pandas as pd
import numpy as np

## Read Tempdatasets

In [2]:
# read csv file as dataframe, and drop ROW_ID column
def read_csv_no_rowid(file_path):
    df = pd.read_csv(file_path)
    df.drop(["Unnamed: 0", 'row_id'], axis=1, inplace=True)

    return df

In [3]:
# check NaN value

def nan_count(df):
    print("Total columns: " + str(len(df.columns)))
    print("Total rows: " + str(len(df)))
    print("--------------")
    print(df.isnull().sum())

In [4]:
nan_time = pd.to_datetime("2200-01-01 00:00:00")

### Patients

In [5]:
random_patients_df = read_csv_no_rowid("../temp_sets/patients.csv")

In [6]:
# Drop useless colums
random_patients_df.drop(['dod', 'expire_flag'], axis=1, inplace=True)

In [7]:
# Transfer some date type

random_patients_df['dob'] = pd.to_datetime(random_patients_df['dob'])
# random_patients_df['DOD'] = pd.to_datetime(random_patients_df['DOD'])
random_patients_df['dod_hosp'] = pd.to_datetime(random_patients_df['dod_hosp'])
random_patients_df['dod_ssn'] = pd.to_datetime(random_patients_df['dod_ssn'])

In [8]:
nan_count(random_patients_df)

Total columns: 5
Total rows: 1000
--------------
subject_id      0
gender          0
dob             0
dod_hosp      774
dod_ssn       702
dtype: int64


In [9]:
random_patients_df['dob'].fillna(value=nan_time, inplace=True)
random_patients_df['dod_hosp'].fillna(value=nan_time, inplace=True)
random_patients_df['dod_ssn'].fillna(value=nan_time, inplace=True)

In [10]:
nan_count(random_patients_df)

Total columns: 5
Total rows: 1000
--------------
subject_id    0
gender        0
dob           0
dod_hosp      0
dod_ssn       0
dtype: int64


### Admissions

Note: the 'deathtime' in admission table cannot be dropped, as this column includes the exact time.

In [11]:
admissions_sample_df = read_csv_no_rowid("../temp_sets/admissions.csv")

In [12]:
admissions_sample_df.drop(['diagnosis'], axis=1, inplace=True)

In [13]:
admissions_sample_df['admittime'] = pd.to_datetime(admissions_sample_df['admittime'])
admissions_sample_df['dischtime'] = pd.to_datetime(admissions_sample_df['dischtime'])
admissions_sample_df['deathtime'] = pd.to_datetime(admissions_sample_df['deathtime'])
admissions_sample_df['edregtime'] = pd.to_datetime(admissions_sample_df['edregtime'])
admissions_sample_df['edouttime'] = pd.to_datetime(admissions_sample_df['edouttime'])

In [14]:
nan_count(admissions_sample_df)

Total columns: 17
Total rows: 1289
--------------
subject_id                 0
hadm_id                    0
admittime                  0
dischtime                  0
deathtime               1159
admission_type             0
admission_location         0
discharge_location         0
insurance                  0
language                 541
religion                   8
marital_status           219
ethnicity                  0
edregtime                604
edouttime                604
hospital_expire_flag       0
has_chartevents_data       0
dtype: int64


In [15]:
admissions_sample_df['deathtime'].fillna(value=nan_time, inplace=True)
admissions_sample_df['language'].fillna(value='unknow', inplace=True)
admissions_sample_df['marital_status'].fillna(value='unknow', inplace=True)
admissions_sample_df['edregtime'].fillna(value=nan_time, inplace=True)
admissions_sample_df['edouttime'].fillna(value=nan_time, inplace=True)
admissions_sample_df['religion'].fillna(value='unknow', inplace=True)

In [16]:
nan_count(admissions_sample_df)

Total columns: 17
Total rows: 1289
--------------
subject_id              0
hadm_id                 0
admittime               0
dischtime               0
deathtime               0
admission_type          0
admission_location      0
discharge_location      0
insurance               0
language                0
religion                0
marital_status          0
ethnicity               0
edregtime               0
edouttime               0
hospital_expire_flag    0
has_chartevents_data    0
dtype: int64


### Callout

In [17]:
callout_sample_df = read_csv_no_rowid("../temp_sets/callout.csv")

In [18]:
callout_sample_df.drop(['submit_careunit', 'firstreservationtime', 'currentreservationtime'], axis=1, inplace=True)

In [19]:
callout_sample_df['acknowledgetime'] = pd.to_datetime(callout_sample_df['acknowledgetime'])

In [20]:
callout_sample_df.dtypes

subject_id                     int64
hadm_id                        int64
submit_wardid                  int64
curr_wardid                    int64
curr_careunit                 object
callout_wardid                 int64
callout_service               object
request_tele                   int64
request_resp                   int64
request_cdiff                  int64
request_mrsa                   int64
request_vre                    int64
callout_status                object
callout_outcome               object
discharge_wardid             float64
acknowledge_status            object
createtime                    object
updatetime                    object
acknowledgetime       datetime64[ns]
outcometime                   object
dtype: object

In [21]:
nan_count(callout_sample_df)

Total columns: 20
Total rows: 807
--------------
subject_id              0
hadm_id                 0
submit_wardid           0
curr_wardid             0
curr_careunit           0
callout_wardid          0
callout_service         0
request_tele            0
request_resp            0
request_cdiff           0
request_mrsa            0
request_vre             0
callout_status          0
callout_outcome         0
discharge_wardid      119
acknowledge_status      0
createtime              0
updatetime              0
acknowledgetime        35
outcometime             0
dtype: int64


In [22]:
callout_sample_df['discharge_wardid'].fillna(value=100, inplace=True)
callout_sample_df['acknowledgetime'].fillna(value=nan_time, inplace=True)

In [23]:
nan_count(callout_sample_df)

Total columns: 20
Total rows: 807
--------------
subject_id            0
hadm_id               0
submit_wardid         0
curr_wardid           0
curr_careunit         0
callout_wardid        0
callout_service       0
request_tele          0
request_resp          0
request_cdiff         0
request_mrsa          0
request_vre           0
callout_status        0
callout_outcome       0
discharge_wardid      0
acknowledge_status    0
createtime            0
updatetime            0
acknowledgetime       0
outcometime           0
dtype: int64


### ICUstays

In [24]:
icustays_sample_df = read_csv_no_rowid("../temp_sets/icustays.csv")

In [25]:
icustays_sample_df['intime'] = pd.to_datetime(icustays_sample_df['intime'])
icustays_sample_df['outtime'] = pd.to_datetime(icustays_sample_df['outtime'])

In [26]:
nan_count(icustays_sample_df)

Total columns: 11
Total rows: 1342
--------------
subject_id        0
hadm_id           0
icustay_id        0
dbsource          0
first_careunit    0
last_careunit     0
first_wardid      0
last_wardid       0
intime            0
outtime           0
los               0
dtype: int64


---

## Combine samples

In [27]:
patients_df = pd.merge(random_patients_df, admissions_sample_df, on=['subject_id'], how='inner')

In [28]:
nan_count(patients_df)

Total columns: 21
Total rows: 1289
--------------
subject_id              0
gender                  0
dob                     0
dod_hosp                0
dod_ssn                 0
hadm_id                 0
admittime               0
dischtime               0
deathtime               0
admission_type          0
admission_location      0
discharge_location      0
insurance               0
language                0
religion                0
marital_status          0
ethnicity               0
edregtime               0
edouttime               0
hospital_expire_flag    0
has_chartevents_data    0
dtype: int64


In [29]:
patients_df = pd.merge(patients_df, callout_sample_df, on=['subject_id', 'hadm_id'], how='inner')

In [30]:
nan_count(patients_df)

Total columns: 39
Total rows: 807
--------------
subject_id              0
gender                  0
dob                     0
dod_hosp                0
dod_ssn                 0
hadm_id                 0
admittime               0
dischtime               0
deathtime               0
admission_type          0
admission_location      0
discharge_location      0
insurance               0
language                0
religion                0
marital_status          0
ethnicity               0
edregtime               0
edouttime               0
hospital_expire_flag    0
has_chartevents_data    0
submit_wardid           0
curr_wardid             0
curr_careunit           0
callout_wardid          0
callout_service         0
request_tele            0
request_resp            0
request_cdiff           0
request_mrsa            0
request_vre             0
callout_status          0
callout_outcome         0
discharge_wardid        0
acknowledge_status      0
createtime              0
updatetime     

In [31]:
patients_df = pd.merge(patients_df, icustays_sample_df.drop(['icustay_id', 'dbsource', 'first_careunit', 'last_careunit', 'first_wardid', \
                                                'last_wardid', 'intime', 'outtime', 'los'], axis=1), on=['subject_id', 'hadm_id'], how='inner')

In [32]:
nan_count(patients_df)

Total columns: 39
Total rows: 992
--------------
subject_id              0
gender                  0
dob                     0
dod_hosp                0
dod_ssn                 0
hadm_id                 0
admittime               0
dischtime               0
deathtime               0
admission_type          0
admission_location      0
discharge_location      0
insurance               0
language                0
religion                0
marital_status          0
ethnicity               0
edregtime               0
edouttime               0
hospital_expire_flag    0
has_chartevents_data    0
submit_wardid           0
curr_wardid             0
curr_careunit           0
callout_wardid          0
callout_service         0
request_tele            0
request_resp            0
request_cdiff           0
request_mrsa            0
request_vre             0
callout_status          0
callout_outcome         0
discharge_wardid        0
acknowledge_status      0
createtime              0
updatetime     

In [33]:
# patients_df = pd.merge(patients_df, services_sample_df, on=['subject_id', 'hadm_id'], how='outer')
# patients_df = pd.merge(patients_df, transfers_sample_df.drop(['icustay_id', 'dbsource', 'curr_careunit', 'curr_wardid', \
#                                                 'intime', 'outtime', 'los'], axis=1), on=['subject_id', 'hadm_id'], how='outer')
# patients_df.fillna(value="Na", inplace=True)

In [34]:
nan_count(patients_df)

Total columns: 39
Total rows: 992
--------------
subject_id              0
gender                  0
dob                     0
dod_hosp                0
dod_ssn                 0
hadm_id                 0
admittime               0
dischtime               0
deathtime               0
admission_type          0
admission_location      0
discharge_location      0
insurance               0
language                0
religion                0
marital_status          0
ethnicity               0
edregtime               0
edouttime               0
hospital_expire_flag    0
has_chartevents_data    0
submit_wardid           0
curr_wardid             0
curr_careunit           0
callout_wardid          0
callout_service         0
request_tele            0
request_resp            0
request_cdiff           0
request_mrsa            0
request_vre             0
callout_status          0
callout_outcome         0
discharge_wardid        0
acknowledge_status      0
createtime              0
updatetime     

In [35]:
patients_df.to_csv("patients_info.csv")

---

##  Build Network

In [36]:
# test_df.fillna(value="Na", inplace=True)

In [37]:
# from pgmpy.estimators import HillClimbSearch

# hc = HillClimbSearch(patients_df.loc[0:30, :])
# best_model = hc.estimate()
# print(best_model.edges())

[('subject_id', 'callout_service'), ('subject_id', 'religion'), ('subject_id', 'curr_wardid'), ('subject_id', 'dob'), ('dob', 'callout_wardid'), ('hadm_id', 'discharge_wardid'), ('hadm_id', 'dod_ssn'), ('admittime', 'marital_status'), ('admittime', 'deathtime'), ('deathtime', 'submit_wardid'), ('deathtime', 'ethnicity'), ('deathtime', 'hadm_id'), ('deathtime', 'dischtime'), ('marital_status', 'edregtime'), ('ethnicity', 'request_tele'), ('edregtime', 'dod_hosp'), ('edregtime', 'subject_id'), ('callout_wardid', 'updatetime'), ('request_tele', 'curr_careunit'), ('request_tele', 'edouttime'), ('createtime', 'outcometime'), ('updatetime', 'acknowledgetime'), ('updatetime', 'createtime')]

[('subject_id', 'marital_status'), ('subject_id', 'dob'), ('gender', 'insurance'), ('gender', 'admittime'), ('dob', 'request_tele'), ('dob', 'dod_ssn'), ('hadm_id', 'curr_wardid'), ('hadm_id', 'dod_hosp'), ('admittime', 'religion'), ('admittime', 'discharge_wardid'), ('admittime', 'hadm_id'), ('dischtime', 'callout_service'), ('deathtime', 'submit_wardid'), ('deathtime', 'callout_wardid'), ('admission_location', 'gender'), ('ethnicity', 'curr_careunit'), ('edregtime', 'ethnicity'), ('edregtime', 'dischtime'), ('curr_wardid', 'edouttime'), ('callout_wardid', 'updatetime'), ('callout_service', 'subject_id'), ('discharge_wardid', 'deathtime'), ('discharge_wardid', 'edregtime'), ('createtime', 'outcometime'), ('updatetime', 'createtime'), ('outcometime', 'acknowledgetime')]

In [38]:
from pgmpy.estimators import MmhcEstimator
from pgmpy.estimators import BDeuScore
from pgmpy.estimators import HillClimbSearch 

In [39]:
patients_df.columns

Index(['subject_id', 'gender', 'dob', 'dod_hosp', 'dod_ssn', 'hadm_id',
       'admittime', 'dischtime', 'deathtime', 'admission_type',
       'admission_location', 'discharge_location', 'insurance', 'language',
       'religion', 'marital_status', 'ethnicity', 'edregtime', 'edouttime',
       'hospital_expire_flag', 'has_chartevents_data', 'submit_wardid',
       'curr_wardid', 'curr_careunit', 'callout_wardid', 'callout_service',
       'request_tele', 'request_resp', 'request_cdiff', 'request_mrsa',
       'request_vre', 'callout_status', 'callout_outcome', 'discharge_wardid',
       'acknowledge_status', 'createtime', 'updatetime', 'acknowledgetime',
       'outcometime'],
      dtype='object')

In [40]:
mmhc = MmhcEstimator(patients_df.loc[0:30, :])
skeleton = mmhc.mmpc()
print("Part 1) Skeleton: ", skeleton.edges())

KeyboardInterrupt: 

In [None]:
hc = HillClimbSearch(patients_df.loc[0:30, :])
model = hc.estimate(tabu_length=10, white_list=skeleton.to_directed().edges())
print("Part 2) Model:    ", model.edges())

  0%|          | 0/1000000 [00:00<?, ?it/s]

Part 2) Model:     []


## Generation test

In [None]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.sampling import BayesianModelSampling

In [None]:
my_model = BayesianNetwork(model.edges())

In [None]:
my_model.fit(patients_df, estimator=BayesianEstimator, prior_type="BDeu") # default equivalent_sample_size=5

ValueError: variable names of the model must be identical to column names in data

In [None]:
# my_model.fit(callout_sample_df, estimator=BayesianEstimator, prior_type="BDeu") # default equivalent_sample_size=5
# for cpd in my_model.get_cpds():
#     print(cpd)

+-------------------+-----+------------------------+
| callout_service   | ... | callout_service(VSURG) |
+-------------------+-----+------------------------+
| subject_id(68)    | ... | 3.9256328120092984e-05 |
+-------------------+-----+------------------------+
| subject_id(145)   | ... | 3.9256328120092984e-05 |
+-------------------+-----+------------------------+
| subject_id(969)   | ... | 3.9256328120092984e-05 |
+-------------------+-----+------------------------+
| subject_id(998)   | ... | 3.9256328120092984e-05 |
+-------------------+-----+------------------------+
| subject_id(1108)  | ... | 3.9256328120092984e-05 |
+-------------------+-----+------------------------+
| subject_id(1286)  | ... | 3.9256328120092984e-05 |
+-------------------+-----+------------------------+
| subject_id(1354)  | ... | 3.9256328120092984e-05 |
+-------------------+-----+------------------------+
| subject_id(1404)  | ... | 3.9256328120092984e-05 |
+-------------------+-----+-------------------

In [None]:
from pgmpy.sampling import BayesianModelSampling

In [None]:
samples = BayesianModelSampling(my_model).forward_sample(size=int(500))
samples.head()

  0%|          | 0/13 [00:00<?, ?it/s]

  warn(


Unnamed: 0,subject_id,hadm_id,createtime,submit_wardid,curr_careunit,request_tele,curr_wardid,discharge_wardid,callout_wardid,callout_service,outcometime,updatetime,acknowledgetime
0,47715,170400,2121-03-09 10:48:49,23,MICU,1,17,9.0,17,CCU,2121-03-09 17:40:30,2121-03-09 10:48:49,2121-03-09 10:58:24
1,7715,192622,2111-09-16 10:25:34,7,CCU,1,45,45.0,1,MED,2111-09-16 16:27:16,2111-09-16 11:21:58,2111-09-16 11:37:10
2,88009,109037,2188-03-16 09:20:55,57,SICU,1,29,0.0,1,MED,2188-03-16 22:40:23,2188-03-16 09:20:55,2188-03-16 09:23:27
3,90508,103295,2195-07-07 12:03:53,50,MICU,1,45,45.0,1,MED,2195-07-07 16:40:22,2195-07-07 12:03:53,2195-07-07 12:37:10
4,32380,159188,2112-03-01 12:02:55,23,MICU,0,17,9.0,1,CMED,2112-03-01 19:27:57,2112-03-01 12:02:55,2112-03-01 13:00:22


In [None]:
# samples.to_csv("samples.csv")

In [None]:
nan_count(samples)

Total columns: 13
Total rows: 100000
--------------
subject_id          0
hadm_id             0
createtime          0
submit_wardid       0
curr_careunit       0
request_tele        0
curr_wardid         0
discharge_wardid    0
callout_wardid      0
callout_service     0
outcometime         0
updatetime          0
acknowledgetime     0
dtype: int64
