In [1]:
import pandas as pd
import numpy as np
import cloudpickle

import common

In [2]:
# Output directory for the generated CSV files. Later cells build file paths with
# plain string concatenation, so the trailing slash is required.
# NOTE(review): the name misspells "synthetic"; it is kept because later cells reference it.
sythetic_sets_path = "../synthetic_sets/"

## Load models

In [3]:
# Directory containing the `*_model.pkl` files. It is joined to file names with
# plain string concatenation, so the trailing slash is required.
model_path = "../temp_sets_100/100_models/"

In [5]:
# Load the patients model from the models directory through the project helper.
patients_model = common.load_model(f"{model_path}patients_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
admissions_model = common.load_model(f"{model_path}admissions_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
callout_model = common.load_model(f"{model_path}callout_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
icustays_model = common.load_model(f"{model_path}icustays_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
chartevents_model = common.load_model(f"{model_path}chartevents_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
cptevents_model = common.load_model(f"{model_path}cptevents_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
labevents_model = common.load_model(f"{model_path}labevents_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
microbiologyevents_model = common.load_model(f"{model_path}microbiologyevents_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
outputevents_model = common.load_model(f"{model_path}outputevents_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
procedureevents_model = common.load_model(f"{model_path}procedureevents_model.pkl")

In [None]:
# Resolve the file under `model_path`, as the patients model load does; a bare
# file name would presumably be looked up relative to the working directory.
# NOTE(review): no later cell in this notebook uses `procedures_icd_model` - confirm it is still needed.
procedures_icd_model = common.load_model(f"{model_path}procedures_icd_model.pkl")

---

## Generate data

In [6]:
# Draw 100 synthetic patient rows; their `subject_id` values are handed out to
# the admissions further down.
# NOTE(review): no random seed is set anywhere above, so reruns presumably differ.
patients_df = patients_model.sample(100)

Sampling rows: 100%|██████████| 100/100 [00:02<00:00, 43.33it/s]


In [116]:
# Draw 130 synthetic admission rows (hard-coded row count).
admissions_df = admissions_model.sample(130)

Sampling rows: 100%|██████████| 130/130 [00:00<00:00, 1315.93it/s]


In [117]:
# Draw 80 synthetic callout rows (hard-coded row count).
callout_df = callout_model.sample(80)

In [118]:
# Draw 140 synthetic ICU-stay rows (hard-coded row count).
icustays_df = icustays_model.sample(140)

Sampling rows: 100%|██████████| 140/140 [00:00<00:00, 2181.93it/s]


In [119]:
# Draw 10000 synthetic chart-event rows (hard-coded row count).
chartevents_df = chartevents_model.sample(10000)

Sampling rows: 100%|██████████| 10000/10000 [00:00<00:00, 14534.23it/s]


In [120]:
# Draw 1500 synthetic CPT-event rows (hard-coded row count).
cptevents_df = cptevents_model.sample(1500)

Sampling rows: 100%|██████████| 1500/1500 [00:00<00:00, 33262.78it/s]


In [155]:
# Draw 5000 synthetic lab-event rows (hard-coded row count).
labevents_df = labevents_model.sample(5000)

Sampling rows: 100%|██████████| 5000/5000 [00:00<00:00, 11460.27it/s]


In [None]:
# Draw 50000 synthetic microbiology-event rows (hard-coded row count).
microbiologyevents_df = microbiologyevents_model.sample(50000)

In [None]:
# Draw 5000 synthetic output-event rows (hard-coded row count).
outputevents_df = outputevents_model.sample(5000)

In [None]:
# Draw 1000 synthetic procedure-event rows (hard-coded row count).
procedureevents_df = procedureevents_model.sample(1000)

In [121]:
import random

def add_subject_id(df, subject_ids, live_ids):
    """Choose a ``subject_id`` for a single admission row.

    Intended for ``DataFrame.apply(..., axis=1)`` on a frame that went through
    ``reset_index()``, so that every row carries its position under ``'index'``.

    Args:
        df: One row (Series or mapping) providing ``'index'`` and ``'deathtime'``.
        subject_ids: Sequence of all candidate subject ids.
        live_ids: Mutable list of ids that have not been given a death record
            yet. One id is popped from its end for every row with a deathtime.

    Returns:
        The subject id chosen for the row.

    Raises:
        IndexError: If the row has a deathtime but ``live_ids`` is empty, or if
            a random pick is needed while ``subject_ids`` is empty.
    """
    # Rows without a death record.
    if pd.isna(df['deathtime']):
        position = df['index']
        # The first len(subject_ids) rows map one-to-one onto the ids, so every
        # id (including the last one) can be reached deterministically.
        if 0 <= position < len(subject_ids):
            return subject_ids[position]
        # Remaining rows are spread randomly over all ids.
        return random.choice(subject_ids)

    # Rows with a death record: a person can only die once, so the id is taken
    # out of the pool of still-living patients.
    if not live_ids:
        raise IndexError(
            "no ids left in live_ids: more rows with a deathtime than available patients"
        )
    return live_ids.pop()

## Post-process data

In [122]:
# Reset time data
def reset_time(df, early_col_name, late_col_name, second_early_col_name=None):
    '''
    Convert the relative offset of one row into an absolute timestamp.

    ``df[late_col_name]`` holds an offset in seconds. It is added to the
    timestamp in ``df[early_col_name]``. When that timestamp is missing, the
    timestamp in ``df[second_early_col_name]`` is used as the base instead.
    The caller stores the returned value back into the late column.

    Args:
        df: One row (Series or mapping), as passed by ``DataFrame.apply(axis=1)``.
        early_col_name: Column with the preferred base timestamp.
        late_col_name: Column with the offset in seconds.
        second_early_col_name: Optional column with a fallback base timestamp.

    Returns:
        The base timestamp plus the offset, or ``pd.NaT`` when the offset is
        missing or no base timestamp is available.
    '''
    offset = df[late_col_name]
    if pd.isna(offset):
        return pd.NaT

    base = df[early_col_name]
    if pd.isna(base):
        # Without a fallback column there is nothing to anchor the offset to.
        if second_early_col_name is None:
            return pd.NaT
        base = df[second_early_col_name]
        if pd.isna(base):
            return pd.NaT

    # int() drops fractional seconds, as the offsets are treated as whole seconds.
    return base + pd.Timedelta(seconds=int(offset))

### Patients

In [123]:
# Write the patients table unchanged; the row index is not written.
patients_df.to_csv(f"{sythetic_sets_path}patients.csv", index=None)

### Admissions

In [124]:
# Pool of patients without a death record so far. `add_subject_id` pops an id
# from it whenever an admission row carries a deathtime, so one patient gets at
# most one death record. A separate copy of the full id list is passed as the
# candidate list and is left untouched.
live_ids = patients_df['subject_id'].tolist()
admissions_df['subject_id'] = (
    admissions_df
    .reset_index()
    .apply(add_subject_id, axis=1, args=(list(live_ids), live_ids))
)

In [125]:
# Move `subject_id` to the first column and store it as an integer in one step.
admissions_df.insert(0, 'subject_id', admissions_df.pop('subject_id').astype(int))

In [126]:
# Attach each patient's `dob` to the admissions; `dob` is the base timestamp for
# `admittime` in the next step.
admissions_df = patients_df[['subject_id', 'dob']].merge(
    admissions_df,
    how='inner',
    on='subject_id',
)

In [127]:
# Turn the offset columns into absolute timestamps. Order matters: each step
# uses a base column that an earlier step has already converted.
# NOTE(review): `deathtime` is not converted here - confirm whether it should be.
for base_col, offset_col in (
    ('dob', 'admittime'),
    ('admittime', 'dischtime'),
    ('admittime', 'edregtime'),
    ('edregtime', 'edouttime'),
):
    admissions_df[offset_col] = admissions_df.apply(reset_time, args=(base_col, offset_col), axis=1)

In [128]:
# `dob` was only needed as the base for `admittime`; remove it before saving.
admissions_df.drop(columns=['dob'], inplace=True)

In [129]:
# Write the finished admissions table; the row index is not written.
admissions_df.to_csv(f"{sythetic_sets_path}admissions.csv", index=None)

In [130]:
# Lookup table of admission keys plus `admittime`. The dependent tables below
# draw random rows from it to obtain their ids and their base timestamp.
hadm_ids_df = admissions_df[['subject_id', 'hadm_id', 'admittime']]

### Callout

In [131]:
# Preview the sampled callout rows. The *time columns are still numeric here and
# are converted to timestamps a few cells below.
callout_df.head()

Unnamed: 0,submit_wardid,curr_wardid,curr_careunit,callout_wardid,callout_service,request_tele,request_resp,request_cdiff,request_mrsa,request_vre,callout_status,callout_outcome,discharge_wardid,acknowledge_status,createtime,updatetime,acknowledgetime,outcometime
0,15,48,CCU,40,MED,0,0,0,1,0,Inactive,Cancelled,30.0,Revised,1214044.0,200571.0,2.0,36038.0
1,7,20,MICU,52,CSURG,1,0,0,0,0,Inactive,Discharged,0.0,Acknowledged,2109074.0,8668.0,6790.0,12218.0
2,45,28,TSICU,55,ORTHO,0,0,1,0,0,Inactive,Cancelled,31.0,Acknowledged,31877.0,106981.0,,21913.0
3,11,24,MICU,1,MED,1,0,0,1,0,Inactive,Cancelled,,Revised,420696.0,15171.0,6380.0,9841.0
4,7,16,MICU,55,SURG,1,0,0,0,0,Inactive,Discharged,55.0,Acknowledged,709173.0,10746.0,,19413.0


In [132]:
# Give every callout row the ids (and `admittime`) of a randomly drawn admission.
# Sampling is with replacement; the index is reset so the columns line up by position.
callout_df = pd.concat(
    [
        hadm_ids_df.sample(n=len(callout_df), replace=True, axis="index").reset_index(drop=True),
        callout_df,
    ],
    axis="columns",
)

In [133]:
# Chain the callout offsets into absolute timestamps. Each step builds on the
# column converted just before it.
for base_col, offset_col in (
    ('admittime', 'createtime'),
    ('createtime', 'updatetime'),
    ('updatetime', 'acknowledgetime'),
):
    callout_df[offset_col] = callout_df.apply(reset_time, args=(base_col, offset_col), axis=1)
# `acknowledgetime` may be missing, so `updatetime` is passed as the fallback base.
callout_df['outcometime'] = callout_df.apply(
    reset_time, args=('acknowledgetime', 'outcometime', 'updatetime'), axis=1
)

In [134]:
# `admittime` only served as the base timestamp; remove it before saving.
callout_df.drop(columns=['admittime'], inplace=True)

In [135]:
# Write the finished callout table; the row index is not written.
callout_df.to_csv(f"{sythetic_sets_path}callout.csv", index=None)

### Icustays

In [136]:
# Give every ICU stay the ids (and `admittime`) of a randomly drawn admission.
# Sampling is with replacement; the index is reset so the columns line up by position.
icustays_df = pd.concat(
    [
        hadm_ids_df.sample(n=len(icustays_df), replace=True, axis="index").reset_index(drop=True),
        icustays_df,
    ],
    axis="columns",
)

In [137]:
# `intime` is anchored on the admission time, then `outtime` on the converted `intime`.
for base_col, offset_col in (('admittime', 'intime'), ('intime', 'outtime')):
    icustays_df[offset_col] = icustays_df.apply(reset_time, args=(base_col, offset_col), axis=1)

In [138]:
# `admittime` only served as the base timestamp; remove it before saving.
icustays_df.drop(columns=['admittime'], inplace=True)

In [139]:
# Write the finished icustays table; the row index is not written.
icustays_df.to_csv(f"{sythetic_sets_path}icustays.csv", index=None)

In [140]:
# Lookup table of ICU-stay keys plus `intime`. The chart events below draw
# random rows from it to obtain their ids and their base timestamp.
icustays_ids_df = icustays_df[['subject_id', 'hadm_id', 'icustay_id', 'intime']]

### Chartevents

In [141]:
def is_number(s):
    """Tell whether ``s`` can be read as a number.

    A value counts as a number when ``float(s)`` accepts it or, failing that,
    when it is a single character with a Unicode numeric value (for example
    '½'). Unsupported types such as ``None`` yield ``False`` instead of raising.

    Args:
        s: The value to inspect.

    Returns:
        ``True`` if ``s`` is numeric in the sense above, else ``False``.
    """
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # TypeError covers inputs float() cannot take at all, e.g. None or a list.
        pass

    try:
        import unicodedata
        # Raises TypeError unless `s` is a one-character string, and
        # ValueError when that character has no numeric value.
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass

    return False

def format_numeric_values(df):
    """Round a row's ``'value'`` entry to two decimals when it is numeric.

    Intended for ``DataFrame.apply(..., axis=1)``. The conversion is attempted
    directly, so a value is only treated as numeric if it can actually be
    turned into a float. Single Unicode numeric characters that ``float()``
    rejects (for example '½') are converted through ``unicodedata.numeric``.

    Args:
        df: One row (Series or mapping) with a ``'value'`` entry.

    Returns:
        The value as a float rounded to two decimals, or the original value
        unchanged when it is not numeric (including ``None``).
    """
    value = df['value']

    # Numeric value
    try:
        return round(float(value), 2)
    except (TypeError, ValueError):
        pass

    try:
        import unicodedata
        return round(unicodedata.numeric(value), 2)
    except (TypeError, ValueError):
        # String (or any other non-numeric value): keep as is
        return value

In [142]:
# Round numeric chart values to two decimals; non-numeric values stay as they are.
chartevents_df = chartevents_df.assign(
    value=chartevents_df.apply(format_numeric_values, axis=1)
)

In [143]:
# Give every chart event the ids (and `intime`) of a randomly drawn ICU stay.
# Sampling is with replacement; the index is reset so the columns line up by position.
chartevents_df = pd.concat(
    [
        icustays_ids_df.sample(n=len(chartevents_df), replace=True, axis="index").reset_index(drop=True),
        chartevents_df,
    ],
    axis="columns",
)

In [144]:
# `charttime` is anchored on the ICU `intime`, then `storetime` on the converted `charttime`.
for base_col, offset_col in (('intime', 'charttime'), ('charttime', 'storetime')):
    chartevents_df[offset_col] = chartevents_df.apply(reset_time, args=(base_col, offset_col), axis=1)

In [145]:
# `intime` only served as the base timestamp; remove it before saving.
chartevents_df.drop(columns=['intime'], inplace=True)

In [146]:
# Write the finished chartevents table; the row index is not written.
chartevents_df.to_csv(f"{sythetic_sets_path}chartevents.csv", index=None)

### CPTevents

In [147]:
# Split the CPT events by cost center. Rows with any other `costcenter` value
# end up in neither frame.
cptevents_icu_df = cptevents_df[cptevents_df['costcenter'].eq('ICU')]
cptevents_resp_df = cptevents_df[cptevents_df['costcenter'].eq('Resp')]

In [148]:
# Reserve a random 80% of the admissions for ICU CPT events; the remaining
# admissions are used for the Resp CPT events.
icu_hadm_ids_df = hadm_ids_df.sample(frac=0.8, axis="index")
resp_hadm_ids_df = hadm_ids_df.loc[~hadm_ids_df.index.isin(icu_hadm_ids_df.index)]

In [149]:
# Sanity check: number of admissions available for the ICU/Resp split above.
len(hadm_ids_df)

130

In [150]:
# Attach randomly drawn admission ids to each subset, using that subset's own
# admission pool. Both sides get a fresh index so the columns line up by position.
cptevents_icu_df = pd.concat(
    [
        icu_hadm_ids_df.sample(n=len(cptevents_icu_df), replace=True, axis="index").reset_index(drop=True),
        cptevents_icu_df.reset_index(drop=True),
    ],
    axis="columns",
)
cptevents_resp_df = pd.concat(
    [
        resp_hadm_ids_df.sample(n=len(cptevents_resp_df), replace=True, axis="index").reset_index(drop=True),
        cptevents_resp_df.reset_index(drop=True),
    ],
    axis="columns",
)

In [151]:
# Stack the ICU and Resp subsets back into one table with a fresh row index.
cptevents_df = pd.concat([cptevents_icu_df, cptevents_resp_df], ignore_index=True)

In [152]:
# Order the rows by patient and admission, then remove the helper `admittime` column.
# NOTE(review): unlike the other tables, no cell in this notebook writes
# `cptevents_df` to CSV - confirm whether a save step is missing.
cptevents_df = (
    cptevents_df
    .sort_values(by=['subject_id', 'hadm_id'])
    .drop(columns=['admittime'])
)

### Labevents

In [156]:
# Round numeric lab values to two decimals; non-numeric values stay as they are.
# The values must come from labevents_df itself. Reading them from
# chartevents_df would overwrite the lab values with unrelated chart values.
labevents_df['value'] = labevents_df.apply(format_numeric_values, axis=1)

In [157]:
# Give every lab event the ids (and `admittime`) of a randomly drawn admission.
# Sampling is with replacement; the index is reset so the columns line up by position.
labevents_df = pd.concat(
    [
        hadm_ids_df.sample(n=len(labevents_df), replace=True, axis="index").reset_index(drop=True),
        labevents_df,
    ],
    axis="columns",
)

In [158]:
# Convert the `charttime` offset into a timestamp anchored on the admission time.
labevents_df['charttime'] = labevents_df.apply(
    lambda row: reset_time(row, 'admittime', 'charttime'),
    axis=1,
)

In [159]:
# `admittime` only served as the base timestamp; remove it before saving.
labevents_df.drop(columns=['admittime'], inplace=True)

In [162]:
# Write the finished labevents table; the row index is not written.
# NOTE(review): the file name carries a `_df` suffix, while the earlier tables are
# saved as `<table>.csv` - confirm which name downstream consumers expect.
labevents_df.to_csv(f"{sythetic_sets_path}labevents_df.csv", index=None)

### Microbiologyevents

In [None]:
# Round numeric values to two decimals; non-numeric values stay as they are.
# NOTE(review): assumes the sampled frame has a `value` column - confirm.
microbiologyevents_df = microbiologyevents_df.assign(
    value=microbiologyevents_df.apply(format_numeric_values, axis=1)
)

In [None]:
# Give every microbiology event the ids (and `admittime`) of a randomly drawn admission.
# Sampling is with replacement; the index is reset so the columns line up by position.
microbiologyevents_df = pd.concat(
    [
        hadm_ids_df.sample(n=len(microbiologyevents_df), replace=True, axis="index").reset_index(drop=True),
        microbiologyevents_df,
    ],
    axis="columns",
)

In [None]:
# Convert the `charttime` offset into a timestamp anchored on the admission time.
microbiologyevents_df['charttime'] = microbiologyevents_df.apply(
    lambda row: reset_time(row, 'admittime', 'charttime'),
    axis=1,
)

In [None]:
# `admittime` only served as the base timestamp; remove it before saving.
microbiologyevents_df.drop(columns=['admittime'], inplace=True)

In [None]:
# Write the finished microbiologyevents table; the row index is not written.
# NOTE(review): the file name carries a `_df` suffix, while the earlier tables are
# saved as `<table>.csv` - confirm which name downstream consumers expect.
microbiologyevents_df.to_csv(f"{sythetic_sets_path}microbiologyevents_df.csv", index=None)

### Outputevents

In [None]:
# Round numeric output values to two decimals; non-numeric values stay as they are.
outputevents_df = outputevents_df.assign(
    value=outputevents_df.apply(format_numeric_values, axis=1)
)

In [None]:
# Give every output event the ids (and `admittime`) of a randomly drawn admission.
# Sampling is with replacement; the index is reset so the columns line up by position.
outputevents_df = pd.concat(
    [
        hadm_ids_df.sample(n=len(outputevents_df), replace=True, axis="index").reset_index(drop=True),
        outputevents_df,
    ],
    axis="columns",
)

In [None]:
# Convert the `charttime` offset into a timestamp anchored on the admission time.
outputevents_df['charttime'] = outputevents_df.apply(
    lambda row: reset_time(row, 'admittime', 'charttime'),
    axis=1,
)

In [None]:
# `admittime` only served as the base timestamp; remove it before saving.
outputevents_df.drop(columns=['admittime'], inplace=True)

In [None]:
# Write the finished outputevents table; the row index is not written.
# NOTE(review): the file name carries a `_df` suffix, while the earlier tables are
# saved as `<table>.csv` - confirm which name downstream consumers expect.
outputevents_df.to_csv(f"{sythetic_sets_path}outputevents_df.csv", index=None)

### Procedureevents

In [None]:
# Round numeric procedure values to two decimals; non-numeric values stay as they are.
procedureevents_df = procedureevents_df.assign(
    value=procedureevents_df.apply(format_numeric_values, axis=1)
)

In [None]:
# Give every procedure event the ids (and `admittime`) of a randomly drawn admission.
# Sampling is with replacement; the index is reset so the columns line up by position.
# The right-hand frame must be procedureevents_df itself. Concatenating
# outputevents_df here would discard the sampled procedure events.
procedureevents_df = pd.concat(
    [
        hadm_ids_df.sample(n=len(procedureevents_df), replace=True, axis=0).reset_index(drop=True),
        procedureevents_df,
    ],
    axis=1,
)

In [None]:
# Convert the `charttime` offset into a timestamp anchored on the admission time.
procedureevents_df['charttime'] = procedureevents_df.apply(
    lambda row: reset_time(row, 'admittime', 'charttime'),
    axis=1,
)

In [None]:
# `admittime` only served as the base timestamp; remove it before saving.
procedureevents_df.drop(columns=['admittime'], inplace=True)

In [None]:
# Write the finished procedureevents table; the row index is not written.
# NOTE(review): the file name carries a `_df` suffix, while the earlier tables are
# saved as `<table>.csv` - confirm which name downstream consumers expect.
procedureevents_df.to_csv(f"{sythetic_sets_path}procedureevents_df.csv", index=None)