In [1]:
# Silence every warning for the rest of the session. This also hides
# deprecation and library warnings, so re-enable them when debugging.
import warnings
warnings.filterwarnings("ignore")


In [41]:
import pandas as pd
import numpy as np

import commonfunc
from sdv.lite import SingleTablePreset
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CopulaGANSynthesizer
from faker import Faker
import uuid

In [67]:
# Directory that the generated CSV files are written to.
# NOTE(review): hard-coded, machine-specific absolute path — consider making it configurable.
sythetic_sets_path='C:/Users/shrus/Documents/Synthetic-data-generation/generated-synthetic-data/'

## Loading synthesizers

In [5]:
# Project root; the saved synthesizers are loaded from its `models/` sub-folder.
# NOTE(review): hard-coded, machine-specific absolute path — consider making it configurable.
data_set_path='C:/Users/shrus/Documents/Synthetic-data-generation/'

In [6]:
# Load the saved GaussianCopula synthesizer for the patients table.
# NOTE(review): `.load()` presumably unpickles the file — only load trusted model files.
patients_synthesizer = GaussianCopulaSynthesizer.load(data_set_path+'models/patients/patient_best_gaussiancopula.pkl')

In [7]:
# Load the saved SingleTablePreset synthesizer for the procedureevents_mv table.
procedureevents_mv_synthesizer = SingleTablePreset.load(data_set_path+'models/procedureevents_mv/procedureevents_mv_best_tabularpreset.pkl')

In [8]:
# Load the saved SingleTablePreset synthesizer for the prescription table.
prescription_synthesizer = SingleTablePreset.load(data_set_path+'models/prescription/prescription_best_singletablepreset.pkl')

In [9]:
# Load the saved TVAE synthesizer for the outputevents table.
outputevents_synthesizer = TVAESynthesizer.load(data_set_path+'models/outputevents/outputevents_best_tvae.pkl')

In [10]:
# Load the saved SingleTablePreset synthesizer for the microbiologyevents table.
microbiologyevents_synthesizer = SingleTablePreset.load(data_set_path+'models/microbiologyevents/microbiologyevents_best_singletablepreset.pkl')

In [12]:
# Load the saved SingleTablePreset synthesizer for the labevents table.
labevents_synthesizer = SingleTablePreset.load(data_set_path+'models/labevents/labevents_best_singletablepreset.pkl')

In [14]:
# Load the saved SingleTablePreset synthesizer for the cptevents table.
cptevents_synthesizer = SingleTablePreset.load(data_set_path+'models/cptevents/cptevents_best_singletablepreset.pkl')

In [15]:
# Load the saved SingleTablePreset synthesizer for the chartevents table.
chartevents_synthesizer = SingleTablePreset.load(data_set_path+'models/chartevents/chartevents_best_singletablepreset.pkl')

In [16]:
# Load the saved SingleTablePreset synthesizer for the callout table.
callout_synthesizer = SingleTablePreset.load(data_set_path+'models/callout/callout_best_singletablepreset.pkl')

In [17]:
# Load the saved SingleTablePreset synthesizer for the admissions table.
admissions_synthesizer = SingleTablePreset.load(data_set_path+'models/admissions/admissions_best_singletablepreset.pkl')

In [18]:
# Load the saved TVAE synthesizer for the icustays table.
icustays_synthesizer = TVAESynthesizer.load(data_set_path+'models/icustays/icustays_best_tvae.pkl')

## Generating synthetic data

In [19]:
# Sample 100 synthetic patient rows.
patients_df = patients_synthesizer.sample(100)

In [21]:
# Sample 130 synthetic admission rows.
admissions_df = admissions_synthesizer.sample(130)

Sampling rows: 100%|██████████| 130/130 [00:00<00:00, 1939.98it/s]


In [98]:
# Sample 140 synthetic ICU-stay rows.
icustays_df = icustays_synthesizer.sample(140)

Sampling rows: 100%|██████████| 140/140 [00:00<00:00, 1770.59it/s]


In [24]:
# Sample 1000 synthetic procedure-event rows.
procedureevents_df = procedureevents_mv_synthesizer.sample(1000)

Sampling rows: 100%|██████████| 1000/1000 [00:00<00:00, 13143.01it/s]


In [26]:
# Sample 1400 synthetic output-event rows.
outputevents_df = outputevents_synthesizer.sample(1400)

In [25]:
# Sample 1400 synthetic prescription rows.
prescription_df = prescription_synthesizer.sample(1400)

In [27]:
# Sample 50000 synthetic microbiology-event rows.
microbiology_df = microbiologyevents_synthesizer.sample(50000)

Sampling rows: 100%|██████████| 50000/50000 [00:00<00:00, 75845.30it/s]


In [28]:
# Sample 5000 synthetic lab-event rows.
labevents_df = labevents_synthesizer.sample(5000)

Sampling rows: 100%|██████████| 5000/5000 [00:00<00:00, 74620.33it/s]


In [29]:
# Sample 1500 synthetic CPT-event rows.
cptevents_df = cptevents_synthesizer.sample(1500)

Sampling rows: 100%|██████████| 1500/1500 [00:00<00:00, 46657.98it/s]


In [30]:
# Sample 10000 synthetic chart-event rows.
chartevents_df = chartevents_synthesizer.sample(10000)

Sampling rows: 100%|██████████| 10000/10000 [00:00<00:00, 70813.01it/s]


In [31]:
# Sample 90 synthetic callout rows.
callout_df = callout_synthesizer.sample(90)

## Data Postprocessing

### Patients table

In [56]:
# Discard the synthesized `subject_id`; fresh identifiers are assigned further down.
patients_df = patients_df.drop(columns=['subject_id'])

In [57]:
# Shared Faker instance; generate_name uses it to produce fake patient names.
fake = Faker()


In [58]:
def generate_uuid():
    """Return a freshly generated random (version 4) UUID as a string."""
    new_id = uuid.uuid4()
    return str(new_id)


In [59]:
def generate_name(gender):
    """Build a fake "First Last" name for the given gender code.

    Args:
        gender: 'M' selects male name parts; any other value selects
            female name parts.

    Returns:
        The first and last name joined by a single space.
    """
    if gender != 'M':
        first, last = fake.first_name_female(), fake.last_name_female()
    else:
        first, last = fake.first_name_male(), fake.last_name_male()
    return first + ' ' + last

In [60]:
# One fake full name per patient, chosen according to the row's gender.
patients_df['Full Name'] = list(map(generate_name, patients_df['gender']))

In [61]:
# Assign every patient a new UUID string as `subject_id`.
patients_df['subject_id'] = [generate_uuid() for _ in patients_df.index]

In [65]:
# Keep only the output columns, in their final order.
patients_df = patients_df.loc[
    :, ['subject_id', 'Full Name', 'gender', 'dob', 'dod_hosp', 'dod_ssn']
]

In [69]:
# Preview the first rows of the post-processed patients table.
patients_df.head()

Unnamed: 0,subject_id,Full Name,gender,dob,dod_hosp,dod_ssn
0,75fc065b-ca23-43a3-9045-c6ce13375d3a,Ryan Wood,M,2011-01-12,NaT,2154-04-30
1,c46c54e1-ec76-45b7-a79e-69f005773d94,Alexander Wong,M,1979-07-14,NaT,NaT
2,ee7e4c38-245a-44c9-90c3-fa86d5b8569d,Michael Robinson,M,1962-12-23,NaT,NaT
3,5b6229fc-1ea1-4d23-81bb-c00a990c9a32,Christopher Smith,M,1977-05-01,NaT,NaT
4,ba96d0a3-04f0-4b84-81ae-f157e3ba336d,Alicia Clark,F,1934-09-29,NaT,NaT


In [68]:
# Write the patients table to patients.csv in the output directory.
# index=None is falsy, so the row index is presumably omitted — TODO confirm.
patients_df.to_csv(sythetic_sets_path + "patients.csv", index=None)


## Admissions

In [72]:
import random

def add_subject_id(df, subject_ids, live_ids):
    """Choose the patient id for a single admission row.

    Meant to be used with ``DataFrame.apply(..., axis=1)`` on a frame that
    carries the row position in an ``'index'`` column (for example after
    ``reset_index()``); despite its name, ``df`` is one row.

    Args:
        df: Admission row; ``df['index']`` and ``df['deathtime']`` are read.
        subject_ids: Sequence of all candidate patient ids.
        live_ids: Mutable list of ids that have not yet received a death
            admission. It is modified in place: each death row pops one id
            from its end, so no id is handed to two death rows.

    Returns:
        The selected patient id.

    Raises:
        IndexError: A death row is encountered but ``live_ids`` is empty.
    """
    if pd.isna(df['deathtime']):
        # No death recorded: hand out ids in order first, so the first
        # len(subject_ids) rows walk through every id (including the last
        # one), then fall back to a random id for any further rows.
        position = int(df['index'])
        if position < len(subject_ids):
            return subject_ids[position]
        return random.choice(subject_ids)

    # Death recorded: take an id from the live list and remove it, because a
    # patient can die only once.
    # NOTE(review): the popped id stays in `subject_ids`, so the same patient
    # can still be picked for non-death admissions — confirm this is intended.
    if not live_ids:
        raise IndexError("no live subject ids left to assign to a death admission")
    return live_ids.pop()

In [73]:
# `live_ids` is a separate list that add_subject_id pops from for each death
# row, while the list passed as `subject_ids` stays complete.
live_ids = patients_df['subject_id'].tolist()
# reset_index() exposes the row position as an 'index' column, which
# add_subject_id reads; rows are processed one at a time (axis=1).
admissions_df['subject_id'] = admissions_df.reset_index().apply(add_subject_id, args=(patients_df['subject_id'].tolist(), live_ids), axis=1)


In [75]:
# Move `subject_id` to the first column position.
admissions_df.insert(0, 'subject_id', admissions_df.pop('subject_id'))
# The integer cast below is left disabled: the ids come from
# patients_df['subject_id'], which holds UUID strings in this notebook.
# admissions_df['subject_id'] = admissions_df['subject_id'].astype(int)

In [76]:
# Inner-join with the patients' (subject_id, dob) columns: only admissions
# whose subject_id exists in patients_df are kept, and `dob` is added.
admissions_df = pd.merge(patients_df.loc[:, ['subject_id', 'dob']], admissions_df, how='inner', on=['subject_id'])


In [77]:
# `dob` was only carried along by the merge; remove it again.
admissions_df = admissions_df.drop(columns=['dob'])


In [80]:
# Expose `deathtime` under the column name 'Death flag'.
admissions_df = admissions_df.rename(columns={'deathtime': 'Death flag'})


In [81]:
# Write the admissions table to admissions.csv in the output directory.
admissions_df.to_csv(sythetic_sets_path + "admissions.csv", index=None)


In [82]:
# Key columns of every admission; the event tables below draw their keys from here.
hadm_ids_df = admissions_df[['subject_id', 'hadm_id', 'admittime']]

## Callout

In [83]:
# Give every callout row admission keys: draw len(callout_df) rows from
# hadm_ids_df with replacement and put them side by side with callout_df.
# No random_state is passed, so the pairing is not reproducible unless a
# global NumPy seed is set elsewhere.
# Assumes callout_df has a default 0..n-1 index so the columns align — TODO confirm.
callout_df = pd.concat([hadm_ids_df.sample(n=len(callout_df), replace=True, axis=0).reset_index(drop=True), callout_df], axis=1)


In [84]:
# `admittime` came along with the sampled admission keys; it is not part of the output.
callout_df = callout_df.drop(columns=['admittime'])


In [85]:
# Write the callout table to callout.csv in the output directory.
callout_df.to_csv(sythetic_sets_path + "callout.csv", index=None)


## ICUstays

In [99]:
# Give every ICU stay admission keys: draw len(icustays_df) rows from
# hadm_ids_df with replacement and put them side by side with icustays_df.
# Assumes icustays_df has a default 0..n-1 index so the columns align — TODO confirm.
icustays_df = pd.concat([hadm_ids_df.sample(n=len(icustays_df), replace=True, axis=0).reset_index(drop=True), icustays_df], axis=1)


In [100]:

# `admittime` came along with the sampled admission keys; it is not part of the output.
icustays_df = icustays_df.drop(columns=['admittime'])

In [101]:
# Assign every ICU stay a new UUID string as `icustay_id`.
icustays_df['icustay_id'] = [generate_uuid() for _ in icustays_df.index]

In [106]:
# Keep only the output columns, in their final order.
icustays_df = icustays_df.loc[
    :,
    [
        'subject_id', 'hadm_id', 'icustay_id', 'dbsource',
        'first_careunit', 'last_careunit',
        'first_wardid', 'last_wardid', 'intime', 'outtime',
    ],
]

In [107]:
# Write the ICU-stays table to icustays.csv in the output directory.
icustays_df.to_csv(sythetic_sets_path + "icustays.csv", index=None)


In [108]:
# Key columns ('subject_id', 'hadm_id', 'icustay_id', 'intime') of every ICU
# stay; tables keyed by ICU stay sample their keys from this frame later on.
icustays_ids_df = icustays_df[['subject_id', 'hadm_id', 'icustay_id', 'intime']]

## Chartevents

In [109]:
def is_number(s):
    """Report whether ``s`` can be interpreted as a number.

    A value counts as numeric when ``float(s)`` succeeds or, failing that,
    when ``unicodedata.numeric(s)`` accepts it (a single numeric character
    such as '½').

    Args:
        s: Value to test, typically a string or a number.

    Returns:
        True if either conversion succeeds, otherwise False. Inputs that
        neither conversion can handle (for example None) yield False
        instead of raising.
    """
    import unicodedata

    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # TypeError covers inputs float() cannot accept at all, such as None.
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass

    return False

def format_numeric_values(df):
    """Round a row's ``'value'`` entry to two decimals when it is numeric.

    Meant for ``DataFrame.apply(..., axis=1)``; despite its name, ``df`` is
    a single row.

    Args:
        df: Row (or mapping) providing a ``'value'`` entry.

    Returns:
        The value as a float rounded to two places when ``is_number``
        accepts it; otherwise the original value unchanged.
    """
    raw = df['value']
    if not is_number(raw):
        # Non-numeric content (plain strings) passes through untouched.
        return raw
    try:
        number = float(raw)
    except (TypeError, ValueError):
        # is_number also accepts single numeric characters (e.g. '½') that
        # float() rejects; convert those the way is_number recognised them.
        import unicodedata
        number = unicodedata.numeric(raw)
    return round(number, 2)

In [110]:
# Apply format_numeric_values row by row (axis=1) and store the result back in 'value'.
chartevents_df['value'] = chartevents_df.apply(format_numeric_values, axis=1)


In [111]:
# Give every chart event ICU-stay keys: draw len(chartevents_df) rows from
# icustays_ids_df with replacement and put them side by side with chartevents_df.
# Assumes chartevents_df has a default 0..n-1 index so the columns align — TODO confirm.
chartevents_df = pd.concat([icustays_ids_df.sample(n=len(chartevents_df), replace=True, axis=0).reset_index(drop=True), chartevents_df], axis=1)


In [112]:
# `intime` came along with the sampled ICU-stay keys; it is not part of the output.
chartevents_df = chartevents_df.drop(columns=['intime'])


In [113]:
# Write the chart-events table to chartevents.csv in the output directory.
chartevents_df.to_csv(sythetic_sets_path + "chartevents.csv", index=None)


## CPTevents

In [115]:
# Split the CPT events by cost center ('ICU' vs. 'Resp').
cptevents_icu_df = cptevents_df[cptevents_df['costcenter'] == 'ICU']
cptevents_resp_df = cptevents_df[cptevents_df['costcenter'] == 'Resp']

In [116]:
# Randomly reserve 80% of the admissions for ICU CPT events ...
icu_hadm_ids_df = hadm_ids_df.sample(frac=0.8, axis=0)
# ... and leave the remaining admissions for the Resp CPT events.
resp_hadm_ids_df = hadm_ids_df.drop(index=icu_hadm_ids_df.index)

In [117]:
# Sanity check: number of admissions available for the split above.
len(hadm_ids_df)


130

In [118]:
# Give each CPT event admission keys drawn (with replacement) from its own
# admission pool. Both sides get a fresh 0..n-1 index so the column-wise
# concat lines up row by row.
# NOTE(review): sampling n > 0 rows from an empty pool raises, so this
# presumably relies on both pools being non-empty — TODO confirm.
cptevents_icu_df = pd.concat([icu_hadm_ids_df.sample(n=len(cptevents_icu_df), replace=True, axis=0).reset_index(drop=True), cptevents_icu_df.reset_index(drop=True)], axis=1)
cptevents_resp_df = pd.concat([resp_hadm_ids_df.sample(n=len(cptevents_resp_df), replace=True, axis=0).reset_index(drop=True), cptevents_resp_df.reset_index(drop=True)], axis=1)


In [119]:
# Stack the ICU and Resp parts back into one table with a fresh index.
# Rows whose costcenter was neither 'ICU' nor 'Resp' are no longer present.
cptevents_df = pd.concat([cptevents_icu_df, cptevents_resp_df], axis=0, ignore_index=True)


In [120]:
# Order the events by patient and admission, then drop the helper `admittime` column.
cptevents_df = cptevents_df.sort_values(by=['subject_id', 'hadm_id'])
cptevents_df = cptevents_df.drop(columns=['admittime'])


In [122]:
# Write the CPT-events table to cptevents.csv in the output directory.
cptevents_df.to_csv(sythetic_sets_path + "cptevents.csv", index=None)


## Labevents

In [123]:
# Format the lab events' own 'value' column row by row (axis=1) with
# format_numeric_values. The source frame must be labevents_df itself;
# taking the values from another table would overwrite the lab results
# with unrelated, index-aligned data.
labevents_df['value'] = labevents_df.apply(format_numeric_values, axis=1)


In [124]:
# Give every lab event admission keys: draw len(labevents_df) rows from
# hadm_ids_df with replacement and put them side by side with labevents_df.
# Assumes labevents_df has a default 0..n-1 index so the columns align — TODO confirm.
labevents_df = pd.concat([hadm_ids_df.sample(n=len(labevents_df), replace=True, axis=0).reset_index(drop=True), labevents_df], axis=1)


In [126]:
# `admittime` came along with the sampled admission keys; it is not part of the output.
labevents_df = labevents_df.drop(columns=['admittime'])


In [127]:
# Write the lab-events table to labevents_df.csv in the output directory.
# NOTE(review): the `_df` suffix in the file name differs from e.g. "patients.csv".
labevents_df.to_csv(sythetic_sets_path + "labevents_df.csv", index=None)


## Microbiologyevents

In [134]:
# Give every microbiology event admission keys: draw len(microbiology_df) rows
# from hadm_ids_df with replacement and put them side by side with microbiology_df.
# Assumes microbiology_df has a default 0..n-1 index so the columns align — TODO confirm.
microbiology_df = pd.concat([hadm_ids_df.sample(n=len(microbiology_df), replace=True, axis=0).reset_index(drop=True), microbiology_df], axis=1)


In [136]:
# `admittime` came along with the sampled admission keys; it is not part of the output.
microbiology_df = microbiology_df.drop(columns=['admittime'])


In [137]:
# Write the microbiology-events table to microbiologyevents_df.csv in the output directory.
# NOTE(review): the `_df` suffix in the file name differs from e.g. "patients.csv".
microbiology_df.to_csv(sythetic_sets_path + "microbiologyevents_df.csv", index=None)


## Outputevents

In [139]:
# Apply format_numeric_values row by row (axis=1) and store the result back in 'value'.
outputevents_df['value'] = outputevents_df.apply(format_numeric_values, axis=1)


In [140]:
# Give every output event admission keys: draw len(outputevents_df) rows from
# hadm_ids_df with replacement and put them side by side with outputevents_df.
# Assumes outputevents_df has a default 0..n-1 index so the columns align — TODO confirm.
outputevents_df = pd.concat([hadm_ids_df.sample(n=len(outputevents_df), replace=True, axis=0).reset_index(drop=True), outputevents_df], axis=1)


In [141]:
# `admittime` came along with the sampled admission keys; it is not part of the output.
outputevents_df = outputevents_df.drop(columns=['admittime'])


In [142]:
# Write the output-events table to outputevents_df.csv in the output directory.
# NOTE(review): the `_df` suffix in the file name differs from e.g. "patients.csv".
outputevents_df.to_csv(sythetic_sets_path + "outputevents_df.csv", index=None)


## Procedureevents

In [145]:
# Apply format_numeric_values row by row (axis=1) and store the result back in 'value'.
procedureevents_df['value'] = procedureevents_df.apply(format_numeric_values, axis=1)


In [146]:
# Give every procedure event admission keys: draw len(procedureevents_df)
# rows from hadm_ids_df with replacement and put them side by side with the
# procedure events themselves. The right-hand frame must be
# procedureevents_df; concatenating a different table here would replace the
# procedure data and mismatch the row counts.
# Assumes procedureevents_df has a default 0..n-1 index so the columns align — TODO confirm.
procedureevents_df = pd.concat([hadm_ids_df.sample(n=len(procedureevents_df), replace=True, axis=0).reset_index(drop=True), procedureevents_df], axis=1)


In [147]:
# `admittime` came along with the sampled admission keys; it is not part of the output.
procedureevents_df = procedureevents_df.drop(columns=['admittime'])


In [148]:
# Write the procedure-events table to procedureevents_df.csv in the output directory.
# NOTE(review): the `_df` suffix in the file name differs from e.g. "patients.csv".
procedureevents_df.to_csv(sythetic_sets_path + "procedureevents_df.csv", index=None)


## Prescription

In [150]:
# Give every prescription ICU-stay keys: draw len(prescription_df) rows from
# icustays_ids_df with replacement and put them side by side with prescription_df.
# Assumes prescription_df has a default 0..n-1 index so the columns align — TODO confirm.
prescription_df = pd.concat([icustays_ids_df.sample(n=len(prescription_df), replace=True, axis=0).reset_index(drop=True), prescription_df], axis=1)


In [153]:
# `intime` came along with the sampled ICU-stay keys; it is not part of the output.
prescription_df = prescription_df.drop(columns=['intime'])

In [155]:
# Write the prescription table to prescription.csv in the output directory.
prescription_df.to_csv(sythetic_sets_path+"prescription.csv",index=None)