In [16]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [17]:
import pandas as pd
import numpy as np

import commonfunc


In [18]:
data_set_path='C:/Users/shrus/Documents/Synthetic-data-generation/'

In [19]:
icustays_df = commonfunc.read_csv_no_rowid(data_set_path+"sampled_data_csv_100/icustays.csv")

In [20]:
icustays_df

Unnamed: 0,subject_id,hadm_id,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,intime,outtime,los
0,87197,176931,209341,metavision,CCU,CCU,7,7,2109-11-16 08:12:18,2109-11-17 17:39:41,1.3940
1,1765,145878,254146,carevue,NICU,NICU,56,56,2183-05-09 09:29:55,2183-05-16 19:18:28,7.4087
2,21269,109697,241691,carevue,MICU,MICU,12,12,2162-10-03 17:03:08,2162-12-01 20:11:22,59.1307
3,5692,151113,279055,carevue,CSRU,CSRU,12,15,2143-10-31 14:16:32,2143-11-01 19:20:00,1.2107
4,5692,193147,241135,metavision,CCU,CCU,7,7,2147-12-16 00:00:03,2147-12-17 16:45:54,1.6985
...,...,...,...,...,...,...,...,...,...,...,...
133,58667,197356,249803,metavision,MICU,MICU,50,50,2130-08-19 20:03:38,2130-08-20 07:17:43,0.4681
134,3391,136471,265244,carevue,NICU,NICU,56,56,2143-03-01 19:51:18,2143-03-01 23:53:41,0.1683
135,31650,137933,282691,carevue,CCU,CCU,7,7,2177-06-07 19:44:18,2177-06-10 15:45:28,2.8341
136,15895,135286,236445,carevue,CSRU,CSRU,14,14,2116-07-06 10:36:00,2116-07-07 21:28:00,1.4528


In [21]:
# Drop useless colums
icustays_df.drop(['los'], axis=1, inplace=True)

In [22]:
commonfunc.nan_count(icustays_df)

Total columns: 10
Total rows: 138
--------------
subject_id        0
hadm_id           0
icustay_id        0
dbsource          0
first_careunit    0
last_careunit     0
first_wardid      0
last_wardid       0
intime            0
outtime           0
dtype: int64


In [23]:
icustays_df.dtypes

subject_id         int64
hadm_id            int64
icustay_id         int64
dbsource          object
first_careunit    object
last_careunit     object
first_wardid       int64
last_wardid        int64
intime            object
outtime           object
dtype: object

In [24]:
# Transfer some date type
icustays_df['intime'] = pd.to_datetime(icustays_df['intime'])
icustays_df['outtime'] = pd.to_datetime(icustays_df['outtime'])

In [25]:
icustays_df.dtypes

subject_id                 int64
hadm_id                    int64
icustay_id                 int64
dbsource                  object
first_careunit            object
last_careunit             object
first_wardid               int64
last_wardid                int64
intime            datetime64[ns]
outtime           datetime64[ns]
dtype: object

In [26]:
admissions_df = commonfunc.read_csv_no_rowid(data_set_path+"sampled_data_csv_100/admissions.csv")

In [27]:
admittime_df = admissions_df.loc[:, ['hadm_id', 'admittime']]
admittime_df['admittime'] = pd.to_datetime(admittime_df['admittime'])

In [28]:
icustays_mid_df = pd.merge(left=icustays_df, right=admittime_df, how='left', on=['hadm_id'])

In [29]:
# icustays_mid_df['outtime'] = icustays_mid_df.apply(commonfunc.time_process, args=('intime', 'outtime'), axis=1)
# icustays_mid_df['intime'] = icustays_mid_df.apply(commonfunc.time_process, args=('admittime', 'intime'), axis=1)

In [30]:
icustays_df = icustays_mid_df.drop(['subject_id', 'hadm_id', 'admittime'], axis=1)

In [31]:
icustays_df.columns

Index(['icustay_id', 'dbsource', 'first_careunit', 'last_careunit',
       'first_wardid', 'last_wardid', 'intime', 'outtime'],
      dtype='object')

## Metadata

In [32]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

In [33]:
metadata.detect_from_dataframe(data=icustays_df)

In [34]:
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "icustay_id": {
            "sdtype": "numerical"
        },
        "dbsource": {
            "sdtype": "categorical"
        },
        "first_careunit": {
            "sdtype": "categorical"
        },
        "last_careunit": {
            "sdtype": "categorical"
        },
        "first_wardid": {
            "sdtype": "numerical"
        },
        "last_wardid": {
            "sdtype": "numerical"
        },
        "intime": {
            "sdtype": "datetime"
        },
        "outtime": {
            "sdtype": "datetime"
        }
    }
}

In [37]:
metadata.update_column(
    column_name='icustay_id',
    sdtype='id',
    regex_format='ICUID_[0-9]{4,12}'
)

metadata.update_column(
    column_name='first_wardid',
    sdtype='categorical'
)

metadata.update_column(
    column_name='last_wardid',
    sdtype='categorical'
)

In [38]:
metadata.set_primary_key(
    column_name='icustay_id'
)

In [39]:
metadata.save_to_json(filepath=data_set_path+'metadata/icustays.json')

In [41]:
metadata.set_primary_key(
    column_name='icustay_id'
)

In [42]:
data=icustays_df

In [43]:
from sdv.lite import SingleTablePreset

synthesizer1 = SingleTablePreset(metadata, name='FAST_ML')
synthesizer1.fit(data)

synthetic_data1 = synthesizer1.sample(num_rows=100)

In [47]:
synthesizer1.save(data_set_path+'models/icustays/icustays_singletablepreset.pkl')


In [45]:
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer2 = GaussianCopulaSynthesizer(metadata)
synthesizer2.fit(data)

synthetic_data2 = synthesizer2.sample(num_rows=100)

In [48]:
synthesizer2.save(data_set_path+'models/icustays/icustays_guassiancopula.pkl')


In [49]:
from sdv.single_table import CTGANSynthesizer

synthesizer3 = CTGANSynthesizer(metadata)
synthesizer3.fit(data)

synthetic_data3 = synthesizer3.sample(num_rows=100)

In [50]:
synthesizer3.save(data_set_path+'models/icustays/icustays_ctgan.pkl')


In [51]:
from sdv.single_table import TVAESynthesizer

synthesizer4 = TVAESynthesizer(metadata)
synthesizer4.fit(data)

synthetic_data4 = synthesizer4.sample(num_rows=100)

In [52]:
synthesizer4.save(data_set_path+'models/icustays/icustays_tvae.pkl')


In [53]:
from sdv.single_table import CopulaGANSynthesizer

synthesizer5 = CopulaGANSynthesizer(metadata)
synthesizer5.fit(data)

synthetic_data5 = synthesizer5.sample(num_rows=100)

In [54]:
synthesizer5.save(data_set_path+'models/icustays/icustays_copulangan.pkl')


In [55]:
synthetic_data5

Unnamed: 0,icustay_id,dbsource,first_careunit,last_careunit,first_wardid,last_wardid,intime,outtime
0,ICUID_0000,metavision,NICU,MICU,23,23,2159-09-08 10:49:36,2187-09-23 06:37:25
1,ICUID_0001,carevue,NICU,SICU,15,38,2137-12-24 06:49:27,2148-02-27 04:23:45
2,ICUID_0002,carevue,MICU,NICU,33,57,2112-09-16 00:34:13,2117-08-20 22:42:54
3,ICUID_0003,carevue,CSRU,NICU,50,57,2158-03-01 17:47:00,2196-05-05 22:12:09
4,ICUID_0004,metavision,MICU,CCU,15,50,2111-01-08 06:49:09,2135-01-29 18:44:15
...,...,...,...,...,...,...,...,...
95,ICUID_0095,metavision,CCU,TSICU,52,15,2200-04-20 23:53:22,2188-06-25 00:00:26
96,ICUID_0096,carevue,NICU,SICU,52,33,2146-01-21 03:31:38,2175-09-07 14:36:17
97,ICUID_0097,carevue,MICU,TSICU,57,50,2163-11-06 21:01:18,2110-12-25 20:40:46
98,ICUID_0098,carevue,SICU,MICU,14,56,2175-08-24 16:37:24,2200-03-29 11:19:45
