In [19]:
# Silence every Python warning for the remainder of the session.
# NOTE(review): this also hides warnings raised by pandas and SDV in the cells
# below (e.g. deprecations); consider narrowing the filter once the notebook is stable.
import warnings
warnings.filterwarnings("ignore")

In [20]:
import os

import numpy as np
import pandas as pd

import commonfunc

In [21]:
# Root folder that holds the sampled CSVs, the saved metadata and the saved models.
# It can be overridden per machine through the SYNTH_DATA_DIR environment
# variable; the original hard-coded location stays as the fallback so existing
# runs behave exactly as before.
data_set_path = os.environ.get(
    'SYNTH_DATA_DIR',
    'C:/Users/shrus/Documents/Synthetic-data-generation/',
)
# Later cells build file paths by plain string concatenation, so the root must
# always end with a path separator.
if not data_set_path.endswith(('/', '\\')):
    data_set_path += '/'

In [22]:
# Load the sampled callout table through the project helper
# (presumably it reads the CSV without its row-id column — confirm in commonfunc).
callout_df = commonfunc.read_csv_no_rowid(
    f"{data_set_path}sampled_data_csv_100/callout.csv"
)

In [23]:
# Remove the three columns that are excluded from the rest of the pipeline.
# The frame is rebound instead of mutated in place, and errors='ignore' is
# used so that re-running this cell (when the columns are already gone) is a
# no-op instead of raising KeyError.
callout_df = callout_df.drop(
    columns=['submit_careunit', 'firstreservationtime', 'currentreservationtime'],
    errors='ignore',
)

In [24]:
# Summarise missing values: the helper reports the column/row totals and the
# number of NaN entries per column (see the output below).
commonfunc.nan_count(callout_df)

Total columns: 20
Total rows: 99
--------------
subject_id             0
hadm_id                0
submit_wardid          0
curr_wardid            0
curr_careunit          0
callout_wardid         0
callout_service        0
request_tele           0
request_resp           0
request_cdiff          0
request_mrsa           0
request_vre            0
callout_status         0
callout_outcome        0
discharge_wardid      15
acknowledge_status     0
createtime             0
updatetime             0
acknowledgetime        5
outcometime            0
dtype: int64


In [25]:
# Inspect the column dtypes; the four *time columns are still `object` here
# and are parsed into datetimes in the next step.
callout_df.dtypes

subject_id              int64
hadm_id                 int64
submit_wardid           int64
curr_wardid             int64
curr_careunit          object
callout_wardid          int64
callout_service        object
request_tele            int64
request_resp            int64
request_cdiff           int64
request_mrsa            int64
request_vre             int64
callout_status         object
callout_outcome        object
discharge_wardid      float64
acknowledge_status     object
createtime             object
updatetime             object
acknowledgetime        object
outcometime            object
dtype: object

In [26]:
# Convert the four timestamp columns from object dtype to datetime values.
for time_col in ('createtime', 'updatetime', 'acknowledgetime', 'outcometime'):
    callout_df[time_col] = pd.to_datetime(callout_df[time_col])

In [27]:
# Load the sampled admissions table through the same project helper used for callout.
admissions_df = commonfunc.read_csv_no_rowid(
    f"{data_set_path}sampled_data_csv_100/admissions.csv"
)

In [28]:
# Keep only the admission id and its admit time, with the time parsed to datetime.
admittime_df = (
    admissions_df[['hadm_id', 'admittime']]
    .assign(admittime=lambda frame: pd.to_datetime(frame['admittime']))
)

In [29]:
# Left-join every callout row with the admit time of its admission (key: hadm_id).
callout_mid_df = callout_df.merge(admittime_df, how='left', on=['hadm_id'])

In [30]:
# NOTE(review): the time_process preprocessing below is disabled. With it
# commented out, the `admittime` column merged in the previous step is never
# used before it is dropped in the next cell — confirm whether this step should
# be re-enabled or the merge removed.
# callout_mid_df['outcometime'] = callout_mid_df.apply(commonfunc.time_process, args=('acknowledgetime', 'outcometime', 'updatetime'), axis=1)
# callout_mid_df['acknowledgetime'] = callout_mid_df.apply(commonfunc.time_process, args=('updatetime', 'acknowledgetime'), axis=1)
# callout_mid_df['updatetime'] = callout_mid_df.apply(commonfunc.time_process, args=('createtime', 'updatetime'), axis=1)
# callout_mid_df['createtime'] = callout_mid_df.apply(commonfunc.time_process, args=('admittime', 'createtime'), axis=1)

In [31]:
# Strip the admission/patient identifiers and the merged-in admit time so that
# only the callout attributes are passed on to metadata detection and synthesis.
callout_df = callout_mid_df.drop(columns=['hadm_id', 'admittime', 'subject_id'])

## Metadata

In [32]:
from sdv.metadata import SingleTableMetadata

# Start from an empty single-table metadata object; its columns are filled in
# by detection from the dataframe in the following cell.
metadata = SingleTableMetadata()

In [33]:
# Infer an sdtype for every column of the prepared callout table; several of
# the inferred types are corrected by hand further down.
metadata.detect_from_dataframe(data=callout_df)

In [34]:
# Display the auto-detected metadata for review.
metadata

{
    "columns": {
        "submit_wardid": {
            "sdtype": "numerical"
        },
        "curr_wardid": {
            "sdtype": "numerical"
        },
        "curr_careunit": {
            "sdtype": "categorical"
        },
        "callout_wardid": {
            "sdtype": "numerical"
        },
        "callout_service": {
            "sdtype": "categorical"
        },
        "request_tele": {
            "sdtype": "numerical"
        },
        "request_resp": {
            "sdtype": "numerical"
        },
        "request_cdiff": {
            "sdtype": "numerical"
        },
        "request_mrsa": {
            "sdtype": "numerical"
        },
        "request_vre": {
            "sdtype": "numerical"
        },
        "callout_status": {
            "sdtype": "categorical"
        },
        "callout_outcome": {
            "sdtype": "categorical"
        },
        "discharge_wardid": {
            "sdtype": "numerical"
        },
        "acknowledge_status": {
   

In [37]:


# These columns were auto-detected as numerical (see the metadata output
# above) but are re-declared as categorical before fitting the synthesizers.
for categorical_column in (
    'submit_wardid',
    'callout_wardid',
    'curr_wardid',
    'request_tele',
    'request_resp',
    'request_cdiff',
    'request_mrsa',
    'request_vre',
    'discharge_wardid',
):
    metadata.update_column(column_name=categorical_column, sdtype='categorical')

In [None]:
# Persist the corrected metadata next to the data set.
# NOTE(review): this cell has no execution count in the saved notebook —
# confirm it was actually run and that the JSON file is up to date.
metadata_json_path = f"{data_set_path}metadata/callout/callout.json"
metadata.save_to_json(filepath=metadata_json_path)

In [39]:
# Alias the prepared callout table under the generic name used by the
# synthesizer and evaluation cells below.
data=callout_df

In [40]:
from sdv.lite import SingleTablePreset

# Synthesizer 1: the FAST_ML preset. Fit on the prepared table, then draw 100 rows.
synthesizer1 = SingleTablePreset(metadata=metadata, name='FAST_ML')
synthesizer1.fit(data=data)
synthetic_data1 = synthesizer1.sample(100)

In [41]:
# Store the fitted FAST_ML preset model on disk.
synthesizer1.save(
    filepath=f"{data_set_path}models/callout/callout_singletablepreset.pkl"
)


In [42]:
from sdv.single_table import GaussianCopulaSynthesizer

# Synthesizer 2: Gaussian copula. Fit on the prepared table, then draw 100 rows.
synthesizer2 = GaussianCopulaSynthesizer(metadata=metadata)
synthesizer2.fit(data=data)
synthetic_data2 = synthesizer2.sample(100)

In [43]:
# Store the fitted Gaussian copula model on disk.
synthesizer2.save(
    filepath=f"{data_set_path}models/callout/callout_gaussiancopula.pkl"
)


In [44]:
from sdv.single_table import CTGANSynthesizer

# Synthesizer 3: CTGAN. Fit on the prepared table, then draw 100 rows.
synthesizer3 = CTGANSynthesizer(metadata=metadata)
synthesizer3.fit(data=data)
synthetic_data3 = synthesizer3.sample(100)

In [45]:
# Store the fitted CTGAN model on disk.
synthesizer3.save(
    filepath=f"{data_set_path}models/callout/callout_ctgan.pkl"
)


In [46]:
from sdv.single_table import TVAESynthesizer

# Synthesizer 4: TVAE. Fit on the prepared table, then draw 100 rows.
synthesizer4 = TVAESynthesizer(metadata=metadata)
synthesizer4.fit(data=data)
synthetic_data4 = synthesizer4.sample(100)

In [47]:
# Store the fitted TVAE model on disk.
synthesizer4.save(
    filepath=f"{data_set_path}models/callout/callout_tvae.pkl"
)


In [48]:
from sdv.single_table import CopulaGANSynthesizer

# Synthesizer 5: CopulaGAN. Fit on the prepared table, then draw 100 rows.
synthesizer5 = CopulaGANSynthesizer(metadata=metadata)
synthesizer5.fit(data=data)
synthetic_data5 = synthesizer5.sample(100)

In [49]:
# Store the fitted CopulaGAN model on disk.
# NOTE(review): the file name reads 'copulangan' rather than 'copulagan'; it is
# kept unchanged because other code may load this exact path — confirm before renaming.
synthesizer5.save(
    filepath=f"{data_set_path}models/callout/callout_copulangan.pkl"
)


In [51]:
from sdv.evaluation.single_table import evaluate_quality

# Quality of the SingleTablePreset (FAST_ML) sample versus the real table.
# Note: `quality_report` is reassigned by each of the evaluation cells.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data1, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:03<00:00,  1.31it/s]



Overall Quality Score: 80.88%

Properties:
Column Shapes: 88.7%
Column Pair Trends: 73.06%


In [52]:
from sdv.evaluation.single_table import evaluate_quality

# Quality of the Gaussian copula sample versus the real table.
# Note: `quality_report` is reassigned by each of the evaluation cells.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data2, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]



Overall Quality Score: 80.29%

Properties:
Column Shapes: 87.88%
Column Pair Trends: 72.71%


In [53]:
from sdv.evaluation.single_table import evaluate_quality

# Quality of the CTGAN sample versus the real table.
# Note: `quality_report` is reassigned by each of the evaluation cells.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data3, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:03<00:00,  1.17it/s]



Overall Quality Score: 77.46%

Properties:
Column Shapes: 87.64%
Column Pair Trends: 67.28%


In [54]:
from sdv.evaluation.single_table import evaluate_quality

# Quality of the TVAE sample versus the real table.
# Note: `quality_report` is reassigned by each of the evaluation cells.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data4, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:04<00:00,  1.03s/it]



Overall Quality Score: 74.51%

Properties:
Column Shapes: 82.43%
Column Pair Trends: 66.58%


In [55]:
from sdv.evaluation.single_table import evaluate_quality

# Quality of the CopulaGAN sample versus the real table.
# Note: `quality_report` is reassigned by each of the evaluation cells.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data5, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s]



Overall Quality Score: 77.12%

Properties:
Column Shapes: 85.9%
Column Pair Trends: 68.35%
