In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [2]:
import pandas as pd
import numpy as np

import commonfunc

In [50]:
# Record the pandas version the notebook was executed with.
print(pd.__version__)

1.5.3


In [51]:
# Record the SDV version the notebook was executed with; the synthesizer and
# metadata APIs used below come from this package.
import sdv
print(sdv.__version__)

1.2.1


In [3]:
import os

# Root folder of the project data. The default is the original author's
# location; set the SYNTHETIC_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
data_set_path = os.environ.get(
    'SYNTHETIC_DATA_DIR',
    'C:/Users/shrus/Documents/Synthetic-data-generation/',
)
# Later cells build file names by plain string concatenation, so the root must
# end with a path separator.
if not data_set_path.endswith(('/', '\\')):
    data_set_path += '/'

In [5]:
# Load the sampled outputevents table through the project helper.
outputevents_df = commonfunc.read_csv_no_rowid(
    f"{data_set_path}sampled_data_csv_100/outputevents.csv"
)

In [7]:
# Work on a random subset of at most 1400 rows. A fixed random_state makes the
# subset (and every number reported further down) reproducible across kernel
# restarts; capping n at the table length avoids the ValueError that sample()
# raises when asked for more rows than exist.
outputevents_df = outputevents_df.sample(
    n=min(1400, len(outputevents_df)),
    random_state=42,
)

## Data preprocessing

In [9]:
# Discard the columns that are not used in the remaining steps.
outputevents_df = outputevents_df.drop(
    columns=['valueuom', 'storetime', 'cgid', 'stopped', 'newbottle', 'iserror']
)

In [10]:
# Report the table size and the number of missing values per column
# (project helper; its printed summary is shown below).
commonfunc.nan_count(outputevents_df)

Total columns: 6
Total rows: 1400
--------------
subject_id     0
hadm_id        0
icustay_id     2
charttime      0
itemid         0
value         24
dtype: int64


In [11]:
# Keep only the rows that have a recorded `value`.
outputevents_df = outputevents_df.dropna(subset=['value'])

In [12]:
# Re-check missing values after removing the rows without a `value`.
commonfunc.nan_count(outputevents_df)

Total columns: 6
Total rows: 1376
--------------
subject_id    0
hadm_id       0
icustay_id    2
charttime     0
itemid        0
value         0
dtype: int64


In [13]:
# Inspect the column dtypes; `charttime` is still a plain object column here.
outputevents_df.dtypes

subject_id      int64
hadm_id         int64
icustay_id    float64
charttime      object
itemid          int64
value         float64
dtype: object

In [14]:
# Parse the chart timestamps so `charttime` becomes a datetime column.
outputevents_df = outputevents_df.assign(
    charttime=pd.to_datetime(outputevents_df['charttime'])
)

In [16]:
# Load the sampled icustays table through the project helper.
icustays_df = commonfunc.read_csv_no_rowid(
    f"{data_set_path}sampled_data_csv_100/icustays.csv"
)

In [17]:
# Per-stay lookup table: the stay identifier plus its admission time parsed as
# a datetime.
icu_intime_df = icustays_df[['icustay_id', 'intime']].assign(
    intime=lambda stays: pd.to_datetime(stays['intime'])
)

In [18]:
# Attach each event's ICU admission time; a left join keeps every event row,
# including those whose icustay_id is missing.
outputevents_mid_df = outputevents_df.merge(
    icu_intime_df,
    how='left',
    on='icustay_id',
)

In [19]:
# Confirm the join added `intime` and that both time columns are datetimes.
outputevents_mid_df.dtypes

subject_id             int64
hadm_id                int64
icustay_id           float64
charttime     datetime64[ns]
itemid                 int64
value                float64
intime        datetime64[ns]
dtype: object

In [20]:
# Keep only the columns that will be modelled by removing the identifiers and
# the admission time.
# NOTE(review): `intime` was joined in just before this step and is removed
# here without being used — confirm whether a charttime-relative-to-intime
# feature was intended.
outputevents_df = outputevents_mid_df.drop(
    columns=['subject_id', 'hadm_id', 'icustay_id', 'intime']
)

In [21]:
# Verify which columns remain for modelling.
outputevents_df.columns

Index(['charttime', 'itemid', 'value'], dtype='object')

## Metadata

In [22]:
from sdv.metadata import SingleTableMetadata

# Start from an empty single-table metadata description; its columns are
# filled in by the detection step that follows.
metadata = SingleTableMetadata()

In [23]:
# Infer an sdtype for every column from the preprocessed DataFrame.
# NOTE(review): this presumably fails if `metadata` already holds columns, so
# re-create the metadata object before re-running this cell — confirm against
# the SDV docs.
metadata.detect_from_dataframe(data=outputevents_df)

In [24]:
# Display the detected metadata; `itemid` is reported as numerical here.
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "charttime": {
            "sdtype": "datetime"
        },
        "itemid": {
            "sdtype": "numerical"
        },
        "value": {
            "sdtype": "numerical"
        }
    }
}

In [28]:
# Auto-detection labelled `itemid` as numerical; override it so it is modelled
# as a categorical column instead.
metadata.update_column(column_name='itemid', sdtype='categorical')


## Model building

In [33]:
# `data` is the real table every synthesizer below is fitted on and scored
# against (same object as outputevents_df, not a copy).
data = outputevents_df

In [34]:
from sdv.lite import SingleTablePreset

# Model 1: FAST_ML preset. Fit on the real rows, then draw a synthetic table
# with the same number of rows as the real one.
synthesizer1 = SingleTablePreset(metadata=metadata, name='FAST_ML')
synthesizer1.fit(data)
synthetic_data1 = synthesizer1.sample(num_rows=data.shape[0])

In [37]:
from sdv.single_table import GaussianCopulaSynthesizer

# Model 2: Gaussian copula with default settings. Fit on the real rows, then
# draw a synthetic table with the same number of rows as the real one.
synthesizer2 = GaussianCopulaSynthesizer(metadata=metadata)
synthesizer2.fit(data)
synthetic_data2 = synthesizer2.sample(num_rows=data.shape[0])

In [39]:
from sdv.single_table import CTGANSynthesizer

# Model 3: CTGAN with default settings. Fit on the real rows, then draw a
# synthetic table with the same number of rows as the real one.
synthesizer3 = CTGANSynthesizer(metadata=metadata)
synthesizer3.fit(data)
synthetic_data3 = synthesizer3.sample(num_rows=data.shape[0])

In [40]:
from sdv.single_table import TVAESynthesizer

# Model 4: TVAE with default settings. Fit on the real rows, then draw a
# synthetic table with the same number of rows as the real one.
synthesizer4 = TVAESynthesizer(metadata=metadata)
synthesizer4.fit(data)
synthetic_data4 = synthesizer4.sample(num_rows=data.shape[0])

In [41]:
from sdv.single_table import CopulaGANSynthesizer

# Model 5: CopulaGAN with default settings. Fit on the real rows, then draw a
# synthetic table with the same number of rows as the real one.
synthesizer5 = CopulaGANSynthesizer(metadata=metadata)
synthesizer5.fit(data)
synthetic_data5 = synthesizer5.sample(num_rows=data.shape[0])

## Evaluation of models

In [42]:
from sdv.evaluation.single_table import evaluate_quality

# Score the FAST_ML preset's sample against the real table.
# `quality_report` is reassigned by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data1, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 132.31it/s]


Overall Quality Score: 80.14%

Properties:
Column Shapes: 81.18%
Column Pair Trends: 79.1%





In [43]:
from sdv.evaluation.single_table import evaluate_quality

# Score the Gaussian copula sample against the real table.
# `quality_report` is reassigned by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data2, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 204.44it/s]


Overall Quality Score: 68.07%

Properties:
Column Shapes: 75.05%
Column Pair Trends: 61.1%





In [44]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CTGAN sample against the real table.
# `quality_report` is reassigned by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data3, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 162.37it/s]


Overall Quality Score: 59.62%

Properties:
Column Shapes: 66.06%
Column Pair Trends: 53.18%





In [45]:
from sdv.evaluation.single_table import evaluate_quality

# Score the TVAE sample against the real table.
# `quality_report` is reassigned by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data4, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 170.39it/s]


Overall Quality Score: 81.16%

Properties:
Column Shapes: 83.26%
Column Pair Trends: 79.05%





In [46]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CopulaGAN sample against the real table.
# `quality_report` is reassigned by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data5, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 190.96it/s]


Overall Quality Score: 53.83%

Properties:
Column Shapes: 59.13%
Column Pair Trends: 48.53%





In [47]:
import os

# Persist the TVAE synthesizer, which obtained the highest overall quality
# score of the five models evaluated in this notebook. The target folder is
# derived from data_set_path instead of repeating the absolute root, and it is
# created first so the save does not fail on a fresh checkout.
model_dir = data_set_path + 'models/outputevents/'
os.makedirs(model_dir, exist_ok=True)
synthesizer4.save(model_dir + 'outputevents_best_tvae.pkl')


In [48]:
# Preview the synthetic table sampled from the saved TVAE model.
synthetic_data4

Unnamed: 0,charttime,itemid,value
0,2173-04-30 02:32:29,40055,65.8
1,2117-01-07 15:24:14,40055,390.6
2,2178-07-01 20:37:36,40055,59.2
3,2164-03-31 13:47:50,40055,27.6
4,2112-03-05 16:18:08,40055,229.4
...,...,...,...
1371,2112-10-22 00:58:23,226559,92.7
1372,2177-01-29 18:06:14,40055,75.6
1373,2111-01-04 15:57:43,226559,75.5
1374,2103-08-03 21:47:40,226559,0.0
