In [19]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [20]:
import pandas as pd
import numpy as np

import commonfunc

In [21]:
import os

# Root folder of the project data. Set the SDG_DATA_PATH environment variable
# to run the notebook on another machine; when it is unset (or empty) the
# original location is used.
data_set_path = os.environ.get('SDG_DATA_PATH') or 'C:/Users/shrus/Documents/Synthetic-data-generation/'

# Other cells build file names with plain string concatenation
# (data_set_path + "sampled_data_csv_100/..."), so a trailing separator is required.
if not data_set_path.endswith(('/', '\\')):
    data_set_path += '/'

In [22]:
# Load the sampled chartevents table through the project helper.
# NOTE(review): presumably the helper drops the row_id column — confirm in commonfunc.
chartevents_df = commonfunc.read_csv_no_rowid(
    data_set_path + "sampled_data_csv_100/chartevents.csv"
)

In [23]:
# Keep a random subset of at most 2000 rows.
# - random_state pins the draw so a re-run selects the same rows (the value 42
#   is arbitrary).
# - n is capped at the table size because DataFrame.sample raises a ValueError
#   when asked for more rows than exist (without replacement).
chartevents_df = chartevents_df.sample(
    n=min(2000, len(chartevents_df)),
    random_state=42,
)

## Data Preprocessing

In [6]:
# Discard the columns that are not used in the rest of this notebook.
chartevents_df = chartevents_df.drop(
    columns=[
        'subject_id', 'hadm_id', 'cgid', 'valuenum', 'valueuom',
        'warning', 'error', 'resultstatus', 'stopped',
    ]
)

In [7]:
# Remove the rows whose icustay_id is missing.
chartevents_df = chartevents_df.dropna(axis=0, subset=['icustay_id'])

In [8]:
# Report the table size and the number of missing values per column
# (project helper; see the printed summary below).
commonfunc.nan_count(chartevents_df)

Total columns: 5
Total rows: 1998
--------------
icustay_id     0
itemid         0
charttime      0
storetime      0
value         10
dtype: int64


In [9]:
# Replace missing chart values with the placeholder string 'Na'.
# The result is assigned back to the column on purpose: calling
# fillna(inplace=True) on the Series returned by chartevents_df['value'] is
# chained in-place assignment, which does not update the DataFrame when pandas
# Copy-on-Write is active (and only emits a warning before that).
chartevents_df['value'] = chartevents_df['value'].fillna(value='Na')

In [10]:
# Inspect the column dtypes before the timestamp columns are parsed.
chartevents_df.dtypes

icustay_id    float64
itemid          int64
charttime      object
storetime      object
value          object
dtype: object

In [11]:


# Parse both timestamp columns into datetime values.
chartevents_df = chartevents_df.assign(
    charttime=pd.to_datetime(chartevents_df['charttime']),
    storetime=pd.to_datetime(chartevents_df['storetime']),
)

In [12]:
# Confirm that charttime and storetime are now datetime columns.
chartevents_df.dtypes

icustay_id           float64
itemid                 int64
charttime     datetime64[ns]
storetime     datetime64[ns]
value                 object
dtype: object

In [15]:
# Load the sampled icustays table through the project helper.
# NOTE(review): presumably the helper drops the row_id column — confirm in commonfunc.
icustays_df = commonfunc.read_csv_no_rowid(
    data_set_path + "sampled_data_csv_100/icustays.csv"
)

In [16]:
# Keep only the stay identifier and its `intime`, with `intime` parsed as datetime.
icu_intime_df = icustays_df[['icustay_id', 'intime']].assign(
    intime=lambda stays: pd.to_datetime(stays['intime'])
)

In [17]:
# Left join: every chart event is kept and gains the `intime` of the ICU stay
# it matches on icustay_id.
chartevents_mid_df = chartevents_df.merge(
    icu_intime_df,
    on=['icustay_id'],
    how='left',
)

In [18]:
# Check the merged frame's dtypes, including the `intime` column added by the merge.
chartevents_mid_df.dtypes

icustay_id           float64
itemid                 int64
charttime     datetime64[ns]
storetime     datetime64[ns]
value                 object
intime        datetime64[ns]
dtype: object

In [19]:
# NOTE(review): the two commonfunc.time_process calls below are disabled. With
# them commented out, the `intime` column merged in earlier is never used before
# it is dropped in the next cell — confirm whether these lines should be
# restored or the icustays load/merge removed.
# chartevents_mid_df['storetime'] = chartevents_mid_df.apply(commonfunc.time_process, args=('charttime', 'storetime'), axis=1)
# chartevents_mid_df['charttime'] = chartevents_mid_df.apply(commonfunc.time_process, args=('intime', 'charttime'), axis=1)

In [20]:
# Remove the join key and the merged `intime`; the remaining columns form the
# training table.
chartevents_df = chartevents_mid_df.drop(columns=['icustay_id', 'intime'])

In [21]:
# List the columns that remain after dropping icustay_id and intime.
chartevents_df.columns

Index(['itemid', 'charttime', 'storetime', 'value'], dtype='object')

## Metadata

In [22]:
from sdv.metadata import SingleTableMetadata

# Start from an empty single-table metadata object; it is filled in by the
# detection step in the next cell.
metadata = SingleTableMetadata()

In [24]:
# Let SDV infer each column's sdtype from the prepared DataFrame.
metadata.detect_from_dataframe(data=chartevents_df)

In [25]:
# Display the detected metadata so the inferred sdtypes can be reviewed.
metadata

{
    "columns": {
        "itemid": {
            "sdtype": "numerical"
        },
        "charttime": {
            "sdtype": "datetime"
        },
        "storetime": {
            "sdtype": "datetime"
        },
        "value": {
            "sdtype": "categorical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

In [26]:
# Auto-detection labelled itemid as numerical; override it to categorical.
metadata.update_column(column_name='itemid', sdtype='categorical')

## Constraints

In [27]:
# Constraint specification shared by every synthesizer below: a
# FixedCombinations constraint over the (itemid, value) column pair.
fixed_constraint = dict(
    constraint_class='FixedCombinations',
    constraint_parameters=dict(column_names=['itemid', 'value']),
)

## Model

In [28]:
# Alias for the prepared table; the model and evaluation cells use `data` as
# the real training data.
data=chartevents_df

In [30]:
from sdv.lite import SingleTablePreset

# Model 1: SingleTablePreset with the 'FAST_ML' preset.
synthesizer1 = SingleTablePreset(metadata, name='FAST_ML')

# Apply the shared itemid/value constraint, then train on the real table.
synthesizer1.add_constraints(constraints=[fixed_constraint])
synthesizer1.fit(data)

# Generate as many synthetic rows as there are real rows.
synthetic_data1 = synthesizer1.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1998/1998 [00:00<00:00, 28118.99it/s]


In [31]:
from sdv.single_table import GaussianCopulaSynthesizer

# Model 2: GaussianCopulaSynthesizer with default settings.
synthesizer2 = GaussianCopulaSynthesizer(metadata)

# Apply the shared itemid/value constraint, then train on the real table.
synthesizer2.add_constraints(constraints=[fixed_constraint])
synthesizer2.fit(data)

# Generate as many synthetic rows as there are real rows.
synthetic_data2 = synthesizer2.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1998/1998 [00:00<00:00, 23042.39it/s]


In [32]:
from sdv.single_table import CTGANSynthesizer

# Model 3: CTGANSynthesizer with default settings.
synthesizer3 = CTGANSynthesizer(metadata)

# Apply the shared itemid/value constraint, then train on the real table.
synthesizer3.add_constraints(constraints=[fixed_constraint])
synthesizer3.fit(data)

# Generate as many synthetic rows as there are real rows.
synthetic_data3 = synthesizer3.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1998/1998 [00:05<00:00, 345.62it/s]


In [36]:
from sdv.single_table import TVAESynthesizer

# Model 4: TVAESynthesizer with default settings.
synthesizer4 = TVAESynthesizer(metadata)

# Apply the shared itemid/value constraint, then train on the real table.
synthesizer4.add_constraints(constraints=[fixed_constraint])
synthesizer4.fit(data)

# Generate as many synthetic rows as there are real rows.
synthetic_data4 = synthesizer4.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1998/1998 [00:00<00:00, 4244.99it/s]


In [38]:
from sdv.single_table import CopulaGANSynthesizer

# Model 5: CopulaGANSynthesizer with default settings.
synthesizer5 = CopulaGANSynthesizer(metadata)

# Apply the shared itemid/value constraint, then train on the real table.
synthesizer5.add_constraints(constraints=[fixed_constraint])
synthesizer5.fit(data)

# Generate as many synthetic rows as there are real rows.
synthetic_data5 = synthesizer5.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1998/1998 [00:03<00:00, 565.00it/s]


## Evaluation of performance

In [33]:
from sdv.evaluation.single_table import evaluate_quality

# Score the SingleTablePreset output (synthetic_data1) against the real table.
# NOTE: every evaluation cell rebinds `quality_report`, so only the most
# recently run report stays available.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data1, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  5.67it/s]



Overall Quality Score: 64.6%

Properties:
Column Shapes: 81.66%
Column Pair Trends: 47.55%


In [34]:
from sdv.evaluation.single_table import evaluate_quality

# Score the GaussianCopulaSynthesizer output (synthetic_data2) against the real table.
# NOTE: every evaluation cell rebinds `quality_report`, so only the most
# recently run report stays available.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data2, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  7.72it/s]



Overall Quality Score: 63.59%

Properties:
Column Shapes: 81.29%
Column Pair Trends: 45.9%


In [35]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CTGANSynthesizer output (synthetic_data3) against the real table.
# NOTE: every evaluation cell rebinds `quality_report`, so only the most
# recently run report stays available.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data3, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  3.36it/s]



Overall Quality Score: 28.66%

Properties:
Column Shapes: 45.08%
Column Pair Trends: 12.24%


In [37]:
from sdv.evaluation.single_table import evaluate_quality

# Score the TVAESynthesizer output (synthetic_data4) against the real table.
# NOTE: every evaluation cell rebinds `quality_report`, so only the most
# recently run report stays available.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data4, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  3.89it/s]



Overall Quality Score: 53.19%

Properties:
Column Shapes: 66.03%
Column Pair Trends: 40.34%


In [39]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CopulaGANSynthesizer output (synthetic_data5) against the real table.
# NOTE: every evaluation cell rebinds `quality_report`, so only the most
# recently run report stays available.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data5, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  5.92it/s]



Overall Quality Score: 31.92%

Properties:
Column Shapes: 48.19%
Column Pair Trends: 15.65%


In [40]:
# Persist the fitted SingleTablePreset model.
# The target is derived from data_set_path instead of repeating the absolute
# user-specific path, so the project root only has to be changed in one place.
synthesizer1.save(data_set_path + 'models/chartevents/' + 'chartevents_best_singletablepreset.pkl')


In [41]:
# Persist the fitted GaussianCopulaSynthesizer model.
# The target is derived from data_set_path instead of repeating the absolute
# user-specific path, so the project root only has to be changed in one place.
synthesizer2.save(data_set_path + 'models/chartevents/' + 'chartevents_gaussiancopula.pkl')


In [42]:
# Persist the fitted CTGANSynthesizer model.
# The target is derived from data_set_path instead of repeating the absolute
# user-specific path, so the project root only has to be changed in one place.
synthesizer3.save(data_set_path + 'models/chartevents/' + 'chartevents_ctgan.pkl')


In [43]:
# Persist the fitted TVAESynthesizer model.
# The target is derived from data_set_path instead of repeating the absolute
# user-specific path, so the project root only has to be changed in one place.
synthesizer4.save(data_set_path + 'models/chartevents/' + 'chartevents_tvae.pkl')


In [44]:
# Persist the fitted CopulaGANSynthesizer model.
# The target is derived from data_set_path instead of repeating the absolute
# user-specific path, so the project root only has to be changed in one place.
synthesizer5.save(data_set_path + 'models/chartevents/' + 'chartevents_copulagan.pkl')
