In [47]:
# Ignore warnings: silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")


In [48]:
import pandas as pd
import numpy as np

import commonfunc

In [49]:
# Project root folder; the input CSV files below are located relative to it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_set_path = 'C:/Users/shrus/Documents/Synthetic-data-generation/'

In [50]:
# Load the lab-event records through the project's CSV helper
# (judging by its name it discards the row-id column — TODO confirm in commonfunc).
labevents_csv = data_set_path + "sampled_data_csv_100/labevents.csv"
labevents_df = commonfunc.read_csv_no_rowid(labevents_csv)

In [51]:
# Work on a random subset of at most 2000 lab events.
# The fixed seed makes the subset, and every score derived from it, reproducible;
# capping n avoids a ValueError when the file holds fewer than 2000 rows.
labevents_df = labevents_df.sample(n=min(2000, len(labevents_df)), random_state=42)

In [52]:
# Discard the columns that are not used in the rest of this notebook.
labevents_df = labevents_df.drop(columns=['subject_id', 'valuenum', 'flag'])

In [53]:
# Keep only the rows that carry an admission id, then store that id as int64
# (the integer cast is only possible once the NaN rows are gone).
labevents_df = (
    labevents_df
    .dropna(subset=['hadm_id'], axis=0)
    .astype({'hadm_id': np.int64})
)

In [54]:
# Report the frame's size and the number of missing values per column.
commonfunc.nan_count(labevents_df)

Total columns: 5
Total rows: 1445
--------------
hadm_id        0
itemid         0
charttime      0
value          0
valueuom     200
dtype: int64


In [55]:
# Replace missing entries in the free-text columns with the 'Na' placeholder.
# `valueuom` is included because the NaN report shows that is where the gaps are.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not reliably modify the frame (it is a no-op under
# pandas copy-on-write).
text_columns = ['value', 'valueuom']
labevents_df[text_columns] = labevents_df[text_columns].fillna('Na')

In [56]:
# Inspect the column dtypes before converting the timestamp column.
labevents_df.dtypes

hadm_id       int64
itemid        int64
charttime    object
value        object
valueuom     object
dtype: object

In [57]:
# Parse the chart timestamps from strings into datetime64 values.
chart_timestamps = pd.to_datetime(labevents_df['charttime'])
labevents_df['charttime'] = chart_timestamps

In [58]:
# Confirm that charttime is now a datetime64[ns] column.
labevents_df.dtypes

hadm_id               int64
itemid                int64
charttime    datetime64[ns]
value                object
valueuom             object
dtype: object

In [59]:
# Load the admissions table with the same project CSV helper used for the lab events.
admissions_csv = data_set_path + "sampled_data_csv_100/admissions.csv"
admissions_df = commonfunc.read_csv_no_rowid(admissions_csv)

In [60]:
# Admission id and admission time only, with admittime parsed to datetime64.
hadm_intime_df = admissions_df[['hadm_id', 'admittime']].assign(
    admittime=lambda frame: pd.to_datetime(frame['admittime'])
)

In [61]:
# Attach the admission time to each lab event; the left join keeps every lab event.
labevents_mid_df = labevents_df.merge(hadm_intime_df, how='left', on='hadm_id')

In [62]:
# Check the merged frame: admittime should be present as datetime64[ns].
labevents_mid_df.dtypes

hadm_id               int64
itemid                int64
charttime    datetime64[ns]
value                object
valueuom             object
admittime    datetime64[ns]
dtype: object

In [63]:
# labevents_mid_df['charttime'] = labevents_mid_df.apply(commonfunc.time_process, args=('admittime', 'charttime'), axis=1)
# labevents_mid_df['charttime'] = labevents_mid_df['charttime'].astype(np.int64)

In [64]:
# Remove the join key and the admission time, leaving only the lab-event columns.
labevents_df = labevents_mid_df.drop(columns=['hadm_id', 'admittime'])

## Metadata

In [65]:
from sdv.metadata import SingleTableMetadata

# Start from an empty single-table metadata object; its columns are filled in by detection below.
metadata = SingleTableMetadata()

In [66]:
# Infer an sdtype for every column of the prepared lab-event frame.
metadata.detect_from_dataframe(data=labevents_df)

In [67]:
# Display the detected metadata for review.
metadata

{
    "columns": {
        "itemid": {
            "sdtype": "numerical"
        },
        "charttime": {
            "sdtype": "datetime"
        },
        "value": {
            "sdtype": "categorical"
        },
        "valueuom": {
            "sdtype": "categorical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

In [71]:
# itemid was auto-detected as 'numerical'; model it as a category instead
# (presumably it is a lab-test code rather than a quantity — TODO confirm).
metadata.update_column(column_name='itemid', sdtype='categorical')

In [72]:
# Constraint spec shared by every synthesizer below: it names the
# FixedCombinations constraint over the itemid / value / valueuom columns.
fixed_constraint = dict(
    constraint_class='FixedCombinations',
    constraint_parameters=dict(
        column_names=['itemid', 'value', 'valueuom'],
    ),
)

In [73]:
# Alias for the prepared frame; used as the training table and as the real-data reference below.
data=labevents_df

In [82]:
from sdv.lite import SingleTablePreset

# Model 1: the FAST_ML preset. Register the shared constraint, fit on the
# prepared table, then draw 2000 synthetic rows.
synthesizer1 = SingleTablePreset(metadata, name='FAST_ML')
synthesizer1.add_constraints(constraints=[fixed_constraint])
synthesizer1.fit(data)

synthetic_data1 = synthesizer1.sample(num_rows=2000)

Sampling rows: 100%|██████████| 2000/2000 [00:00<00:00, 36405.89it/s]


In [84]:
from sdv.single_table import GaussianCopulaSynthesizer

# Model 2: Gaussian copula. Register the shared constraint, fit on the
# prepared table, then draw 2000 synthetic rows.
synthesizer2 = GaussianCopulaSynthesizer(metadata)
synthesizer2.add_constraints(constraints=[fixed_constraint])
synthesizer2.fit(data)

synthetic_data2 = synthesizer2.sample(num_rows=2000)

Sampling rows: 100%|██████████| 2000/2000 [00:00<00:00, 42507.97it/s]


In [87]:
from sdv.single_table import CTGANSynthesizer

# Model 3: CTGAN. Register the shared constraint, fit on the
# prepared table, then draw 2000 synthetic rows.
synthesizer3 = CTGANSynthesizer(metadata)
synthesizer3.add_constraints(constraints=[fixed_constraint])
synthesizer3.fit(data)

synthetic_data3 = synthesizer3.sample(num_rows=2000)

Sampling rows: 100%|██████████| 2000/2000 [00:10<00:00, 193.65it/s]


In [86]:
from sdv.single_table import TVAESynthesizer

# Model 4: TVAE. Register the shared constraint, fit on the
# prepared table, then draw 2000 synthetic rows.
synthesizer4 = TVAESynthesizer(metadata)
synthesizer4.add_constraints(constraints=[fixed_constraint])
synthesizer4.fit(data)

synthetic_data4 = synthesizer4.sample(num_rows=2000)

Sampling rows: 100%|██████████| 2000/2000 [00:00<00:00, 3697.03it/s]


In [89]:
from sdv.single_table import CopulaGANSynthesizer

# Model 5: CopulaGAN. Register the shared constraint, fit on the
# prepared table, then draw 2000 synthetic rows.
synthesizer5 = CopulaGANSynthesizer(metadata)
synthesizer5.add_constraints(constraints=[fixed_constraint])
synthesizer5.fit(data)

synthetic_data5 = synthesizer5.sample(num_rows=2000)

Sampling rows: 100%|██████████| 2000/2000 [00:03<00:00, 509.14it/s]


In [83]:
from sdv.evaluation.single_table import evaluate_quality

# Score the SingleTablePreset sample (synthetic_data1) against the real table.
# NOTE(review): quality_report is overwritten by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data1, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  7.98it/s]



Overall Quality Score: 71.69%

Properties:
Column Shapes: 84.67%
Column Pair Trends: 58.71%


In [85]:
from sdv.evaluation.single_table import evaluate_quality

# Score the GaussianCopula sample (synthetic_data2) against the real table.
# NOTE(review): quality_report is overwritten by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data2, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 14.39it/s]



Overall Quality Score: 71.59%

Properties:
Column Shapes: 83.64%
Column Pair Trends: 59.54%


In [88]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CTGAN sample (synthetic_data3) against the real table.
# NOTE(review): quality_report is overwritten by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data3, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  3.74it/s]



Overall Quality Score: 53.88%

Properties:
Column Shapes: 65.11%
Column Pair Trends: 42.66%


In [81]:
from sdv.evaluation.single_table import evaluate_quality

# Score the TVAE sample (synthetic_data4) against the real table.
# NOTE(review): quality_report is overwritten by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data4, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  4.25it/s]



Overall Quality Score: 34.46%

Properties:
Column Shapes: 47.52%
Column Pair Trends: 21.39%


In [90]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CopulaGAN sample (synthetic_data5) against the real table.
# NOTE(review): quality_report is overwritten by each evaluation cell.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data5, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  6.36it/s]



Overall Quality Score: 41.17%

Properties:
Column Shapes: 53.31%
Column Pair Trends: 29.02%


In [91]:
# Persist the fitted SingleTablePreset model under <data_set_path>models/labevents/.
# The location is built from data_set_path so it follows the configured project
# root instead of repeating the absolute path.
synthesizer1.save(data_set_path + 'models/labevents/' + 'labevents_best_singletablepreset.pkl')


In [92]:
# Persist the fitted GaussianCopula model under <data_set_path>models/labevents/.
# The location is built from data_set_path so it follows the configured project
# root instead of repeating the absolute path.
synthesizer2.save(data_set_path + 'models/labevents/' + 'labevents_gaussiancopula.pkl')


In [93]:
# Persist the fitted CTGAN model under <data_set_path>models/labevents/.
# The location is built from data_set_path so it follows the configured project
# root instead of repeating the absolute path.
synthesizer3.save(data_set_path + 'models/labevents/' + 'labevents_ctgan.pkl')


In [94]:
# Persist the fitted TVAE model under <data_set_path>models/labevents/.
# The location is built from data_set_path so it follows the configured project
# root instead of repeating the absolute path.
synthesizer4.save(data_set_path + 'models/labevents/' + 'labevents_tvae.pkl')


In [95]:
# Persist the fitted CopulaGAN model under <data_set_path>models/labevents/.
# The location is built from data_set_path so it follows the configured project
# root instead of repeating the absolute path.
synthesizer5.save(data_set_path + 'models/labevents/' + 'labevents_copulagan.pkl')
