In [32]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [33]:
import pandas as pd
import numpy as np

import commonfunc

In [34]:
import os

# Root folder holding the sampled CSVs, the metadata and the saved models.
# The original machine-specific location stays the default; set the
# SDG_DATA_PATH environment variable to run the notebook on another machine.
data_set_path = os.environ.get('SDG_DATA_PATH') or 'C:/Users/shrus/Documents/Synthetic-data-generation/'
# Later cells build file names by plain string concatenation, so the root
# has to end with a path separator.
if not data_set_path.endswith(('/', '\\')):
    data_set_path += '/'

In [35]:
# Load the sampled CPT events table through the project's CSV helper.
cptevents_csv = data_set_path + "sampled_data_csv_100/cptevents.csv"
cptevents_df = commonfunc.read_csv_no_rowid(cptevents_csv)

In [36]:
# Show how many rows fall under each costcenter value.
cptevents_df['costcenter'].value_counts()

ICU     1177
Resp     193
Name: costcenter, dtype: int64

In [37]:
# Keep only the rows in which both identifiers are present.
cptevents_df = cptevents_df.dropna(subset=['subject_id', 'hadm_id'])

In [38]:
# Drop the columns that are not needed.
cptevents_df = cptevents_df.drop(columns=['description', 'cpt_cd', 'ticket_id_seq'])
# Only a handful of rows carry a cpt_suffix value: remove those rows first,
# then remove the column itself.
suffix_rows = cptevents_df.index[cptevents_df['cpt_suffix'].notna()]
cptevents_df = cptevents_df.drop(index=suffix_rows).drop(columns=['cpt_suffix'])

In [39]:
# Project helper; judging by the output printed below, it reports the table
# size and the number of missing values in each column.
commonfunc.nan_count(cptevents_df)

Total columns: 7
Total rows: 1370
--------------
subject_id             0
hadm_id                0
costcenter             0
chartdate           1177
cpt_number             0
sectionheader          0
subsectionheader       0
dtype: int64


In [40]:
# Inspect the column dtypes before any conversion.
cptevents_df.dtypes

subject_id           int64
hadm_id              int64
costcenter          object
chartdate           object
cpt_number           int64
sectionheader       object
subsectionheader    object
dtype: object

In [41]:
# Parse chartdate into datetime values.
cptevents_df = cptevents_df.assign(chartdate=pd.to_datetime(cptevents_df['chartdate']))
# The casts below only matter if cpt_cd / ticket_id_seq are kept; both columns
# were dropped earlier, so they stay disabled.
# cptevents_df['cpt_cd'] = cptevents_df['cpt_cd'].astype(int)
# cptevents_df['ticket_id_seq'] = cptevents_df['ticket_id_seq'].astype(int)

In [42]:
# Read the admissions table and keep just the join key and the admission
# timestamp, parsed as datetime.
admissions_csv = data_set_path + "sampled_data_csv_100/admissions.csv"
admittime_df = (
    commonfunc.read_csv_no_rowid(admissions_csv)[['hadm_id', 'admittime']]
    .assign(admittime=lambda frame: pd.to_datetime(frame['admittime']))
)

In [43]:
# Attach each event's admission time. Every event must match at most one
# admission: validate='many_to_one' makes pandas raise a MergeError if hadm_id
# is duplicated in admittime_df, instead of silently multiplying event rows
# (which would skew the data the synthesizers are trained on).
cptevents_mid_df = pd.merge(
    left=cptevents_df,
    right=admittime_df,
    how='left',
    on=['hadm_id'],
    validate='many_to_one',
)

In [44]:
def time_process(df):
    """Convert one row's chartdate into whole days elapsed since admission.

    Parameters
    ----------
    df : mapping-like row (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must provide ``'chartdate'`` and ``'admittime'`` entries that are
        either missing or datetime-like values exposing ``.date()``.

    Returns
    -------
    int or missing value
        Calendar-day difference ``chartdate - admittime`` (the time of day is
        ignored). When either timestamp is missing, that missing value is
        returned unchanged.
    """
    # A missing chartdate is passed through untouched.
    if pd.isna(df['chartdate']):
        return df['chartdate']
    # A row may have no admittime (e.g. no matching admission after a left
    # join); the date arithmetic below has no valid operand in that case.
    if pd.isna(df['admittime']):
        return df['admittime']
    return (df['chartdate'].date() - df['admittime'].date()).days

In [45]:
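# Disabled: chartdate is dropped from the final table in the next cell, so the
# day-offset conversion is not applied.
# cptevents_mid_df['chartdate'] = cptevents_mid_df.apply(time_process, axis=1)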

In [46]:
# Identifiers and timestamps are left out of the table given to the synthesizers.
unused_columns = ['subject_id', 'hadm_id', 'admittime', 'chartdate']
cptevents_df = cptevents_mid_df.drop(columns=unused_columns)

In [47]:
# Confirm which columns remain in the training table.
cptevents_df.columns

Index(['costcenter', 'cpt_number', 'sectionheader', 'subsectionheader'], dtype='object')

## Metadata

In [48]:
from sdv.metadata import SingleTableMetadata

# Start from an empty metadata object; its columns are filled in by detection below.
metadata = SingleTableMetadata()

In [49]:
# Infer an sdtype for every column of the training table.
# NOTE(review): SDV appears to reject detection on a metadata object that
# already has columns - re-create `metadata` before re-running this cell; confirm.
metadata.detect_from_dataframe(data=cptevents_df)

In [50]:
# Display the auto-detected metadata for review.
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "costcenter": {
            "sdtype": "categorical"
        },
        "cpt_number": {
            "sdtype": "numerical"
        },
        "sectionheader": {
            "sdtype": "categorical"
        },
        "subsectionheader": {
            "sdtype": "categorical"
        }
    }
}

In [51]:
# cpt_number was auto-detected as numerical; override it to categorical.
metadata.update_column(column_name='cpt_number', sdtype='categorical')

In [52]:
import os

# Persist the reviewed metadata under the dataset root.
metadata_path = data_set_path + 'metadata/cptevents/cptevents.json'
# Make sure the target folder exists on a fresh checkout.
os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
# save_to_json refuses to overwrite an existing file, which would stop a
# second "Run All" at this cell; remove the stale copy first.
if os.path.exists(metadata_path):
    os.remove(metadata_path)
metadata.save_to_json(filepath=metadata_path)

In [53]:
# Alias for the training table (same object, not a copy) used by the cells below.
data=cptevents_df

In [54]:
from sdv.lite import SingleTablePreset

# Constraint shared by all synthesizers in this notebook: the four columns are
# declared as one FixedCombinations group.
fixed_constraint = dict(
    constraint_class='FixedCombinations',
    constraint_parameters=dict(
        column_names=['costcenter', 'cpt_number', 'sectionheader', 'subsectionheader'],
    ),
)

# Model 1: the FAST_ML preset.
synthesizer1 = SingleTablePreset(metadata, name='FAST_ML')
synthesizer1.add_constraints(constraints=[fixed_constraint])
synthesizer1.fit(data)

# Draw 100 synthetic rows for the quality comparison.
synthetic_data1 = synthesizer1.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 2289.77it/s]


In [55]:
from sdv.single_table import GaussianCopulaSynthesizer

# Model 2: Gaussian copula, with the same constraint as the preset.
synthesizer2 = GaussianCopulaSynthesizer(metadata)
synthesizer2.add_constraints(constraints=[fixed_constraint])
synthesizer2.fit(data)

# Draw 100 synthetic rows for the quality comparison.
synthetic_data2 = synthesizer2.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 4610.59it/s]


In [56]:
from sdv.single_table import CTGANSynthesizer

# Model 3: CTGAN, with the same constraint as the preset.
synthesizer3 = CTGANSynthesizer(metadata)
synthesizer3.add_constraints(constraints=[fixed_constraint])
synthesizer3.fit(data)

# Draw 100 synthetic rows for the quality comparison.
synthetic_data3 = synthesizer3.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 589.90it/s]


In [57]:
from sdv.single_table import TVAESynthesizer

# Model 4: TVAE, with the same constraint as the preset.
synthesizer4 = TVAESynthesizer(metadata)
synthesizer4.add_constraints(constraints=[fixed_constraint])
synthesizer4.fit(data)

# Draw 100 synthetic rows for the quality comparison.
synthetic_data4 = synthesizer4.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 1802.60it/s]


In [58]:
from sdv.single_table import CopulaGANSynthesizer

# Model 5: CopulaGAN, with the same constraint as the preset.
synthesizer5 = CopulaGANSynthesizer(metadata)
synthesizer5.add_constraints(constraints=[fixed_constraint])
synthesizer5.fit(data)

# Draw 100 synthetic rows for the quality comparison.
synthetic_data5 = synthesizer5.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 483.01it/s]


In [59]:
from sdv.evaluation.single_table import evaluate_quality

# Score the FAST_ML preset sample (synthetic_data1) against the real rows.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data1, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 14.56it/s]


Overall Quality Score: 87.19%

Properties:
Column Shapes: 89.93%
Column Pair Trends: 84.45%





In [60]:
from sdv.evaluation.single_table import evaluate_quality

# Score the Gaussian copula sample (synthetic_data2) against the real rows.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data2, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 16.29it/s]


Overall Quality Score: 61.21%

Properties:
Column Shapes: 67.36%
Column Pair Trends: 55.07%





In [61]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CTGAN sample (synthetic_data3) against the real rows.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data3, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 16.96it/s]


Overall Quality Score: 85.46%

Properties:
Column Shapes: 88.04%
Column Pair Trends: 82.88%





In [62]:
from sdv.evaluation.single_table import evaluate_quality

# Score the TVAE sample (synthetic_data4) against the real rows.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data4, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 14.70it/s]


Overall Quality Score: 81.42%

Properties:
Column Shapes: 84.15%
Column Pair Trends: 78.69%





In [63]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CopulaGAN sample (synthetic_data5) against the real rows.
quality_report = evaluate_quality(real_data=data, synthetic_data=synthetic_data5, metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 14.75it/s]


Overall Quality Score: 86.25%

Properties:
Column Shapes: 88.75%
Column Pair Trends: 83.76%





In [64]:
# Persist the FAST_ML preset synthesizer (it has the highest overall quality
# score of the five reports above). The path is built from data_set_path, as
# the metadata save does, instead of repeating the absolute root.
synthesizer1.save(data_set_path + 'models/cptevents/' + 'cptevents_best_singletablepreset.pkl')
