In [1]:
# Silence every Python warning for the rest of the kernel session.
# No category or module is given, so the "ignore" action applies to all warnings,
# including deprecation notices raised by the libraries used below.
import warnings
warnings.filterwarnings("ignore")


In [2]:
import os

import pandas as pd
import numpy as np

import commonfunc

In [3]:
# Root directory of the project data. Set the SYNTHETIC_DATA_DIR environment
# variable to point the notebook at another checkout without editing this cell;
# when it is unset (or empty) the original author's local path is used.
data_set_path = (
    os.environ.get('SYNTHETIC_DATA_DIR')
    or 'C:/Users/shrus/Documents/Synthetic-data-generation/'
)
# Later cells build file names with plain string concatenation, so the root
# must end with a path separator.
if not data_set_path.endswith(('/', '\\')):
    data_set_path += '/'

In [4]:
# Location of the sampled microbiologyevents table under the data root.
microbio_csv_path = data_set_path + "sampled_data_csv_100/microbiologyevents.csv"

# NOTE(review): read_csv_no_rowid lives in the project's commonfunc module; going
# by its name it presumably loads the CSV without its row-id column — confirm.
microbio_df = commonfunc.read_csv_no_rowid(microbio_csv_path)

## Data Preprocessing

In [6]:
# Remove the `chartdate` column.
# NOTE(review): the original note says this cell must sit below the
# "Deal with null values" step, yet it currently runs before the dropna cell —
# confirm the intended order.
microbio_df = microbio_df.drop(columns=['chartdate'])

In [7]:
# Number of missing values in each column, before any rows are removed.
microbio_df.isna().sum()

subject_id               0
hadm_id                  0
charttime              171
spec_itemid              0
spec_type_desc           0
org_itemid             753
org_name               753
isolate_num            753
ab_itemid              865
ab_name                865
dilution_text          872
dilution_comparison    872
dilution_value         872
interpretation         865
dtype: int64

In [8]:
# Keep only the rows where all four listed columns are present.
microbio_df = microbio_df.dropna(
    axis='index',
    subset=['subject_id', 'hadm_id', 'spec_itemid', 'charttime'],
)

In [9]:
# Number of missing values in each column after the row filter above.
microbio_df.isna().sum()

subject_id               0
hadm_id                  0
charttime                0
spec_itemid              0
spec_type_desc           0
org_itemid             702
org_name               702
isolate_num            702
ab_itemid              798
ab_name                798
dilution_text          805
dilution_comparison    805
dilution_value         805
interpretation         798
dtype: int64

In [11]:
# Inspect the column dtypes before any type conversion.
microbio_df.dtypes

subject_id               int64
hadm_id                  int64
charttime               object
spec_itemid              int64
spec_type_desc          object
org_itemid             float64
org_name                object
isolate_num            float64
ab_itemid              float64
ab_name                 object
dilution_text           object
dilution_comparison     object
dilution_value         float64
interpretation          object
dtype: object

In [13]:
# Parse the `charttime` strings into pandas datetimes; every other column is untouched.
microbio_df = microbio_df.assign(
    charttime=pd.to_datetime(microbio_df['charttime'])
)


In [26]:
# Remove the admission and patient identifier columns before modelling.
microbio_df = microbio_df.drop(columns=['hadm_id', 'subject_id'])

In [27]:
# Inspect the column dtypes of the frame that will be modelled.
microbio_df.dtypes

charttime              datetime64[ns]
spec_itemid                     int64
spec_type_desc                 object
org_itemid                    float64
org_name                       object
isolate_num                   float64
ab_itemid                     float64
ab_name                        object
dilution_text                  object
dilution_comparison            object
dilution_value                float64
interpretation                 object
dtype: object

In [28]:
# List the columns that remain after preprocessing.
microbio_df.columns

Index(['charttime', 'spec_itemid', 'spec_type_desc', 'org_itemid', 'org_name',
       'isolate_num', 'ab_itemid', 'ab_name', 'dilution_text',
       'dilution_comparison', 'dilution_value', 'interpretation'],
      dtype='object')

## Metadata

In [40]:
from sdv.metadata import SingleTableMetadata

# Start from an empty single-table metadata object; the column definitions are
# filled in by the detect_from_dataframe call in the following cell.
metadata = SingleTableMetadata()

In [41]:
# Let SDV infer an sdtype for every column of the preprocessed frame.
metadata.detect_from_dataframe(data=microbio_df)

In [42]:
# Display the detected metadata to review the inferred sdtypes.
metadata

{
    "columns": {
        "charttime": {
            "sdtype": "datetime"
        },
        "spec_itemid": {
            "sdtype": "numerical"
        },
        "spec_type_desc": {
            "sdtype": "categorical"
        },
        "org_itemid": {
            "sdtype": "numerical"
        },
        "org_name": {
            "sdtype": "categorical"
        },
        "isolate_num": {
            "sdtype": "numerical"
        },
        "ab_itemid": {
            "sdtype": "numerical"
        },
        "ab_name": {
            "sdtype": "categorical"
        },
        "dilution_text": {
            "sdtype": "categorical"
        },
        "dilution_comparison": {
            "sdtype": "categorical"
        },
        "dilution_value": {
            "sdtype": "numerical"
        },
        "interpretation": {
            "sdtype": "categorical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

In [43]:
# Override the auto-detected "numerical" sdtype of these columns with "categorical".
# NOTE(review): presumably because the item ids are codes rather than quantities and
# because the columns are used in the FixedCombinations constraints below — confirm,
# in particular for `dilution_value`.
for numeric_code_column in ('spec_itemid', 'org_itemid', 'ab_itemid', 'dilution_value'):
    metadata.update_column(
        column_name=numeric_code_column,
        sdtype='categorical'
    )

## Model Building

### Constraints

In [44]:
def _fixed_combinations(*column_names):
    """Return an SDV constraint spec of class ``FixedCombinations`` for the given columns.

    NOTE(review): per the SDV constraints documentation this constraint is expected to
    keep the listed columns' values in combinations seen in the training data — confirm
    against the installed SDV version.
    """
    return {
        'constraint_class': 'FixedCombinations',
        'constraint_parameters': {
            'column_names': list(column_names)
        }
    }


# Organism id and organism name are constrained together.
fixed_org_itemid_name = _fixed_combinations('org_itemid', 'org_name')

# Specimen id and specimen description are constrained together.
fixed_spec_itemid_name = _fixed_combinations('spec_itemid', 'spec_type_desc')

# Antibiotic id/name and the three dilution fields are constrained together.
fixed_dilution_constraint = _fixed_combinations(
    'ab_itemid', 'ab_name', 'dilution_text', 'dilution_comparison', 'dilution_value'
)

### Model 

In [45]:
# `data` is a second name for the preprocessed frame (same object, not a copy);
# the model and evaluation cells below refer to it by this name.
data=microbio_df

In [50]:
from sdv.lite import SingleTablePreset

# Model 1: the SDV single-table preset named 'FAST_ML', with the three
# FixedCombinations constraints defined above.
synthesizer1 = SingleTablePreset(metadata, name='FAST_ML')
synthesizer1.add_constraints(
    constraints=[
        fixed_org_itemid_name,
        fixed_spec_itemid_name,
        fixed_dilution_constraint,
    ]
)
synthesizer1.fit(data)

# Sample one synthetic row per real row so both tables have the same size.
synthetic_data1 = synthesizer1.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1348/1348 [00:00<00:00, 16331.28it/s]


In [51]:
from sdv.single_table import GaussianCopulaSynthesizer

# Model 2: GaussianCopulaSynthesizer with the same metadata and the same three
# FixedCombinations constraints.
synthesizer2 = GaussianCopulaSynthesizer(metadata)
synthesizer2.add_constraints(
    constraints=[
        fixed_org_itemid_name,
        fixed_spec_itemid_name,
        fixed_dilution_constraint,
    ]
)
synthesizer2.fit(data)

# Sample one synthetic row per real row so both tables have the same size.
synthetic_data2 = synthesizer2.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1348/1348 [00:00<00:00, 14459.64it/s]


In [52]:
from sdv.single_table import CTGANSynthesizer

# Model 3: CTGANSynthesizer with the same metadata and the same three
# FixedCombinations constraints.
synthesizer3 = CTGANSynthesizer(metadata)
synthesizer3.add_constraints(
    constraints=[
        fixed_org_itemid_name,
        fixed_spec_itemid_name,
        fixed_dilution_constraint,
    ]
)
synthesizer3.fit(data)

# Sample one synthetic row per real row so both tables have the same size.
synthetic_data3 = synthesizer3.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1348/1348 [00:00<00:00, 1355.30it/s]


In [53]:
from sdv.single_table import TVAESynthesizer

# Model 4: TVAESynthesizer with the same metadata and the same three
# FixedCombinations constraints.
synthesizer4 = TVAESynthesizer(metadata)
synthesizer4.add_constraints(
    constraints=[
        fixed_org_itemid_name,
        fixed_spec_itemid_name,
        fixed_dilution_constraint,
    ]
)
synthesizer4.fit(data)

# Sample one synthetic row per real row so both tables have the same size.
synthetic_data4 = synthesizer4.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1348/1348 [00:00<00:00, 6288.00it/s]


In [54]:
from sdv.single_table import CopulaGANSynthesizer

# Model 5: CopulaGANSynthesizer with the same metadata and the same three
# FixedCombinations constraints.
synthesizer5 = CopulaGANSynthesizer(metadata)
synthesizer5.add_constraints(
    constraints=[
        fixed_org_itemid_name,
        fixed_spec_itemid_name,
        fixed_dilution_constraint,
    ]
)
synthesizer5.fit(data)

# Sample one synthetic row per real row so both tables have the same size.
synthetic_data5 = synthesizer5.sample(num_rows=data.shape[0])

Sampling rows: 100%|██████████| 1348/1348 [00:00<00:00, 1366.69it/s]


### Evaluating Performance of Models

In [55]:
from sdv.evaluation.single_table import evaluate_quality

# Quality report for model 1 (SingleTablePreset 'FAST_ML'): real table vs. synthetic_data1.
# Note: every evaluation cell rebinds `quality_report`, so only the most recently
# run report stays available under that name.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data1,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  4.73it/s]



Overall Quality Score: 71.9%

Properties:
Column Shapes: 79.56%
Column Pair Trends: 64.23%


In [56]:
from sdv.evaluation.single_table import evaluate_quality

# Quality report for model 2 (GaussianCopulaSynthesizer): real table vs. synthetic_data2.
# Note: every evaluation cell rebinds `quality_report`, so only the most recently
# run report stays available under that name.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data2,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  4.26it/s]



Overall Quality Score: 62.15%

Properties:
Column Shapes: 74.87%
Column Pair Trends: 49.42%


In [57]:
from sdv.evaluation.single_table import evaluate_quality

# Quality report for model 3 (CTGANSynthesizer): real table vs. synthetic_data3.
# Note: every evaluation cell rebinds `quality_report`, so only the most recently
# run report stays available under that name.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data3,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  4.65it/s]



Overall Quality Score: 67.88%

Properties:
Column Shapes: 82.68%
Column Pair Trends: 53.08%


In [58]:
from sdv.evaluation.single_table import evaluate_quality

# Quality report for model 4 (TVAESynthesizer): real table vs. synthetic_data4.
# Note: every evaluation cell rebinds `quality_report`, so only the most recently
# run report stays available under that name.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data4,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  3.28it/s]



Overall Quality Score: 64.56%

Properties:
Column Shapes: 65.8%
Column Pair Trends: 63.33%


In [59]:
from sdv.evaluation.single_table import evaluate_quality

# Quality report for model 5 (CopulaGANSynthesizer): real table vs. synthetic_data5.
# Note: every evaluation cell rebinds `quality_report`, so only the most recently
# run report stays available under that name.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data5,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  4.89it/s]



Overall Quality Score: 67.3%

Properties:
Column Shapes: 82.89%
Column Pair Trends: 51.71%


In [61]:
# Persist the fitted SingleTablePreset model. The target is built from
# data_set_path so the models directory follows the same project root as the
# input data instead of repeating the absolute path as a literal.
synthesizer1.save(data_set_path + 'models/microbiologyevents/' + 'microbiologyevents_best_singletablepreset.pkl')


In [62]:
# Persist the fitted GaussianCopulaSynthesizer. The target is built from
# data_set_path so the models directory follows the same project root as the
# input data instead of repeating the absolute path as a literal.
synthesizer2.save(data_set_path + 'models/microbiologyevents/' + 'microbiologyevents_gaussiancopula.pkl')


In [63]:
# Persist the fitted CTGANSynthesizer. The target is built from
# data_set_path so the models directory follows the same project root as the
# input data instead of repeating the absolute path as a literal.
synthesizer3.save(data_set_path + 'models/microbiologyevents/' + 'microbiologyevents_ctgan.pkl')


In [64]:
# Persist the fitted TVAESynthesizer. The target is built from
# data_set_path so the models directory follows the same project root as the
# input data instead of repeating the absolute path as a literal.
synthesizer4.save(data_set_path + 'models/microbiologyevents/' + 'microbiologyevents_tvae.pkl')


In [65]:
# Persist the fitted CopulaGANSynthesizer. The target is built from
# data_set_path so the models directory follows the same project root as the
# input data instead of repeating the absolute path as a literal.
synthesizer5.save(data_set_path + 'models/microbiologyevents/' + 'microbiologyevents_copulagan.pkl')
