In [70]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [71]:
import pandas as pd
import numpy as np

import commonfunc

In [72]:
# Root folder of the project data. Other cells build paths by plain string
# concatenation (data_set_path + "sub/folder/file"), so the value must end
# with a path separator.
# The location can be overridden with the SYNTHETIC_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset or empty the original absolute path is used unchanged.
import os

data_set_path = os.path.join(
    os.environ.get('SYNTHETIC_DATA_DIR') or 'C:/Users/shrus/Documents/Synthetic-data-generation/',
    '',  # joining with '' appends a trailing separator only when one is missing
)

In [73]:
# Read prescriptions.csv from the sampled_data_csv_100 folder through the
# project helper (presumably it discards the row-id column - confirm in
# commonfunc).
prescriptions_csv = data_set_path + "sampled_data_csv_100/prescriptions.csv"
prescription_df = commonfunc.read_csv_no_rowid(prescriptions_csv)

In [74]:
# Work on a random subset of at most SAMPLE_SIZE rows.
SAMPLE_SIZE = 1400
prescription_df = prescription_df.sample(
    # sample() raises when asked for more rows than exist (without
    # replacement), so cap the request at the table size.
    n=min(SAMPLE_SIZE, len(prescription_df)),
    # Fixed seed: a Restart & Run All selects the same rows, so the fitted
    # models and quality scores below can be reproduced.
    random_state=42,
)

In [75]:
# Preview the first five sampled rows.
prescription_df.head(n=5)

Unnamed: 0,subject_id,hadm_id,icustay_id,startdate,enddate,drug_type,drug,drug_name_poe,drug_name_generic,formulary_drug_cd,gsn,ndc,prod_strength,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,route
3589,14467,172106,238827.0,2118-08-14,2118-08-24,MAIN,Lansoprazole Oral Suspension,Lansoprazole Oral Suspension,Lansoprazole Oral Suspension,LANS30L,30107.0,300304600.0,30mg/10mL Oral Syringe,30,mg,1,SYR,NG
3148,85781,143870,202174.0,2119-12-09,2119-12-12,MAIN,Oxycodone-Acetaminophen,Oxycodone-Acetaminophen,Oxycodone-Acetaminophen,PERC,4222.0,406051300.0,5mg/325mg Tablet,1-2,TAB,1-2,TAB,PO
3098,76338,130878,260880.0,2152-03-19,2152-03-19,BASE,0.9% Sodium Chloride,,,NS1000,1210.0,338004900.0,1000mL Bag,1000,mL,1000,mL,IV
756,30096,119559,231612.0,2161-08-29,2161-08-30,MAIN,Metoprolol Tartrate,Metoprolol Tartrate,Metoprolol Tartrate,METO5I,19808.0,55390010000.0,5mg/5mL Vial,5,mg,1,VIAL,IV
333,21269,109697,241691.0,2162-10-25,2162-10-25,MAIN,Metoprolol,Metoprolol,Metoprolol,METO25,50631.0,51079030000.0,25mg Tablet,37.5,mg,1.5,TAB,NG


## Data preprocessing

In [76]:
# Discard the columns that are not needed downstream (ids, codes and the
# dose fields); the remaining columns are the ones that get modelled.
unused_columns = [
    'subject_id', 'hadm_id', 'icustay_id',
    'gsn', 'ndc',
    'dose_val_rx', 'dose_unit_rx', 'form_val_disp',
]
prescription_df = prescription_df.drop(columns=unused_columns)

In [77]:
# Check the dtypes of the columns that remain after the drop above.
prescription_df.dtypes

startdate            object
enddate              object
drug_type            object
drug                 object
drug_name_poe        object
drug_name_generic    object
formulary_drug_cd    object
prod_strength        object
form_unit_disp       object
route                object
dtype: object

In [78]:
# Parse both date columns (object dtype so far) into pandas datetimes.
for date_column in ('startdate', 'enddate'):
    prescription_df[date_column] = pd.to_datetime(prescription_df[date_column])


In [79]:
# Count the missing values in every column.
prescription_df.isna().sum()

startdate              0
enddate                0
drug_type              0
drug                   0
drug_name_poe        536
drug_name_generic    536
formulary_drug_cd      0
prod_strength          0
form_unit_disp         0
route                  0
dtype: int64

In [80]:
# Keep only the rows that have an end date; missing values in other
# columns are left in place.
prescription_df = prescription_df.dropna(axis=0, subset=['enddate'])


In [81]:
# List the columns that will be handed to the synthesizers.
prescription_df.columns

Index(['startdate', 'enddate', 'drug_type', 'drug', 'drug_name_poe',
       'drug_name_generic', 'formulary_drug_cd', 'prod_strength',
       'form_unit_disp', 'route'],
      dtype='object')

## Metadata

In [43]:
from sdv.metadata import SingleTableMetadata

# Metadata object that will describe the prescriptions table to SDV; it is
# populated from the dataframe in the next cell.
metadata = SingleTableMetadata()

In [44]:
# Let SDV infer an sdtype for every column of the cleaned dataframe.
metadata.detect_from_dataframe(data=prescription_df)

In [45]:
# Display the detected metadata to review the inferred sdtypes.
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "startdate": {
            "sdtype": "datetime"
        },
        "enddate": {
            "sdtype": "datetime"
        },
        "drug_type": {
            "sdtype": "categorical"
        },
        "drug": {
            "sdtype": "categorical"
        },
        "drug_name_poe": {
            "sdtype": "categorical"
        },
        "drug_name_generic": {
            "sdtype": "categorical"
        },
        "formulary_drug_cd": {
            "sdtype": "categorical"
        },
        "prod_strength": {
            "sdtype": "categorical"
        },
        "form_unit_disp": {
            "sdtype": "categorical"
        },
        "route": {
            "sdtype": "categorical"
        }
    }
}

## Constraints

In [53]:
# SDV constraint describing the rule "a prescription cannot end before it
# starts" (startdate <= enddate).
#
# strict_boundaries is False on purpose: the real data contains
# prescriptions that start and end on the same day (startdate == enddate),
# and a strict inequality would reject those valid rows.
#
# NOTE(review): no synthesizer in the visible cells registers this dict
# (e.g. through add_constraints), so it currently has no effect on the
# generated data - confirm whether it should be applied before fitting.
startdate_enddate_constraint = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'startdate',
        'high_column_name': 'enddate',
        'strict_boundaries': False
    }
}

## Modelling

In [54]:
# Training data for the modelling cells below. This is an alias of
# prescription_df (same object, not a copy).
data = prescription_df

In [56]:
from sdv.lite import SingleTablePreset

# Model 1: the FAST_ML preset. Fit it on the real table, then draw as many
# synthetic rows as there are real ones.
synthesizer1 = SingleTablePreset(metadata, name='FAST_ML')
synthesizer1.fit(data)

real_row_count = len(data)
synthetic_data1 = synthesizer1.sample(num_rows=real_row_count)

In [61]:
from sdv.single_table import GaussianCopulaSynthesizer

# Model 2: Gaussian copula. Fit it on the real table, then draw as many
# synthetic rows as there are real ones.
synthesizer2 = GaussianCopulaSynthesizer(metadata)
synthesizer2.fit(data)

real_row_count = len(data)
synthetic_data2 = synthesizer2.sample(num_rows=real_row_count)

In [62]:
from sdv.single_table import CTGANSynthesizer

# Model 3: CTGAN. Fit it on the real table, then draw as many synthetic
# rows as there are real ones.
synthesizer3 = CTGANSynthesizer(metadata)
synthesizer3.fit(data)

real_row_count = len(data)
synthetic_data3 = synthesizer3.sample(num_rows=real_row_count)

In [63]:
from sdv.single_table import TVAESynthesizer

# Model 4: TVAE. Fit it on the real table, then draw as many synthetic
# rows as there are real ones.
synthesizer4 = TVAESynthesizer(metadata)
synthesizer4.fit(data)

real_row_count = len(data)
synthetic_data4 = synthesizer4.sample(num_rows=real_row_count)

In [64]:
from sdv.single_table import CopulaGANSynthesizer

# Model 5: CopulaGAN. Fit it on the real table, then draw as many synthetic
# rows as there are real ones.
synthesizer5 = CopulaGANSynthesizer(metadata)
synthesizer5.fit(data)

real_row_count = len(data)
synthetic_data5 = synthesizer5.sample(num_rows=real_row_count)

In [57]:
from sdv.evaluation.single_table import evaluate_quality

# Score the FAST_ML preset output (synthetic_data1) against the real data.
# quality_report is overwritten by each of the evaluation cells.
quality_report = evaluate_quality(metadata=metadata, real_data=data, synthetic_data=synthetic_data1)

Creating report: 100%|██████████| 4/4 [00:02<00:00,  1.74it/s]



Overall Quality Score: 60.77%

Properties:
Column Shapes: 80.93%
Column Pair Trends: 40.62%


In [65]:
from sdv.evaluation.single_table import evaluate_quality

# Score the Gaussian copula output (synthetic_data2) against the real data.
# quality_report is overwritten by each of the evaluation cells.
quality_report = evaluate_quality(metadata=metadata, real_data=data, synthetic_data=synthetic_data2)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.10it/s]



Overall Quality Score: 48.51%

Properties:
Column Shapes: 69.25%
Column Pair Trends: 27.77%


In [66]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CTGAN output (synthetic_data3) against the real data.
# quality_report is overwritten by each of the evaluation cells.
quality_report = evaluate_quality(metadata=metadata, real_data=data, synthetic_data=synthetic_data3)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.48it/s]



Overall Quality Score: 54.82%

Properties:
Column Shapes: 74.23%
Column Pair Trends: 35.41%


In [67]:
from sdv.evaluation.single_table import evaluate_quality

# Score the TVAE output (synthetic_data4) against the real data.
# quality_report is overwritten by each of the evaluation cells.
quality_report = evaluate_quality(metadata=metadata, real_data=data, synthetic_data=synthetic_data4)

Creating report: 100%|██████████| 4/4 [00:02<00:00,  1.75it/s]



Overall Quality Score: 57.11%

Properties:
Column Shapes: 69.09%
Column Pair Trends: 45.14%


In [68]:
from sdv.evaluation.single_table import evaluate_quality

# Score the CopulaGAN output (synthetic_data5) against the real data.
# quality_report is overwritten by each of the evaluation cells.
quality_report = evaluate_quality(metadata=metadata, real_data=data, synthetic_data=synthetic_data5)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.29it/s]



Overall Quality Score: 51.68%

Properties:
Column Shapes: 72.11%
Column Pair Trends: 31.25%


In [69]:
import os

# Persist synthesizer1 (the FAST_ML preset), which has the highest Overall
# Quality Score among the five models in the stored run (60.77%).
# The target folder is derived from data_set_path instead of repeating the
# absolute project root, so the notebook has a single place to change it.
model_dir = data_set_path + 'models/prescription/'
# Create the folder up front: writing the pickle into a directory that does
# not exist would otherwise raise.
os.makedirs(model_dir, exist_ok=True)
synthesizer1.save(model_dir + 'prescription_best_singletablepreset.pkl')
