In [38]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [39]:
import pandas as pd
import numpy as np

import commonfunc

In [40]:
data_set_path="./sampled_data_csv_100/"

In [41]:
admissions_df = commonfunc.read_csv_no_rowid(data_set_path+"admissions.csv")

In [42]:
dob_df = pd.read_csv(data_set_path+"patients.csv").loc[:, ['subject_id', 'dob']]

In [43]:
# Drop useless colums
admissions_df.drop(['diagnosis', 'hospital_expire_flag'], axis=1, inplace=True)

In [44]:
admissions_df = pd.merge(dob_df, admissions_df, how='inner', on='subject_id')

In [45]:
commonfunc.nan_count(admissions_df)

Total columns: 17
Total rows: 132
--------------
subject_id                0
dob                       0
hadm_id                   0
admittime                 0
dischtime                 0
deathtime               121
admission_type            0
admission_location        0
discharge_location        0
insurance                 0
language                 45
religion                  1
marital_status           19
ethnicity                 0
edregtime                62
edouttime                62
has_chartevents_data      0
dtype: int64


In [46]:
admissions_df.dtypes

subject_id               int64
dob                     object
hadm_id                  int64
admittime               object
dischtime               object
deathtime               object
admission_type          object
admission_location      object
discharge_location      object
insurance               object
language                object
religion                object
marital_status          object
ethnicity               object
edregtime               object
edouttime               object
has_chartevents_data     int64
dtype: object

In [47]:
# Transfer some date type
admissions_df['admittime'] = pd.to_datetime(admissions_df['admittime'])
admissions_df['dischtime'] = pd.to_datetime(admissions_df['dischtime'])
admissions_df['deathtime'] = pd.to_datetime(admissions_df['deathtime'])
admissions_df['edregtime'] = pd.to_datetime(admissions_df['edregtime'])
admissions_df['edouttime'] = pd.to_datetime(admissions_df['edouttime'])

admissions_df['dob'] = pd.to_datetime(admissions_df['dob'])

In [48]:
(admissions_df['dob'] < admissions_df['admittime']).value_counts()

True    132
dtype: int64

In [49]:
admissions_df.dtypes

subject_id                       int64
dob                     datetime64[ns]
hadm_id                          int64
admittime               datetime64[ns]
dischtime               datetime64[ns]
deathtime               datetime64[ns]
admission_type                  object
admission_location              object
discharge_location              object
insurance                       object
language                        object
religion                        object
marital_status                  object
ethnicity                       object
edregtime               datetime64[ns]
edouttime               datetime64[ns]
has_chartevents_data             int64
dtype: object

In [50]:
admissions_df = admissions_df.drop(admissions_df.loc[(pd.isna(admissions_df['deathtime']) == False) & (admissions_df['dischtime'] != admissions_df['deathtime'])].index)

In [51]:
import datetime
import random

def adjust_age_over_90(df):
    '''
    This method is to adjust the invalid date in 'dob' (which is 18xx)
    Process: Use the admittime minus 90 - 100 years, to let the age of patients are between 90 to 100
    '''

    years_100 = datetime.timedelta(days = (365 * 100 + 100/4))
    random_days_10_years = datetime.timedelta(days = random.randint(0, 10)*365)
    
    if (df['dob'] - datetime.datetime(1970,1,1)).total_seconds() < 0:
        return (df['admittime'] - years_100 + random_days_10_years)
    else:
        return df['dob']

In [52]:
admissions_df['dob'] = admissions_df.apply(adjust_age_over_90, axis=1)

In [53]:
admissions_df['deathtime'] = admissions_df['deathtime'].apply(lambda flag: 1 if pd.isna(flag) == False else np.NaN)

In [54]:
# admissions_df['edouttime'] = admissions_df.apply(commonfunc.time_process, args=('edregtime', 'edouttime'), axis=1)
# admissions_df['edregtime'] = admissions_df.apply(commonfunc.time_process, args=('admittime', 'edregtime'), axis=1)
# admissions_df['dischtime'] = admissions_df.apply(commonfunc.time_process, args=('admittime', 'dischtime'), axis=1)
# admissions_df['admittime'] = admissions_df.apply(commonfunc.time_process, args=('dob', 'admittime'), axis=1)

In [55]:
admissions_df.drop(['subject_id','dob'], axis=1, inplace=True)

In [56]:
admissions_df.columns

Index(['hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type',
       'admission_location', 'discharge_location', 'insurance', 'language',
       'religion', 'marital_status', 'ethnicity', 'edregtime', 'edouttime',
       'has_chartevents_data'],
      dtype='object')

## Metadata

In [57]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

In [58]:
metadata.detect_from_dataframe(data=admissions_df)

In [59]:
metadata

{
    "columns": {
        "hadm_id": {
            "sdtype": "numerical"
        },
        "admittime": {
            "sdtype": "datetime"
        },
        "dischtime": {
            "sdtype": "datetime"
        },
        "deathtime": {
            "sdtype": "numerical"
        },
        "admission_type": {
            "sdtype": "categorical"
        },
        "admission_location": {
            "sdtype": "categorical"
        },
        "discharge_location": {
            "sdtype": "categorical"
        },
        "insurance": {
            "sdtype": "categorical"
        },
        "language": {
            "sdtype": "categorical"
        },
        "religion": {
            "sdtype": "categorical"
        },
        "marital_status": {
            "sdtype": "categorical"
        },
        "ethnicity": {
            "sdtype": "categorical"
        },
        "edregtime": {
            "sdtype": "datetime"
        },
        "edouttime": {
            "sdtype": "datetime"
    

In [60]:

metadata.update_column(
    column_name='hadm_id',
    sdtype='id',
    regex_format='HID_[0-9]{4,8}'
)

metadata.update_column(
    column_name='has_chartevents_data',
    sdtype='categorical'
)

In [61]:
metadata.set_primary_key(
    column_name='hadm_id'
)

In [85]:
metadata.save_to_json(filepath='C:\\Users\\shrus\\Documents\\Synthetic-data-generation\\metadata\\admissions\\admissions_metadata.json')

In [66]:
time_constraint = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'edregtime',
        'high_column_name': 'edouttime',
        'strict_boundaries': True
    }
}

In [67]:
data=admissions_df

In [69]:
from sdv.lite import SingleTablePreset

synthesizer1 = SingleTablePreset(metadata, name='FAST_ML')
synthesizer1.add_constraints(constraints=[
    time_constraint
])
synthesizer1.fit(data)

synthetic_data1 = synthesizer1.sample(num_rows=len(data))

Sampling rows: 100%|██████████| 132/132 [00:00<00:00, 1769.13it/s]


In [71]:

time_in_time_out_contraint = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'edregtime',
        'high_column_name': 'edouttime',
        'strict_boundaries': True
    }
}

synthesizer1.add_constraints(
    constraints=[time_in_time_out_contraint]
)

synthesizer1.fit(data)
synthetic_data = synthesizer1.sample(num_rows=len(data))

Sampling rows: 100%|██████████| 132/132 [00:00<00:00, 2178.19it/s]


In [72]:
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer2 = GaussianCopulaSynthesizer(metadata)
synthesizer2.fit(data)

synthetic_data2 = synthesizer2.sample(num_rows=len(data))

In [73]:
time_in_time_out_contraint = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'edregtime',
        'high_column_name': 'edouttime',
        'strict_boundaries': True
    }
}

synthesizer2.add_constraints(
    constraints=[time_in_time_out_contraint]
)

synthesizer2.fit(data)
synthetic_data = synthesizer2.sample(num_rows=len(data))

Sampling rows: 100%|██████████| 132/132 [00:00<00:00, 2008.36it/s]


In [75]:
from sdv.single_table import CTGANSynthesizer

synthesizer3 = CTGANSynthesizer(metadata)
synthesizer3.fit(data)

synthetic_data3 = synthesizer3.sample(num_rows=100)

In [76]:
time_in_time_out_contraint = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'edregtime',
        'high_column_name': 'edouttime',
        'strict_boundaries': True
    }
}

synthesizer3.add_constraints(
    constraints=[time_in_time_out_contraint]
)

synthesizer3.fit(data)
synthetic_data = synthesizer3.sample(num_rows=len(data))

Sampling rows: 100%|██████████| 132/132 [00:00<00:00, 530.22it/s]


In [77]:
from sdv.single_table import TVAESynthesizer

synthesizer4 = TVAESynthesizer(metadata)
time_in_time_out_contraint = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'edregtime',
        'high_column_name': 'edouttime',
        'strict_boundaries': True
    }
}

synthesizer4.add_constraints(
    constraints=[time_in_time_out_contraint]
)

synthesizer4.fit(data)
synthetic_data4 = synthesizer3.sample(num_rows=len(data))

Sampling rows: 100%|██████████| 132/132 [00:00<00:00, 563.00it/s]


In [78]:
from sdv.single_table import CopulaGANSynthesizer

synthesizer5 = CopulaGANSynthesizer(metadata)
time_in_time_out_contraint = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'edregtime',
        'high_column_name': 'edouttime',
        'strict_boundaries': True
    }
}

synthesizer5.add_constraints(
    constraints=[time_in_time_out_contraint]
)

synthesizer5.fit(data)
synthetic_data5 = synthesizer5.sample(num_rows=len(data))

Sampling rows: 100%|██████████| 132/132 [00:00<00:00, 534.85it/s]


## Evaluation

In [79]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data1,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  9.75it/s]



Overall Quality Score: 82.27%

Properties:
Column Shapes: 89.63%
Column Pair Trends: 74.91%


In [80]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data2,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  9.56it/s]



Overall Quality Score: 69.66%

Properties:
Column Shapes: 73.89%
Column Pair Trends: 65.43%


In [81]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data3,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  9.54it/s]



Overall Quality Score: 78.02%

Properties:
Column Shapes: 86.62%
Column Pair Trends: 69.43%


In [82]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data4,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 10.56it/s]



Overall Quality Score: 78.08%

Properties:
Column Shapes: 84.29%
Column Pair Trends: 71.88%


In [83]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data5,
    metadata=metadata)

Creating report: 100%|██████████| 4/4 [00:00<00:00, 10.58it/s]



Overall Quality Score: 73.82%

Properties:
Column Shapes: 77.67%
Column Pair Trends: 69.97%


In [84]:
synthesizer1.save('C:/Users/shrus/Documents/Synthetic-data-generation/models/admissions/'+'admissions_best_singletablepreset.pkl')
