In [41]:
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and dtype warnings
# raised by the libraries used below — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [42]:
import pandas as pd
import numpy as np

import commonfunc

In [43]:
# Directory holding the sampled CSV files, relative to the working directory.
# The trailing slash matters: later cells build file paths by string concatenation.
data_set_path="./sampled_data_csv_100/"

In [44]:
# Load the admissions table through the project helper.
# NOTE(review): presumably read_csv_no_rowid drops the row-id column — confirm in commonfunc.
admissions_df = commonfunc.read_csv_no_rowid(data_set_path+"admissions.csv")

In [45]:
# Only the patient id and date of birth are needed from the patients table.
dob_df = pd.read_csv(data_set_path + "patients.csv")[['subject_id', 'dob']]

In [46]:
# Drop columns that are not used in the rest of this notebook.
admissions_df.drop(columns=['diagnosis', 'hospital_expire_flag'], inplace=True)

In [47]:
# Attach each patient's date of birth to their admissions (dob columns come first).
admissions_df = dob_df.merge(admissions_df, how='inner', on='subject_id')

In [48]:
# Report the column/row totals and the number of missing values per column.
commonfunc.nan_count(admissions_df)

Total columns: 17
Total rows: 132
--------------
subject_id                0
dob                       0
hadm_id                   0
admittime                 0
dischtime                 0
deathtime               121
admission_type            0
admission_location        0
discharge_location        0
insurance                 0
language                 45
religion                  1
marital_status           19
ethnicity                 0
edregtime                62
edouttime                62
has_chartevents_data      0
dtype: int64


In [49]:
# Inspect the column dtypes before the date columns are parsed.
admissions_df.dtypes

subject_id               int64
dob                     object
hadm_id                  int64
admittime               object
dischtime               object
deathtime               object
admission_type          object
admission_location      object
discharge_location      object
insurance               object
language                object
religion                object
marital_status          object
ethnicity               object
edregtime               object
edouttime               object
has_chartevents_data     int64
dtype: object

In [50]:
# Parse the object-typed timestamp columns into datetime64 values.
for date_col in ('admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime', 'dob'):
    admissions_df[date_col] = pd.to_datetime(admissions_df[date_col])

In [51]:
# Sanity check: count how many rows have a date of birth earlier than the admission time.
admissions_df['dob'].lt(admissions_df['admittime']).value_counts()

True    132
dtype: int64

In [52]:
# Confirm the date columns are now datetime64[ns].
admissions_df.dtypes

subject_id                       int64
dob                     datetime64[ns]
hadm_id                          int64
admittime               datetime64[ns]
dischtime               datetime64[ns]
deathtime               datetime64[ns]
admission_type                  object
admission_location              object
discharge_location              object
insurance                       object
language                        object
religion                        object
marital_status                  object
ethnicity                       object
edregtime               datetime64[ns]
edouttime               datetime64[ns]
has_chartevents_data             int64
dtype: object

In [53]:
# Remove admissions that record a death time which differs from the discharge time.
admissions_df = admissions_df.drop(
    index=admissions_df.index[
        admissions_df['deathtime'].notna()
        & admissions_df['dischtime'].ne(admissions_df['deathtime'])
    ]
)

In [54]:
import datetime
import random

def adjust_age_over_90(df):
    '''
    Replace an implausible date of birth (anything before 1970-01-01, e.g. 18xx)
    with one derived from the admission time.

    The replacement is ``admittime`` minus 100 years plus a random whole number
    of 365-day years between 0 and 10, so the patient ends up roughly 90 to 100
    years old at admission.

    Parameters:
        df: a single record exposing 'dob' and 'admittime' (the function is
            applied row-wise with ``axis=1``).

    Returns:
        The adjusted date of birth, or the original 'dob' when it is not
        earlier than 1970-01-01.
    '''
    # The random offset is drawn on every call, before the row is inspected,
    # so the global `random` stream advances once per row regardless of outcome.
    century = datetime.timedelta(days=365 * 100 + 100 / 4)
    offset_years = random.randint(0, 10)
    random_offset = datetime.timedelta(days=offset_years * 365)

    epoch = datetime.datetime(1970, 1, 1)
    seconds_since_epoch = (df['dob'] - epoch).total_seconds()

    if not seconds_since_epoch < 0:
        return df['dob']
    return df['admittime'] - century + random_offset

In [55]:
# Apply the adjustment row by row (axis=1) and overwrite 'dob' with the result.
admissions_df['dob'] = admissions_df.apply(adjust_age_over_90, axis=1)

In [56]:
# Collapse 'deathtime' into an indicator: 1 when a death time is recorded, NaN otherwise.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
admissions_df['deathtime'] = admissions_df['deathtime'].apply(
    lambda death_time: np.nan if pd.isna(death_time) else 1
)

In [57]:
# Disabled step: each line would overwrite a time column with the result of
# commonfunc.time_process applied to a pair of columns.
# NOTE(review): time_process is not shown here — remove this cell if the step is no longer needed.
# admissions_df['edouttime'] = admissions_df.apply(commonfunc.time_process, args=('edregtime', 'edouttime'), axis=1)
# admissions_df['edregtime'] = admissions_df.apply(commonfunc.time_process, args=('admittime', 'edregtime'), axis=1)
# admissions_df['dischtime'] = admissions_df.apply(commonfunc.time_process, args=('admittime', 'dischtime'), axis=1)
# admissions_df['admittime'] = admissions_df.apply(commonfunc.time_process, args=('dob', 'admittime'), axis=1)

In [58]:
# The patient id and date of birth are removed before the table is modelled.
admissions_df.drop(columns=['subject_id', 'dob'], inplace=True)

## Metadata

In [59]:
from sdv.metadata import SingleTableMetadata

# Empty metadata object; it is populated from the dataframe in the next cell.
metadata = SingleTableMetadata()

In [60]:
# Auto-detect a column type (sdtype) for every column of the cleaned admissions frame.
metadata.detect_from_dataframe(data=admissions_df)

In [61]:
# Show the detected metadata so the sdtypes can be reviewed before they are corrected.
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "hadm_id": {
            "sdtype": "numerical"
        },
        "admittime": {
            "sdtype": "datetime"
        },
        "dischtime": {
            "sdtype": "datetime"
        },
        "deathtime": {
            "sdtype": "numerical"
        },
        "admission_type": {
            "sdtype": "categorical"
        },
        "admission_location": {
            "sdtype": "categorical"
        },
        "discharge_location": {
            "sdtype": "categorical"
        },
        "insurance": {
            "sdtype": "categorical"
        },
        "language": {
            "sdtype": "categorical"
        },
        "religion": {
            "sdtype": "categorical"
        },
        "marital_status": {
            "sdtype": "categorical"
        },
        "ethnicity": {
            "sdtype": "categorical"
        },
        "edregtime": {
            "sdtype": "datetime"
        },
        "edo

In [62]:

# Auto-detection marked hadm_id as numerical; treat it as an identifier whose
# synthetic values follow the pattern HID_ plus 4 to 8 digits.
metadata.update_column(
    column_name='hadm_id',
    sdtype='id',
    regex_format='HID_[0-9]{4,8}'
)

# has_chartevents_data is an integer column in the dataframe; model it as a category.
metadata.update_column(
    column_name='has_chartevents_data',
    sdtype='categorical'
)

In [63]:
# Declare hadm_id as the table's primary key.
metadata.set_primary_key(
    column_name='hadm_id'
)

In [64]:
# Persist the metadata as JSON.
# NOTE(review): absolute, user-specific Windows path — this cell fails on any other machine;
# consider a path relative to a configurable project directory.
# NOTE(review): SDV may refuse to overwrite an existing file on re-run — confirm for the installed version.
metadata.save_to_json(filepath='C:\\Users\\shrus\\Documents\\Synthetic-data-generation\\metadata\\admissions\\admissions_metadata.json')

In [65]:
# Alias (not a copy) of the cleaned admissions frame; every synthesizer below is fitted on it.
data=admissions_df

In [66]:
from sdv.lite import SingleTablePreset

# Baseline model: the FAST_ML preset, fitted without any constraints.
synthesizer1 = SingleTablePreset(metadata, name='FAST_ML')
synthesizer1.fit(data)

# Draw 100 synthetic admissions from the unconstrained model.
synthetic_data1 = synthesizer1.sample(num_rows=100)

In [67]:
# Preview the unconstrained FAST_ML sample.
synthetic_data1

Unnamed: 0,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,religion,marital_status,ethnicity,edregtime,edouttime,has_chartevents_data
0,HID_0000,2168-01-18 22:54:05,2168-01-05 10:06:01,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Private,ENGL,UNOBTAINABLE,SINGLE,WHITE,2166-02-10 16:50:33,2166-02-14 04:09:38,1
1,HID_0001,2113-03-11 08:17:24,2113-03-24 02:01:29,,EMERGENCY,CLINIC REFERRAL/PREMATURE,SNF,Private,,CATHOLIC,MARRIED,WHITE,NaT,2140-03-16 05:12:12,1
2,HID_0002,2138-03-13 12:41:41,2138-04-03 16:50:43,,EMERGENCY,CLINIC REFERRAL/PREMATURE,DISC-TRAN CANCER/CHLDRN H,Medicare,ENGL,CATHOLIC,SINGLE,WHITE,2156-12-12 13:56:27,NaT,1
3,HID_0003,2132-04-19 10:50:36,2132-04-28 02:03:31,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,2137-03-24 04:00:18,NaT,1
4,HID_0004,2115-01-12 14:32:56,2115-01-31 00:38:27,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,,CATHOLIC,WIDOWED,WHITE,2135-12-15 14:03:03,NaT,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,HID_0095,2145-06-09 19:44:36,2145-06-06 01:19:28,,URGENT,EMERGENCY ROOM ADMIT,HOME,Medicare,,UNOBTAINABLE,SINGLE,WHITE,2154-02-14 14:49:30,2154-02-20 18:11:34,1
96,HID_0096,2105-10-08 17:58:50,2105-10-19 13:05:19,,EMERGENCY,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,SINGLE,WHITE,NaT,NaT,1
97,HID_0097,2154-03-24 10:47:46,2154-04-10 09:33:41,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Medicare,,CATHOLIC,,WHITE,NaT,NaT,1
98,HID_0098,2152-08-10 03:08:51,2152-09-05 13:47:25,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,ENGL,CATHOLIC,MARRIED,WHITE,2183-03-27 08:56:18,2183-03-28 06:13:02,1


In [68]:

# Require the ED registration time to be strictly earlier than the ED exit time.
time_in_time_out_contraint = dict(
    constraint_class='Inequality',
    constraint_parameters=dict(
        low_column_name='edregtime',
        high_column_name='edouttime',
        strict_boundaries=True,
    ),
)
synthesizer1.add_constraints(constraints=[time_in_time_out_contraint])

# Refit after adding the constraint, then draw a fresh sample.
synthesizer1.fit(data)
synthetic_data = synthesizer1.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 621.16it/s]


In [69]:
# Save the fitted FAST_ML preset model.
# NOTE(review): absolute, user-specific path — not portable to other machines.
synthesizer1.save('C:/Users/shrus/Documents/Synthetic-data-generation/models/admissions/'+'admissions_singletablepreset.pkl')


In [177]:
from sdv.single_table import GaussianCopulaSynthesizer

# Gaussian copula model, first fitted without constraints.
synthesizer2 = GaussianCopulaSynthesizer(metadata)
synthesizer2.fit(data)

# Draw 100 synthetic admissions from the unconstrained model.
synthetic_data2 = synthesizer2.sample(num_rows=100)

In [191]:
# Require the ED registration time to be strictly earlier than the ED exit time.
time_in_time_out_contraint = dict(
    constraint_class='Inequality',
    constraint_parameters=dict(
        low_column_name='edregtime',
        high_column_name='edouttime',
        strict_boundaries=True,
    ),
)
synthesizer2.add_constraints(constraints=[time_in_time_out_contraint])

# Refit after adding the constraint, then draw a fresh sample.
synthesizer2.fit(data)
synthetic_data = synthesizer2.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 964.50it/s]


In [196]:
# Save the fitted Gaussian copula model.
# NOTE(review): absolute, user-specific path — not portable to other machines.
synthesizer2.save('C:/Users/shrus/Documents/Synthetic-data-generation/models/admissions/'+'admissions_gaussiancopula.pkl')


In [179]:
from sdv.single_table import CTGANSynthesizer

# CTGAN model, first fitted without constraints.
synthesizer3 = CTGANSynthesizer(metadata)
synthesizer3.fit(data)

# Draw 100 synthetic admissions from the unconstrained model.
synthetic_data3 = synthesizer3.sample(num_rows=100)

In [193]:
# Require the ED registration time to be strictly earlier than the ED exit time.
time_in_time_out_contraint = dict(
    constraint_class='Inequality',
    constraint_parameters=dict(
        low_column_name='edregtime',
        high_column_name='edouttime',
        strict_boundaries=True,
    ),
)
synthesizer3.add_constraints(constraints=[time_in_time_out_contraint])

# Refit after adding the constraint, then draw a fresh sample.
synthesizer3.fit(data)
synthetic_data = synthesizer3.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 598.52it/s]


In [194]:
# Save the fitted CTGAN model.
# NOTE(review): absolute, user-specific path — not portable to other machines.
synthesizer3.save('C:/Users/shrus/Documents/Synthetic-data-generation/models/admissions/'+'admissions_ctgan.pkl')


In [197]:
from sdv.single_table import TVAESynthesizer

# TVAE model, first fitted without constraints.
synthesizer4 = TVAESynthesizer(metadata)
synthesizer4.fit(data)

# Only 10 rows are drawn here, unlike the 100 used for the earlier synthesizers.
synthetic_data4 = synthesizer4.sample(num_rows=10)

In [198]:
# Require the ED registration time to be strictly earlier than the ED exit time.
time_in_time_out_contraint = {
    'constraint_class': 'Inequality',
    'constraint_parameters': {
        'low_column_name': 'edregtime',
        'high_column_name': 'edouttime',
        'strict_boundaries': True
    }
}

synthesizer4.add_constraints(
    constraints=[time_in_time_out_contraint]
)

# Refit after adding the constraint, then sample from the TVAE model itself.
# The sample previously came from synthesizer3 (the CTGAN model), so
# synthetic_data4 never contained TVAE output.
synthesizer4.fit(data)
synthetic_data4 = synthesizer4.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 445.80it/s]


In [204]:
# Save the fitted TVAE model.
# NOTE(review): absolute, user-specific path — not portable to other machines.
synthesizer4.save('C:/Users/shrus/Documents/Synthetic-data-generation/models/admissions/'+'admissions_tvae.pkl')


In [200]:
from sdv.single_table import CopulaGANSynthesizer

# CopulaGAN model, first fitted without constraints.
synthesizer5 = CopulaGANSynthesizer(metadata)
synthesizer5.fit(data)

# Only 10 rows are drawn here, unlike the 100 used for the earlier synthesizers.
synthetic_data5 = synthesizer5.sample(num_rows=10)

In [202]:
# Require the ED registration time to be strictly earlier than the ED exit time.
time_in_time_out_contraint = dict(
    constraint_class='Inequality',
    constraint_parameters=dict(
        low_column_name='edregtime',
        high_column_name='edouttime',
        strict_boundaries=True,
    ),
)
synthesizer5.add_constraints(constraints=[time_in_time_out_contraint])

# Refit after adding the constraint, then draw a fresh sample.
synthesizer5.fit(data)
synthetic_data5 = synthesizer5.sample(num_rows=100)

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 418.16it/s]


In [205]:
# Save the fitted CopulaGAN model.
# NOTE(review): absolute, user-specific path — not portable to other machines.
synthesizer5.save('C:/Users/shrus/Documents/Synthetic-data-generation/models/admissions/'+'admissions_copulaGAN.pkl')


In [187]:
# Preview the most recent constrained sample.
# NOTE(review): `synthetic_data` is reassigned by several cells, so what is shown depends on execution order.
synthetic_data

Unnamed: 0,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,religion,marital_status,ethnicity,edregtime,edouttime,has_chartevents_data
0,HID_0000,2142-06-09 01:02:39,2142-06-19 21:15:58,,EMERGENCY,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,NOT SPECIFIED,MARRIED,WHITE,2166-08-21 00:35:41,2166-08-21 05:01:58,1
1,HID_0001,2149-05-25 08:59:27,2149-06-04 04:30:55,,ELECTIVE,EMERGENCY ROOM ADMIT,HOME,Private,ENGL,CATHOLIC,MARRIED,UNKNOWN/NOT SPECIFIED,NaT,NaT,1
2,HID_0002,2107-10-29 21:16:55,2107-10-26 00:48:06,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,,NOT SPECIFIED,MARRIED,WHITE,NaT,NaT,1
3,HID_0003,2165-10-01 19:53:18,2165-09-25 13:47:21,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Private,,UNOBTAINABLE,SINGLE,WHITE,NaT,NaT,1
4,HID_0004,2101-04-16 03:42:06,2101-04-23 16:25:24,,EMERGENCY,PHYS REFERRAL/NORMAL DELI,SNF,Medicare,ENGL,CATHOLIC,SEPARATED,BLACK/AFRICAN AMERICAN,2133-04-02 07:45:35,2133-04-02 13:01:14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,HID_0095,2153-10-01 05:35:46,2153-10-30 01:47:14,,EMERGENCY,PHYS REFERRAL/NORMAL DELI,SNF,Medicare,ENGL,CATHOLIC,SINGLE,WHITE,2100-08-13 14:26:47,2100-08-13 16:37:41,1
96,HID_0096,2189-08-13 02:28:35,2189-08-26 08:13:46,,NEWBORN,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,ENGL,OTHER,,WHITE,2163-02-07 01:05:02,2163-02-07 04:35:38,1
97,HID_0097,2141-12-15 06:28:53,2141-12-16 01:43:42,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Private,CANT,CATHOLIC,MARRIED,WHITE,NaT,NaT,1
98,HID_0098,2121-12-13 17:14:50,2122-01-09 11:02:56,,EMERGENCY,CLINIC REFERRAL/PREMATURE,SNF,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,2163-04-13 10:32:02,2163-04-13 13:05:28,1
