In [120]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter would also hide deprecation warnings from
# the libraries used below; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [121]:
import pandas as pd
import numpy as np

import commonfunc

In [122]:
# Directory holding the sampled CSV tables read below. The trailing slash is
# required because file names are appended by plain string concatenation.
data_set_path="./sampled_data_csv_100/"

In [123]:
# Load the admissions table through the project helper.
# NOTE(review): judging by its name, read_csv_no_rowid presumably drops the
# row-id column while reading - confirm against commonfunc.
admissions_df = commonfunc.read_csv_no_rowid(data_set_path+"admissions.csv")

In [124]:
# Only the patient key and date of birth are needed from the patients table.
patients_csv_path = data_set_path + "patients.csv"
dob_df = pd.read_csv(patients_csv_path)[['subject_id', 'dob']]

In [125]:
# Drop columns that are not used in the rest of the notebook.
admissions_df = admissions_df.drop(columns=['diagnosis', 'hospital_expire_flag'])

In [126]:
# Attach each patient's date of birth to their admissions (inner join on
# subject_id, with the dob columns placed first).
admissions_df = dob_df.merge(admissions_df, how='inner', on='subject_id')

In [127]:
# Print the frame's dimensions and the number of missing values per column
# (project helper; its report is the output shown below).
commonfunc.nan_count(admissions_df)

Total columns: 17
Total rows: 132
--------------
subject_id                0
dob                       0
hadm_id                   0
admittime                 0
dischtime                 0
deathtime               121
admission_type            0
admission_location        0
discharge_location        0
insurance                 0
language                 45
religion                  1
marital_status           19
ethnicity                 0
edregtime                62
edouttime                62
has_chartevents_data      0
dtype: int64


In [128]:
# Inspect column dtypes before conversion: all time columns are still plain
# objects (strings) at this point.
admissions_df.dtypes

subject_id               int64
dob                     object
hadm_id                  int64
admittime               object
dischtime               object
deathtime               object
admission_type          object
admission_location      object
discharge_location      object
insurance               object
language                object
religion                object
marital_status          object
ethnicity               object
edregtime               object
edouttime               object
has_chartevents_data     int64
dtype: object

In [129]:
# Convert every timestamp column from its raw CSV representation to datetime64.
datetime_columns = ['admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime', 'dob']
for column in datetime_columns:
    admissions_df[column] = pd.to_datetime(admissions_df[column])

In [130]:
# Sanity check: count the rows where the birth date precedes the admission time.
admissions_df['dob'].lt(admissions_df['admittime']).value_counts()

True    132
dtype: int64

In [131]:
# Re-inspect dtypes to confirm that the time columns are now datetime64[ns].
admissions_df.dtypes

subject_id                       int64
dob                     datetime64[ns]
hadm_id                          int64
admittime               datetime64[ns]
dischtime               datetime64[ns]
deathtime               datetime64[ns]
admission_type                  object
admission_location              object
discharge_location              object
insurance                       object
language                        object
religion                        object
marital_status                  object
ethnicity                       object
edregtime               datetime64[ns]
edouttime               datetime64[ns]
has_chartevents_data             int64
dtype: object

In [132]:
# Remove admissions that have a recorded death time which does not coincide
# with the discharge time; rows without a death time are always kept.
has_deathtime = admissions_df['deathtime'].notna()
death_differs_from_discharge = admissions_df['dischtime'] != admissions_df['deathtime']
rows_to_drop = admissions_df.loc[has_deathtime & death_differs_from_discharge].index
admissions_df = admissions_df.drop(rows_to_drop)

In [133]:
import datetime
import random

def adjust_age_over_90(df):
    """
    Return a plausible date of birth for one admission row.

    A 'dob' earlier than 1970-01-01 is treated as invalid (the original notes
    describe these as 18xx dates). For such rows the birth date is rebuilt as
    the admission time minus 100 years (36525 days) plus a random whole number
    of 365-day years in [0, 10], so the age at admission falls between roughly
    90 and 100 years. Any other 'dob' is returned unchanged.

    Parameters:
        df: a row-like mapping (e.g. the Series passed by DataFrame.apply with
            axis=1) exposing 'dob' and 'admittime' timestamps.

    Returns:
        The original or the re-derived date of birth.
    """
    century = datetime.timedelta(days=(365 * 100 + 100/4))
    # The offset is drawn on every call, even when it ends up unused, so the
    # global random stream advances once per row.
    year_offset = datetime.timedelta(days=random.randint(0, 10)*365)

    unix_epoch = datetime.datetime(1970,1,1)
    seconds_since_epoch = (df['dob'] - unix_epoch).total_seconds()

    # Guard clause: dates on or after the epoch (and missing values, whose
    # comparison is False) pass through untouched.
    if not seconds_since_epoch < 0:
        return df['dob']
    return df['admittime'] - century + year_offset

In [134]:
# Rewrite 'dob' row by row: adjust_age_over_90 replaces dates before 1970 with
# a value derived from 'admittime' and leaves all other dates unchanged.
admissions_df['dob'] = admissions_df.apply(adjust_age_over_90, axis=1)

In [135]:
# Collapse 'deathtime' into a flag kept under the same column name:
# 1 when a death time is recorded, NaN otherwise.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
admissions_df['deathtime'] = admissions_df['deathtime'].apply(
    lambda death_ts: 1 if pd.notna(death_ts) else np.nan
)

In [136]:
# admissions_df['edouttime'] = admissions_df.apply(commonfunc.time_process, args=('edregtime', 'edouttime'), axis=1)
# admissions_df['edregtime'] = admissions_df.apply(commonfunc.time_process, args=('admittime', 'edregtime'), axis=1)
# admissions_df['dischtime'] = admissions_df.apply(commonfunc.time_process, args=('admittime', 'dischtime'), axis=1)
# admissions_df['admittime'] = admissions_df.apply(commonfunc.time_process, args=('dob', 'admittime'), axis=1)

In [137]:
# The patient key and the birth date are not passed on to the synthesizer.
admissions_df = admissions_df.drop(columns=['subject_id', 'dob'])

## Metadata

In [138]:
from sdv.metadata import SingleTableMetadata

# Start from an empty single-table metadata object; columns are filled in by
# detection and manual overrides in the following cells.
metadata = SingleTableMetadata()

In [139]:
# Let SDV infer an sdtype for every column of the cleaned admissions frame.
metadata.detect_from_dataframe(data=admissions_df)

In [140]:
# Display the detected metadata to review the inferred sdtypes.
metadata

{
    "columns": {
        "hadm_id": {
            "sdtype": "numerical"
        },
        "admittime": {
            "sdtype": "datetime"
        },
        "dischtime": {
            "sdtype": "datetime"
        },
        "deathtime": {
            "sdtype": "numerical"
        },
        "admission_type": {
            "sdtype": "categorical"
        },
        "admission_location": {
            "sdtype": "categorical"
        },
        "discharge_location": {
            "sdtype": "categorical"
        },
        "insurance": {
            "sdtype": "categorical"
        },
        "language": {
            "sdtype": "categorical"
        },
        "religion": {
            "sdtype": "categorical"
        },
        "marital_status": {
            "sdtype": "categorical"
        },
        "ethnicity": {
            "sdtype": "categorical"
        },
        "edregtime": {
            "sdtype": "datetime"
        },
        "edouttime": {
            "sdtype": "datetime"
    

In [141]:

# Manual corrections to the auto-detected metadata, applied in order:
# hadm_id becomes an id column with a regex format, and has_chartevents_data
# is treated as categorical.
column_overrides = {
    'hadm_id': {'sdtype': 'id', 'regex_format': 'HID_[0-9]{4,8}'},
    'has_chartevents_data': {'sdtype': 'categorical'},
}

for overridden_column, override_kwargs in column_overrides.items():
    metadata.update_column(column_name=overridden_column, **override_kwargs)

In [142]:
# Declare hadm_id as the table's primary key.
metadata.set_primary_key(
    column_name='hadm_id'
)

In [143]:
# Persist the metadata as JSON.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell
# only runs on the original author's machine; consider a path relative to the
# project or a configurable output directory.
metadata.save_to_json(filepath='C:\\Users\\shrus\\Documents\\Synthetic-data-generation\\metadata\\admissions\\admissions_metadata.json')

In [144]:
# Alias for the cleaned admissions frame (same object, not a copy); this is
# the training data handed to the synthesizer.
data=admissions_df

In [145]:
from sdv.lite import SingleTablePreset

# Build the 'FAST_ML' preset synthesizer from the metadata and fit it on the
# cleaned admissions data.
synthesizer1 = SingleTablePreset(metadata, name='FAST_ML')
synthesizer1.fit(data)

# Draw 100 synthetic admissions.
# NOTE(review): no constraints are registered before fitting, and the sample
# shown in this notebook contains rows whose dischtime precedes admittime -
# consider constraining the ordering of the time columns.
# NOTE(review): no random seed is set here, so repeated runs presumably yield
# different samples - confirm whether reproducibility is required.
synthetic_data1 = synthesizer1.sample(num_rows=100)

In [146]:
# Display the generated synthetic admissions for a visual check.
synthetic_data1

Unnamed: 0,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,religion,marital_status,ethnicity,edregtime,edouttime,has_chartevents_data
0,0,2168-01-18 22:54:05,2168-01-05 10:06:01,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Private,ENGL,UNOBTAINABLE,SINGLE,WHITE,2166-02-10 16:50:33,2166-02-14 04:09:38,1
1,1,2113-03-11 08:17:24,2113-03-24 02:01:29,,EMERGENCY,CLINIC REFERRAL/PREMATURE,SNF,Private,,CATHOLIC,MARRIED,WHITE,NaT,2140-03-16 05:12:12,1
2,2,2138-03-13 12:41:41,2138-04-03 16:50:43,,EMERGENCY,CLINIC REFERRAL/PREMATURE,DISC-TRAN CANCER/CHLDRN H,Medicare,ENGL,CATHOLIC,SINGLE,WHITE,2156-12-12 13:56:27,NaT,1
3,3,2132-04-19 10:50:36,2132-04-28 02:03:31,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,2137-03-24 04:00:18,NaT,1
4,4,2115-01-12 14:32:56,2115-01-31 00:38:27,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,,CATHOLIC,WIDOWED,WHITE,2135-12-15 14:03:03,NaT,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,2145-06-09 19:44:36,2145-06-06 01:19:28,,URGENT,EMERGENCY ROOM ADMIT,HOME,Medicare,,UNOBTAINABLE,SINGLE,WHITE,2154-02-14 14:49:30,2154-02-20 18:11:34,1
96,96,2105-10-08 17:58:50,2105-10-19 13:05:19,,EMERGENCY,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,SINGLE,WHITE,NaT,NaT,1
97,97,2154-03-24 10:47:46,2154-04-10 09:33:41,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Medicare,,CATHOLIC,,WHITE,NaT,NaT,1
98,98,2152-08-10 03:08:51,2152-09-05 13:47:25,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,ENGL,CATHOLIC,MARRIED,WHITE,2183-03-27 08:56:18,2183-03-28 06:13:02,1
