In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# ! pip install pgmpy

In [3]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np

import common

In [5]:
# Base directory (relative to the notebook) that holds the input CSV files;
# the model pickle is later written to a sub-folder of this path.
date_set_path = "../temp_sets_100/"

### Read samples

In [6]:
# Load the patients table from date_set_path.
# NOTE(review): presumably read_csv_no_rowid drops the row-id column on load — confirm in the common module.
patients_df = common.read_csv_no_rowid(date_set_path+"patients.csv")

## Preprocess patients data

### Drop useless columns

In [7]:
# Drop columns that are not needed for modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises a KeyError.
patients_df = patients_df.drop(columns=['expire_flag'], errors='ignore')

### Deal with null values

In [8]:
# Check null value in table
common.nan_count(patients_df)

Total columns: 6
Total rows: 100
--------------
subject_id     0
gender         0
dob            0
dod           59
dod_hosp      78
dod_ssn       63
dtype: int64


In [9]:
# Sentinel timestamp (the Unix epoch, 1970-01-01) used in place of missing date values
nan_datetime = pd.Timestamp(0)

In [10]:
# Replace missing birth dates with the sentinel timestamp.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the `patients_df['dob']` selection is a chained in-place update, which pandas
# warns about and which does not modify the frame under Copy-on-Write.
patients_df['dob'] = patients_df['dob'].fillna(value=nan_datetime)
# The death-date columns are intentionally left unfilled (kept for reference):
# patients_df['dod_hosp'].fillna(value=nan_datetime, inplace=True)
# patients_df['dod_ssn'].fillna(value=nan_datetime, inplace=True)

In [11]:
common.nan_count(patients_df)

Total columns: 6
Total rows: 100
--------------
subject_id     0
gender         0
dob            0
dod           59
dod_hosp      78
dod_ssn       63
dtype: int64


### Set the column types

In [12]:
patients_df.dtypes

subject_id     int64
gender        object
dob           object
dod           object
dod_hosp      object
dod_ssn       object
dtype: object

In [13]:
# Convert every date-like column from object (string) dtype to datetime64
for date_col in ('dob', 'dod', 'dod_hosp', 'dod_ssn'):
    patients_df[date_col] = pd.to_datetime(patients_df[date_col])

In [14]:
patients_df.dtypes

subject_id             int64
gender                object
dob           datetime64[ns]
dod           datetime64[ns]
dod_hosp      datetime64[ns]
dod_ssn       datetime64[ns]
dtype: object

### Process dod_hosp and dod_ssn

In [15]:
from pandas import NaT

# Define a method to deal with death time
def dod_process(df):
    '''
    Decide the `dod_ssn` value to keep for one patient row.

    DOD merges DOD_HOSP and DOD_SSN and gives priority to DOD_HOSP when both
    were recorded, so in that case the SSN date is blanked out.

    Parameters
    ----------
    df : mapping-like row (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide the keys 'dod_hosp' and 'dod_ssn'.

    Returns
    -------
    NaT when both 'dod_hosp' and 'dod_ssn' are present; otherwise the row's
    own 'dod_ssn' value (which may itself be missing).
    '''
    hosp_missing = pd.isna(df['dod_hosp'])
    ssn_missing = pd.isna(df['dod_ssn'])

    # At most one of the two dates is recorded: keep dod_ssn untouched.
    if hosp_missing or ssn_missing:
        return df['dod_ssn']

    # Both dates recorded: the hospital date wins, so drop the SSN date.
    return NaT

In [16]:
# Apply dod_process row by row (axis=1) and overwrite dod_ssn with the result,
# so dod_ssn is cleared on rows where dod_hosp is also recorded.
patients_df['dod_ssn'] = patients_df.apply(dod_process, axis=1)

In [17]:
patients_df

Unnamed: 0,subject_id,gender,dob,dod,dod_hosp,dod_ssn
0,569,M,2021-11-04,2107-11-30,NaT,2107-11-30
1,26282,F,2074-03-20,NaT,NaT,NaT
2,1762,F,2120-03-19,NaT,NaT,NaT
3,14481,M,1816-05-08,2121-02-14,NaT,2121-02-14
4,21470,M,2195-05-20,NaT,NaT,NaT
...,...,...,...,...,...,...
95,23647,M,2077-02-13,NaT,NaT,NaT
96,26485,F,2075-08-15,2164-05-08,2164-05-08,NaT
97,26884,F,2067-01-31,2153-06-23,NaT,2153-06-23
98,49024,M,2055-02-17,2137-01-12,NaT,2137-01-12


In [18]:
# from pandas import NaT

# patients_df['dod_hosp'] = patients_df['dod_hosp'].apply(lambda x: 1 if not pd.isna(x) else 0)
# patients_df['dod_ssn'] = patients_df['dod_ssn'].apply(lambda x: 1 if not pd.isna(x) else 0)
# patients_df['dod_live'] = patients_df['dod_live'].apply(lambda x: 1 if not pd.isna(x) else 0)

---

##  Build Network

In [19]:
from sdv.tabular import CTGAN
from sdv.constraints import FixedCombinations

In [20]:
patients_df.columns

Index(['subject_id', 'gender', 'dob', 'dod', 'dod_hosp', 'dod_ssn'], dtype='object')

### Set constraint

In [21]:
# Fixed constraints
# fixed_subject_hadm_icustay_constraint = FixedCombinations(
#     column_names=['subject_id', 'hadm_id']
# )

In [22]:
# dod_hosp_dod_ssn_constraint = OneHotEncoding(
#     column_names=['dod_hosp', 'dod_ssn', 'dod_live']
# )

In [23]:
# patients_constraints = [dod_hosp_dod_ssn_constraint]

In [24]:
# Custom constraints
def dod_data_unique_is_valid(column_names, data):
    """
    Flag rows whose death-date columns are mutually consistent.

    A row is valid when exactly one of the two columns named in
    ``column_names`` is filled and 'dod' is filled too, or when 'dod' and both
    of those columns are all missing.

    Parameters
    ----------
    column_names : sequence of two column names (here dod_hosp and dod_ssn).
    data : table providing 'dod' and the two named columns.

    Returns
    -------
    Boolean mask with one entry per row of ``data``.
    """
    dod_present = pd.notna(data['dod'])
    dod_missing = pd.isna(data['dod'])

    first_present = pd.notna(data[column_names[0]])
    first_missing = pd.isna(data[column_names[0]])
    second_present = pd.notna(data[column_names[1]])
    second_missing = pd.isna(data[column_names[1]])

    # dod set, only the first column set
    only_first = dod_present & first_present & second_missing
    # dod set, only the second column set
    only_second = dod_present & first_missing & second_present
    # nothing recorded at all
    nothing_set = dod_missing & first_missing & second_missing

    return only_first | only_second | nothing_set

def dod_data_unique_trasform(column_names, data):
    """
    Forward transform of the custom constraint: a no-op.

    The data is handed back unchanged; ``column_names`` is accepted only to
    match the signature used by the other constraint callbacks.
    """
    return data

def dod_data_unique_reverse_transform(column_names, transformed_data):
    """
    Re-align the two death-date columns with 'dod' after sampling.

    For rows where 'dod' and exactly one of the two named columns are filled,
    that column is overwritten with the row's 'dod' value. On every other row
    both named columns are set to missing.

    The masks are applied positionally with ``Series.where``. The previous
    implementation assigned a filtered sub-series and relied on index-label
    alignment, which fills the wrong rows (or raises) when the frame's index
    contains duplicate labels.

    Parameters
    ----------
    column_names : sequence of two column names (here dod_hosp and dod_ssn).
    transformed_data : DataFrame providing 'dod' and the two named columns.
        It is modified in place.

    Returns
    -------
    The same ``transformed_data`` object, with the two columns rewritten.
    """
    first_col, second_col = column_names[0], column_names[1]

    dod = transformed_data['dod']
    dod_present = dod.notna()
    first_present = transformed_data[first_col].notna()
    second_present = transformed_data[second_col].notna()

    # Both masks are computed before any column is overwritten, so the second
    # mask still sees the original values of the first column.
    only_first = dod_present & first_present & ~second_present
    only_second = dod_present & ~first_present & second_present

    transformed_data[first_col] = dod.where(only_first)
    transformed_data[second_col] = dod.where(only_second)

    return transformed_data

In [25]:
# Build custom constraints
from sdv.constraints import create_custom_constraint

# Bundle the three callbacks defined above (validity check, forward transform,
# reverse transform) into a custom SDV constraint class.
DodUniqueProcess = create_custom_constraint(
    is_valid_fn=dod_data_unique_is_valid,
    transform_fn=dod_data_unique_trasform,
    reverse_transform_fn=dod_data_unique_reverse_transform
)

In [26]:
# Instantiate the constraint for the two death-date columns; the callbacks
# receive this list as `column_names` (index 0 = dod_hosp, index 1 = dod_ssn).
dod_data_unique_constraint = DodUniqueProcess(
    column_names=['dod_hosp', 'dod_ssn']
)

In [27]:
# List of constraints handed to the CTGAN model below
# (the variable name is kept as-is because later cells reference it).
constrains = [dod_data_unique_constraint]

### Build and train model

In [28]:
# Configure the CTGAN synthesizer with the custom death-date constraint.
# NOTE(review): batch_size=10000 is far larger than the 100-row training table
# (see len(patients_df) below) — confirm this is intended.
# NOTE(review): cuda=True requests GPU training — confirm the behaviour on
# machines without a GPU.
model = CTGAN(
    constraints=constrains, 
    batch_size=10000,
    cuda=True, 
    verbose=True, 
    epochs=60)

In [29]:
len(patients_df)

100

In [30]:
# Train on the full preprocessed patients table. train_data is just another
# name for patients_df (no copy is made); it is reused for sampling and
# evaluation further down.
train_data = patients_df
model.fit(train_data)

Epoch 1, Loss G:  0.7192,Loss D:  0.0055
Epoch 2, Loss G:  0.6978,Loss D: -0.0052
Epoch 3, Loss G:  0.6873,Loss D: -0.0120
Epoch 4, Loss G:  0.6749,Loss D: -0.0226
Epoch 5, Loss G:  0.6467,Loss D: -0.0295
Epoch 6, Loss G:  0.6278,Loss D: -0.0409
Epoch 7, Loss G:  0.6059,Loss D: -0.0311
Epoch 8, Loss G:  0.5909,Loss D: -0.0440
Epoch 9, Loss G:  0.5697,Loss D: -0.0551
Epoch 10, Loss G:  0.5446,Loss D: -0.0516
Epoch 11, Loss G:  0.5249,Loss D: -0.0372
Epoch 12, Loss G:  0.4812,Loss D: -0.0514
Epoch 13, Loss G:  0.4619,Loss D: -0.0400
Epoch 14, Loss G:  0.4076,Loss D: -0.0418
Epoch 15, Loss G:  0.3704,Loss D: -0.0199
Epoch 16, Loss G:  0.3359,Loss D: -0.0241
Epoch 17, Loss G:  0.3017,Loss D: -0.0094
Epoch 18, Loss G:  0.2630,Loss D: -0.0239
Epoch 19, Loss G:  0.2329,Loss D: -0.0115
Epoch 20, Loss G:  0.1899,Loss D:  0.0059
Epoch 21, Loss G:  0.1501,Loss D: -0.0047
Epoch 22, Loss G:  0.1591,Loss D:  0.0194
Epoch 23, Loss G:  0.1183,Loss D:  0.0047
Epoch 24, Loss G:  0.0762,Loss D:  0.0128
E

In [31]:
import cloudpickle

# Persist the fitted model with cloudpickle.
# The target directory is created first so that a missing "100_models/" folder
# does not raise FileNotFoundError after training has already finished.
import os

model_dir = date_set_path + "100_models/"
os.makedirs(model_dir, exist_ok=True)

with open(model_dir + "patients_model.pkl", 'wb') as f:
    cloudpickle.dump(model, f)

In [32]:
# model.save(date_set_path + "100_models/" + "patients_model.pkl")

### Generate synthetic data

In [33]:
# Draw as many synthetic rows as there are rows in the training table
sample = model.sample(num_rows=len(train_data))

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 266.66it/s]


In [34]:
sample.head()

Unnamed: 0,subject_id,gender,dob,dod,dod_hosp,dod_ssn
0,23822,F,2129-09-04,NaT,NaT,NaT
1,35000,F,1953-01-14,NaT,NaT,NaT
2,3631,M,2150-05-29,NaT,NaT,NaT
3,99503,F,2068-07-01,NaT,NaT,NaT
4,99503,F,2160-02-04,NaT,NaT,NaT


---

## Evaluate data

In [35]:
from sdv.evaluation import evaluate

In [36]:
# Compare synthetic vs. training data with the ContinuousKLDivergence metric.
# The recorded result is nan.
# NOTE(review): presumably the table lacks enough continuous columns for this
# metric — confirm against the SDMetrics documentation.
evaluate(sample, train_data, metrics=['ContinuousKLDivergence'])

nan

In [37]:
# Compare synthetic vs. training data with the DiscreteKLDivergence metric.
# The recorded result is nan.
# NOTE(review): presumably the table lacks enough discrete columns for this
# metric — confirm against the SDMetrics documentation.
evaluate(sample, train_data, metrics=['DiscreteKLDivergence'])

nan

---

### Evaluate time-series data (not accurate for this table)

In [38]:
from sdv.metrics.timeseries import LSTMDetection, TSFCDetection

In [39]:
# Export the metadata of the fitted model as a plain dict so it can be edited
metadata_2 = model.get_metadata().to_dict()

In [40]:
# Declare subject_id as the entity column in the metadata passed to the
# time-series metric below.
metadata_2['entity_columns']=['subject_id']

In [41]:
metadata_2

{'fields': {'subject_id': {'type': 'numerical',
   'subtype': 'integer',
   'transformer': 'integer'},
  'gender': {'type': 'categorical', 'transformer': None},
  'dob': {'type': 'datetime', 'transformer': 'datetime'},
  'dod': {'type': 'datetime', 'transformer': 'datetime'},
  'dod_hosp': {'type': 'datetime', 'transformer': 'datetime'},
  'dod_ssn': {'type': 'datetime', 'transformer': 'datetime'}},
 'constraints': [{'constraint': 'sdv.constraints.tabular.CustomConstraint',
   'column_names': ['dod_hosp', 'dod_ssn']}],
 'model_kwargs': {},
 'name': None,
 'primary_key': None,
 'sequence_index': None,
 'entity_columns': ['subject_id'],
 'context_columns': []}

In [42]:
# Score how well an LSTM classifier can tell real from synthetic rows.
# SDMetrics expects the arguments in the order (real_data, synthetic_data,
# metadata); the real training table therefore goes first. The previous call
# passed the synthetic sample first.
LSTMDetection.compute(train_data, sample, metadata_2)

0.4565217391304348