In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# ! pip install pgmpy

In [4]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [5]:
import pandas as pd
import numpy as np

import common

In [6]:
date_set_path = "../temp_sets_100/"

## Patients

### Read samples

In [7]:
patients_df = common.read_csv_no_rowid(date_set_path+"patients.csv")

### Data preprocessing

In [8]:
# Drop columns that are not used in the rest of the notebook.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it again after 'expire_flag' has already
# been removed no longer raises KeyError.
patients_df = patients_df.drop(columns=['expire_flag'], errors='ignore')

#### Deal with null values

In [9]:
# Check null value in table
common.nan_count(patients_df)

Total columns: 6
Total rows: 100
--------------
subject_id     0
gender         0
dob            0
dod           59
dod_hosp      78
dod_ssn       63
dtype: int64


In [10]:
# Set a value replacing the null time value
nan_datetime=pd.to_datetime(0)

In [11]:
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# patients_df['dob'] is chained assignment: under pandas Copy-on-Write it only
# updates a temporary Series and leaves patients_df unchanged (and the warning
# about it is silenced by the filterwarnings call at the top of the notebook).
patients_df['dob'] = patients_df['dob'].fillna(value=nan_datetime)
# patients_df['dod_hosp'] = patients_df['dod_hosp'].fillna(value=nan_datetime)
# patients_df['dod_ssn'] = patients_df['dod_ssn'].fillna(value=nan_datetime)

In [12]:
common.nan_count(patients_df)

Total columns: 6
Total rows: 100
--------------
subject_id     0
gender         0
dob            0
dod           59
dod_hosp      78
dod_ssn       63
dtype: int64


#### Set the column types

In [13]:
patients_df.dtypes

subject_id     int64
gender        object
dob           object
dod           object
dod_hosp      object
dod_ssn       object
dtype: object

In [14]:
# Convert the birth/death date columns to datetime values, one column at a time.
for date_column in ('dob', 'dod', 'dod_hosp', 'dod_ssn'):
    patients_df[date_column] = pd.to_datetime(patients_df[date_column])

In [15]:
patients_df.dtypes

subject_id             int64
gender                object
dob           datetime64[ns]
dod           datetime64[ns]
dod_hosp      datetime64[ns]
dod_ssn       datetime64[ns]
dtype: object

### Process dod_hosp and dod_ssn

In [16]:
from pandas import NaT

# Define a method to deal with death time
def dod_process(df):
    '''
    Pick the value to keep in ``dod_ssn`` for a single patient row.

    DOD merges DOD_HOSP and DOD_SSN and gives priority to DOD_HOSP when both
    were recorded, so in that case the SSN date is blanked out.

    :param df: one row, indexable by the 'dod_hosp' and 'dod_ssn' keys
    :return: NaT when both dates are present, otherwise the row's 'dod_ssn'
    '''
    hosp_missing = pd.isna(df['dod_hosp'])
    if hosp_missing or pd.isna(df['dod_ssn']):
        # At most one source is recorded: leave dod_ssn exactly as it is.
        return df['dod_ssn']
    # Both sources are recorded: the hospital date wins, drop the SSN date.
    return NaT


# Define a method to deal with death time
def dod_live(df):
    '''
    Flag a patient row that has no death date from either source.

    :param df: one row, indexable by the 'dod_hosp' and 'dod_ssn' keys
    :return: the fixed marker timestamp ``pd.to_datetime(0)`` when both
             'dod_hosp' and 'dod_ssn' are missing, otherwise NaT
    '''
    no_death_recorded = pd.isna(df['dod_hosp']) and pd.isna(df['dod_ssn'])
    return pd.to_datetime(0) if no_death_recorded else NaT

In [17]:
# Recompute dod_ssn row by row with dod_process and store it back on the frame.
patients_df = patients_df.assign(dod_ssn=patients_df.apply(dod_process, axis=1))
# patients_df['dod_live'] = patients_df.apply(dod_live, axis=1)

In [18]:
patients_df

Unnamed: 0,subject_id,gender,dob,dod,dod_hosp,dod_ssn
0,569,M,2021-11-04,2107-11-30,NaT,2107-11-30
1,26282,F,2074-03-20,NaT,NaT,NaT
2,1762,F,2120-03-19,NaT,NaT,NaT
3,14481,M,1816-05-08,2121-02-14,NaT,2121-02-14
4,21470,M,2195-05-20,NaT,NaT,NaT
...,...,...,...,...,...,...
95,23647,M,2077-02-13,NaT,NaT,NaT
96,26485,F,2075-08-15,2164-05-08,2164-05-08,NaT
97,26884,F,2067-01-31,2153-06-23,NaT,2153-06-23
98,49024,M,2055-02-17,2137-01-12,NaT,2137-01-12


In [19]:
# from pandas import NaT

# patients_df['dod_hosp'] = patients_df['dod_hosp'].apply(lambda x: 1 if not pd.isna(x) else 0)
# patients_df['dod_ssn'] = patients_df['dod_ssn'].apply(lambda x: 1 if not pd.isna(x) else 0)
# patients_df['dod_live'] = patients_df['dod_live'].apply(lambda x: 1 if not pd.isna(x) else 0)

---

##  Build Network

In [20]:
from sdv.tabular import CTGAN
from sdv.constraints import FixedCombinations

In [21]:
patients_df.columns

Index(['subject_id', 'gender', 'dob', 'dod', 'dod_hosp', 'dod_ssn'], dtype='object')

### Set constraint

In [22]:
# Fixed constraints
# fixed_subject_hadm_icustay_constraint = FixedCombinations(
#     column_names=['subject_id', 'hadm_id']
# )

In [23]:
# dod_hosp_dod_ssn_constraint = OneHotEncoding(
#     column_names=['dod_hosp', 'dod_ssn', 'dod_live']
# )

In [24]:
# patients_constraints = [dod_hosp_dod_ssn_constraint]

In [25]:
def dod_data_unique_is_valid(column_names, data):
    """
    Row-wise validity mask for the death-date columns.

    A row is valid when exactly one of the two columns named in
    ``column_names`` is filled and 'dod' is filled too, or when 'dod' and
    both named columns are all missing.

    :param column_names: two column names (first and second death-date source)
    :param data: frame holding 'dod' and the two named columns
    :return: boolean Series, True where the row is valid
    """
    dod_missing = pd.isna(data['dod'])
    first_missing = pd.isna(data[column_names[0]])
    second_missing = pd.isna(data[column_names[1]])

    only_first = ~dod_missing & ~first_missing & second_missing
    only_second = ~dod_missing & first_missing & ~second_missing
    nothing_recorded = dod_missing & first_missing & second_missing

    return only_first | only_second | nothing_recorded

In [26]:
def dod_data_unique_trasform(column_names, data):
    """
    Pass-through transform: print the death-date columns and return ``data`` as is.

    :param column_names: constraint column names (not used here)
    :param data: frame containing 'dod', 'dod_hosp' and 'dod_ssn'
    :return: the same ``data`` object, unmodified
    """
    death_columns = data.loc[:, ['dod', 'dod_hosp', 'dod_ssn']]
    print(death_columns)
    return data

In [27]:
def dod_data_unique_reverse_transform(column_names, transformed_data):
    """
    Re-derive the two death-date source columns from 'dod'.

    For rows where 'dod' is set and exactly one of the two named columns is
    set, that column is overwritten with the row's 'dod' value. In every other
    row both named columns end up missing. 'dod' itself is not changed.

    The columns are written with ``Series.where`` on a boolean mask, which
    works position by position. The previous ``frame[mask]['dod']`` form relied
    on index alignment during assignment, so a frame with repeated index
    labels could get values copied onto the wrong rows (or raise).

    :param column_names: two column names (first and second death-date source)
    :param transformed_data: frame holding 'dod' and the two named columns;
                             it is modified in place
    :return: the same ``transformed_data`` object
    """
    # Debug trace of the incoming death-date columns.
    print(transformed_data.loc[:, ['dod', 'dod_hosp', 'dod_ssn']])

    first_col, second_col = column_names[0], column_names[1]
    dod = transformed_data['dod']
    dod_present = dod.notna()
    first_present = transformed_data[first_col].notna()
    second_present = transformed_data[second_col].notna()

    # Both masks are computed before either column is overwritten.
    only_first = dod_present & first_present & ~second_present
    only_second = dod_present & ~first_present & second_present

    transformed_data[first_col] = dod.where(only_first)
    transformed_data[second_col] = dod.where(only_second)

    return transformed_data

In [28]:
from sdv.constraints import create_custom_constraint

# Bundle the three callbacks defined in the cells above (row validity check,
# forward transform, reverse transform) into a custom constraint that is
# instantiated with the target column names in the next cell.
DodUniqueProcess = create_custom_constraint(
    is_valid_fn=dod_data_unique_is_valid,
    transform_fn=dod_data_unique_trasform,
    reverse_transform_fn=dod_data_unique_reverse_transform
)

In [29]:
dod_data_unique_constraint = DodUniqueProcess(
    column_names=['dod_hosp', 'dod_ssn']
)

In [30]:
constrains = [dod_data_unique_constraint]

### Build model

In [31]:
# Configure the CTGAN synthesizer with the custom death-date constraint list.
# NOTE(review): batch_size=10000 is far larger than the 100-row training frame
# shown by len(patients_df) below — confirm this is intended.
# NOTE(review): cuda=True requests GPU training — confirm the behaviour on
# CPU-only hosts.
model = CTGAN(
    constraints=constrains, 
    batch_size=10000,
    cuda=True, 
    verbose=True, 
    epochs=50)

In [32]:
len(patients_df)

100

In [33]:
train_data = patients_df
model.fit(train_data)

          dod   dod_hosp    dod_ssn
0  2107-11-30        NaT 2107-11-30
1         NaT        NaT        NaT
2         NaT        NaT        NaT
3  2121-02-14        NaT 2121-02-14
4         NaT        NaT        NaT
..        ...        ...        ...
95        NaT        NaT        NaT
96 2164-05-08 2164-05-08        NaT
97 2153-06-23        NaT 2153-06-23
98 2137-01-12        NaT 2137-01-12
99        NaT        NaT        NaT

[100 rows x 3 columns]
          dod   dod_hosp    dod_ssn
0  2107-11-30        NaT 2107-11-30
1         NaT        NaT        NaT
2         NaT        NaT        NaT
3  2121-02-14        NaT 2121-02-14
4         NaT        NaT        NaT
..        ...        ...        ...
95        NaT        NaT        NaT
96 2164-05-08 2164-05-08        NaT
97 2153-06-23        NaT 2153-06-23
98 2137-01-12        NaT 2137-01-12
99        NaT        NaT        NaT

[100 rows x 3 columns]
          dod   dod_hosp    dod_ssn
0  2107-11-30        NaT 2107-11-30
1         NaT   

In [34]:
# model.save(date_set_path + "100_models/" + "patients_model.pkl")

In [127]:
sample = model.sample(num_rows=len(train_data))

Sampling rows: 100%|██████████| 100/100 [00:00<00:00, 272.88it/s]

          dod   dod_hosp    dod_ssn
0  2142-05-23        NaT        NaT
1  2191-08-29 2139-06-01        NaT
2         NaT        NaT        NaT
3         NaT 2180-07-27        NaT
4  2203-06-27        NaT        NaT
..        ...        ...        ...
95 2124-09-19        NaT        NaT
96 2143-03-13 2141-09-30        NaT
97 2152-09-20 2138-02-04 2150-06-09
98 2131-05-18        NaT        NaT
99 2208-01-12 2142-10-06        NaT

[100 rows x 3 columns]
          dod   dod_hosp    dod_ssn
0  2142-07-12        NaT 2148-08-25
1         NaT        NaT        NaT
2  2151-08-31        NaT        NaT
3  2184-09-25 2142-04-27        NaT
4  2194-01-19        NaT 2108-06-23
5         NaT 2142-07-08        NaT
6  2149-05-15 2139-10-22        NaT
7         NaT        NaT        NaT
8         NaT        NaT        NaT
9         NaT 2143-01-03 2151-03-22
10        NaT 2142-08-20        NaT
11        NaT        NaT 2085-04-01
12        NaT 2140-08-21        NaT
13        NaT        NaT        NaT
14  




In [36]:
sample

Unnamed: 0,subject_id,gender,dob,dod,dod_hosp,dod_ssn
0,42447,F,2125-08-22,NaT,NaT,NaT
1,26450,F,2042-01-01,2151-09-09,2151-09-09,NaT
2,569,M,2080-12-26,NaT,NaT,NaT
3,29558,F,1962-05-15,2147-10-21,NaT,2147-10-21
4,5138,F,2003-03-27,NaT,NaT,NaT
...,...,...,...,...,...,...
95,21505,F,2153-05-26,NaT,NaT,NaT
96,22512,M,1965-12-24,2163-09-26,2163-09-26,NaT
97,28114,M,2015-08-31,NaT,NaT,NaT
98,29341,F,2073-05-25,NaT,NaT,NaT


In [None]:
# NOTE(review): the sample.to_csv cell below writes to this same "test.csv"
# path, so whichever of the two cells runs last overwrites the other's file —
# confirm that is intended.
train_data.to_csv("test.csv")

In [45]:
sample.to_csv("test.csv")

### Process the generated data

---

## Evaluate data

In [109]:
from sdv.metrics.timeseries import LSTMDetection, TSFCDetection

In [105]:
metadata = {'fields': {'subject_id': {'type': 'numerical', 'subtype': 'integer'},
  'dod': {'type': 'datetime'}},
 'entity_columns': ['subject_id'],
 'sequence_index': ''}

In [106]:
metadata

{'fields': {'subject_id': {'type': 'numerical', 'subtype': 'integer'},
  'dod': {'type': 'datetime'}},
 'entity_columns': ['subject_id'],
 'sequence_index': ''}

In [108]:
# Load two columns from real.csv and relabel them to the field names used in
# `metadata` ('subject_id' as the entity column, 'dod' as the datetime field).
# NOTE(review): the source columns are 'store_id' and 'date', which do not look
# like patient data — confirm this file is the intended evaluation input.
df = pd.read_csv('real.csv').loc[:, ['store_id', 'date']]
df.columns=['subject_id', 'dod']

In [110]:
LSTMDetection.compute(df, df, metadata)

0.6739130434782609

In [113]:
t_sample = sample.loc[:, ['subject_id', 'dod']]

In [114]:
LSTMDetection.compute(t_sample, t_sample, metadata)

0.6571428571428571

---

In [118]:
metadata_2 = model.get_metadata().to_dict()

In [121]:
metadata_2['entity_columns']=['subject_id']

In [122]:
metadata_2

{'fields': {'subject_id': {'type': 'numerical',
   'subtype': 'integer',
   'transformer': 'integer'},
  'gender': {'type': 'categorical', 'transformer': None},
  'dob': {'type': 'datetime', 'transformer': 'datetime'},
  'dod': {'type': 'datetime', 'transformer': 'datetime'},
  'dod_hosp': {'type': 'datetime', 'transformer': 'datetime'},
  'dod_ssn': {'type': 'datetime', 'transformer': 'datetime'}},
 'constraints': [{'constraint': 'sdv.constraints.tabular.CustomConstraint',
   'column_names': ['dod_hosp', 'dod_ssn']}],
 'model_kwargs': {},
 'name': None,
 'primary_key': None,
 'sequence_index': None,
 'entity_columns': ['subject_id'],
 'context_columns': []}

In [125]:
train_data

Unnamed: 0,subject_id,gender,dob,dod,dod_hosp,dod_ssn
0,569,M,2021-11-04,2107-11-30,NaT,2107-11-30
1,26282,F,2074-03-20,NaT,NaT,NaT
2,1762,F,2120-03-19,NaT,NaT,NaT
3,14481,M,1816-05-08,2121-02-14,NaT,2121-02-14
4,21470,M,2195-05-20,NaT,NaT,NaT
...,...,...,...,...,...,...
95,23647,M,2077-02-13,NaT,NaT,NaT
96,26485,F,2075-08-15,2164-05-08,2164-05-08,NaT
97,26884,F,2067-01-31,2153-06-23,NaT,2153-06-23
98,49024,M,2055-02-17,2137-01-12,NaT,2137-01-12


In [132]:
# LSTMDetection.compute takes (real_data, synthetic_data, metadata): the
# training frame is the real data and the generated sample is the synthetic
# data, so they are passed in that order.
LSTMDetection.compute(train_data, sample, metadata_2)

0.26190476190476186