In [14]:
# from google.colab import drive
# drive.mount('/content/drive')

In [15]:
# ! pip install sdv

In [16]:
# Suppress every Python warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings
# raised by any library used later in the notebook are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [17]:
import pandas as pd
import numpy as np

In [18]:
# read csv file as dataframe, and drop ROW_ID column
def read_csv_no_rowid(file_path):
    """Load a CSV file into a DataFrame without its row-identifier columns.

    The columns ``"Unnamed: 0"`` and ``"row_id"`` are removed when they are
    present in the file. A file that lacks one or both of them is loaded
    unchanged instead of raising ``KeyError``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file's contents minus the row-identifier columns.
    """
    df = pd.read_csv(file_path)
    # errors="ignore" makes the drop tolerant of files that were exported
    # without an index column or without a row_id column.
    return df.drop(columns=["Unnamed: 0", "row_id"], errors="ignore")

In [19]:
# check NaN value

def nan_count(df):
    """Print a short missing-value report for ``df``.

    The report lists the number of columns, the number of rows, a separator
    line, and then the count of null entries in each column. Nothing is
    returned.
    """
    print(
        f"Total columns: {len(df.columns)}",
        f"Total rows: {len(df)}",
        "--------------",
        sep="\n",
    )
    null_totals = df.isna().sum()
    print(null_totals)

In [20]:
# Directory that holds the input CSV files, relative to the notebook's working
# directory. The trailing slash is required: file names are appended to this
# value with plain string concatenation.
# NOTE(review): the name reads "date_set" but presumably means "data set".
date_set_path = "../temp_sets/"

In [21]:
# Load the procedures_icd table; read_csv_no_rowid strips the "Unnamed: 0" and
# row_id columns after reading.
procedures_icd_sample_df = read_csv_no_rowid(date_set_path + "procedures_icd.csv")

In [22]:
procedures_icd_sample_df

Unnamed: 0,subject_id,hadm_id,seq_num,icd9_code
0,11290,137961,1,9915
1,11290,137961,2,966
2,11290,137961,3,9607
3,11290,137961,4,9671
4,11290,137961,5,3892
...,...,...,...,...
5464,7059,171973,3,9671
5465,7059,171973,4,9604
5466,7059,171973,5,3893
5467,7059,171973,6,3893


In [23]:
# Remove rows (axis=0) whose hadm_id is missing. This mutates the DataFrame in
# place, so any earlier display of the frame may no longer match its contents.
procedures_icd_sample_df.dropna(subset=['hadm_id'], inplace=True, axis=0)

In [24]:
nan_count(procedures_icd_sample_df)

Total columns: 4
Total rows: 5469
--------------
subject_id    0
hadm_id       0
seq_num       0
icd9_code     0
dtype: int64


In [25]:
nan_count(procedures_icd_sample_df)

Total columns: 4
Total rows: 5469
--------------
subject_id    0
hadm_id       0
seq_num       0
icd9_code     0
dtype: int64


In [26]:
# Set columns' type
# procedures_icd_sample_df['subject_id'] = procedures_icd_sample_df['subject_id'].astype(str)
# procedures_icd_sample_df['hadm_id'] = procedures_icd_sample_df['hadm_id'].astype(str)

# procedures_icd_sample_df['icd9_code'] = procedures_icd_sample_df['icd9_code'].astype(str)

In [27]:
procedures_icd_sample_df.dtypes

subject_id    int64
hadm_id       int64
seq_num       int64
icd9_code     int64
dtype: object

In [28]:
len(procedures_icd_sample_df)

5469

---

## Build model

In [29]:
from sdv.tabular import CTGAN
from sdv.evaluation import evaluate
from sdv.constraints import FixedCombinations

In [30]:
procedures_icd_sample_df.columns

Index(['subject_id', 'hadm_id', 'seq_num', 'icd9_code'], dtype='object')

In [31]:
# Constraint over the (subject_id, hadm_id) column pair.
# presumably FixedCombinations restricts sampled rows to pairs that occur in
# the training data — confirm against the SDV constraints documentation.
# NOTE(review): the variable name mentions "icustay", but only subject_id and
# hadm_id are listed here.
fixed_subject_hadm_icustay_constraint = FixedCombinations(
    column_names=['subject_id', 'hadm_id']
)

In [32]:
procedures_icd_constraints = [fixed_subject_hadm_icustay_constraint]

In [33]:
# Configure the CTGAN synthesizer with the constraint list defined above.
# NOTE(review): no random seed is set in the visible cells, so training and
# sampling results are presumably not reproducible between runs — confirm.
# NOTE(review): cuda=True requests GPU use; behaviour on a machine without a
# GPU is not visible from this notebook — verify before running elsewhere.
model = CTGAN(
    constraints=procedures_icd_constraints, 
    batch_size=1000,
    cuda=True, 
    # verbose=True is presumably what produces the per-epoch loss log below.
    verbose=True, 
    epochs=50)

In [34]:
len(procedures_icd_sample_df)

5469

In [35]:
# train_data is an alias of procedures_icd_sample_df (no copy is made), so any
# later in-place change to either name affects both.
train_data = procedures_icd_sample_df
# Train the synthesizer on the full table; the loss log is printed per epoch.
model.fit(train_data)

Epoch 1, Loss G:  7.0929,Loss D: -0.0153
Epoch 2, Loss G:  7.0626,Loss D: -0.0283
Epoch 3, Loss G:  7.0106,Loss D: -0.0457
Epoch 4, Loss G:  6.9718,Loss D: -0.0194
Epoch 5, Loss G:  6.9802,Loss D: -0.0214
Epoch 6, Loss G:  6.9000,Loss D:  0.0089
Epoch 7, Loss G:  6.8877,Loss D:  0.0255
Epoch 8, Loss G:  6.8621,Loss D:  0.0182
Epoch 9, Loss G:  6.8533,Loss D:  0.0038
Epoch 10, Loss G:  6.8741,Loss D: -0.0247
Epoch 11, Loss G:  6.8514,Loss D: -0.0192
Epoch 12, Loss G:  6.8556,Loss D: -0.0109
Epoch 13, Loss G:  6.8249,Loss D:  0.0141
Epoch 14, Loss G:  6.7791,Loss D:  0.0399
Epoch 15, Loss G:  6.7453,Loss D:  0.0336
Epoch 16, Loss G:  6.7035,Loss D:  0.0292
Epoch 17, Loss G:  6.7190,Loss D:  0.0137
Epoch 18, Loss G:  6.7290,Loss D: -0.0145
Epoch 19, Loss G:  6.7037,Loss D: -0.0243
Epoch 20, Loss G:  6.6793,Loss D: -0.0222
Epoch 21, Loss G:  6.6529,Loss D:  0.0174
Epoch 22, Loss G:  6.5636,Loss D:  0.0482
Epoch 23, Loss G:  6.5934,Loss D:  0.0593
Epoch 24, Loss G:  6.5824,Loss D:  0.0075
E

In [36]:
# Persist the fitted model one directory above the notebook's working directory.
# NOTE(review): the ".pkl" suffix suggests a pickle file — only load such files
# from trusted sources.
model.save("../procedures_icd_model.pkl")

In [37]:
# model = CTGAN.load(("/content/drive/MyDrive/MSc Project/procedures_icd_model.pkl"))

In [38]:
# Draw 10,000 synthetic rows from the fitted model (the training table has
# 5,469 rows according to the len() output above).
sample = model.sample(num_rows=10000)

Sampling rows: 100%|██████████| 10000/10000 [00:00<00:00, 25170.44it/s]


In [39]:
nan_count(sample)

Total columns: 4
Total rows: 10000
--------------
subject_id    0
hadm_id       0
seq_num       0
icd9_code     0
dtype: int64


In [40]:
sample.dtypes

subject_id    int64
hadm_id       int64
seq_num       int64
icd9_code     int64
dtype: object

In [41]:
# Score the synthetic sample against the training data using only the
# ContinuousKLDivergence metric.
# NOTE(review): all four columns are int64, so identifier-like columns
# (subject_id, hadm_id, icd9_code) are presumably treated as continuous here —
# confirm this is intended.
evaluate(sample, train_data, metrics=['ContinuousKLDivergence'])

0.7673607594732967

In [42]:
# Score the synthetic sample using only the DiscreteKLDivergence metric.
# The recorded result is nan.
# NOTE(review): every column is int64 (see the dtypes output), so there are
# presumably no categorical columns for a discrete metric to compare — confirm
# against the SDV/SDMetrics docs. Casting the id/code columns to str before
# training (see the commented-out "Set columns' type" cell) may be what was
# intended.
evaluate(sample, train_data, metrics=['DiscreteKLDivergence'])

nan