In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# ! pip install sdv

In [3]:
# Ignore warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# category raised after this cell is hidden for the rest of the session.
# Consider narrowing it to specific categories/modules.
import warnings
warnings.filterwarnings("ignore")

In [4]:
import os

import numpy as np
import pandas as pd

In [5]:
# Read a CSV file as a DataFrame and drop the row-identifier columns.
def read_csv_no_rowid(file_path):
    """Load ``file_path`` with pandas and remove the row-identifier columns.

    The columns ``"Unnamed: 0"`` (the unnamed index column pandas produces when
    a CSV was written together with its index) and ``"row_id"`` are removed
    when present.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded table without the two identifier columns.
    """
    df = pd.read_csv(file_path)
    # errors="ignore" keeps the helper usable for files that lack one (or both)
    # of the identifier columns instead of raising KeyError.
    return df.drop(columns=["Unnamed: 0", "row_id"], errors="ignore")

In [6]:
# check NaN value

def nan_count(df):
    """Print a short missing-value report for ``df``.

    The report consists of the number of columns, the number of rows, a
    separator line and the per-column count of null entries. Nothing is
    returned.
    """
    n_columns = len(df.columns)
    print(f"Total columns: {n_columns}")
    n_rows = len(df)
    print(f"Total rows: {n_rows}")
    print("--------------")
    nulls_per_column = df.isnull().sum()
    print(nulls_per_column)

In [7]:
# Relative directory that the CSV table is read from and under which the
# trained model is later saved.
# NOTE(review): the name reads like a typo for "data_set_path" — confirm before renaming.
date_set_path = "../temp_sets_100/"

In [8]:
# Load the procedures_icd table with the row-identifier columns removed.
procedures_icd_sample_df = read_csv_no_rowid(date_set_path + "procedures_icd.csv")

In [9]:
# Preview the loaded frame; the rendered output is truncated to the first and last rows.
procedures_icd_sample_df

Unnamed: 0,subject_id,hadm_id,seq_num,icd9_code
0,569,116412,1,66
1,569,116412,2,3607
2,569,116412,3,46
3,569,116412,4,41
4,569,116412,5,3723
...,...,...,...,...
485,26884,123835,2,8851
486,49024,182358,1,9910
487,15440,158614,1,3511
488,15440,158614,2,3844


In [10]:
# Keep only the rows that carry a hadm_id; the name is rebound to the
# filtered frame rather than mutating it in place.
procedures_icd_sample_df = procedures_icd_sample_df.dropna(axis=0, subset=['hadm_id'])

In [11]:
# Report null counts after removing rows without a hadm_id.
nan_count(procedures_icd_sample_df)

Total columns: 4
Total rows: 490
--------------
subject_id    0
hadm_id       0
seq_num       0
icd9_code     0
dtype: int64


In [12]:
# NOTE(review): identical call to the previous cell and it prints the same
# report — likely a leftover re-run that can be removed.
nan_count(procedures_icd_sample_df)

Total columns: 4
Total rows: 490
--------------
subject_id    0
hadm_id       0
seq_num       0
icd9_code     0
dtype: int64


In [13]:
# Set columns' type
# procedures_icd_sample_df['subject_id'] = procedures_icd_sample_df['subject_id'].astype(str)
# procedures_icd_sample_df['hadm_id'] = procedures_icd_sample_df['hadm_id'].astype(str)

# procedures_icd_sample_df['icd9_code'] = procedures_icd_sample_df['icd9_code'].astype(str)

In [14]:
# Inspect column dtypes; the string casts in the preceding cell are commented
# out, so the columns keep the dtypes inferred by read_csv.
procedures_icd_sample_df.dtypes

subject_id    int64
hadm_id       int64
seq_num       int64
icd9_code     int64
dtype: object

In [15]:
# Number of rows left after cleaning.
len(procedures_icd_sample_df)

490

---

## Build model

In [16]:
from sdv.tabular import CTGAN
from sdv.evaluation import evaluate
from sdv.constraints import FixedCombinations

In [17]:
# List the column names available for the constraint defined next.
procedures_icd_sample_df.columns

Index(['subject_id', 'hadm_id', 'seq_num', 'icd9_code'], dtype='object')

In [18]:
# FixedCombinations constraint over the (subject_id, hadm_id) column pair —
# presumably so that only pairs seen in the training data are generated;
# confirm against the SDV constraint documentation.
# NOTE(review): the variable name mentions "icustay", but no icustay column is
# part of this constraint (the table has none).
fixed_subject_hadm_icustay_constraint = FixedCombinations(
    column_names=['subject_id', 'hadm_id']
)

In [19]:
# Constraint list that is passed to the CTGAN constructor.
procedures_icd_constraints = [fixed_subject_hadm_icustay_constraint]

In [36]:
# CTGAN synthesizer configured with the subject/admission constraint.
# NOTE(review): batch_size (1000) is larger than the 490 training rows shown
# in this notebook, and no random seed is fixed — confirm both are intended.
model = CTGAN(
    epochs=150,
    batch_size=1000,
    generator_lr=0.0005,
    constraints=procedures_icd_constraints,
    cuda=True,
    verbose=True,
)

In [37]:
# Row count checked again right before fitting (same expression as the earlier length check).
len(procedures_icd_sample_df)

490

In [38]:
# train_data is another name for the cleaned frame; no copy is made here.
train_data = procedures_icd_sample_df
# Fit the synthesizer; the model was built with verbose=True, and the recorded
# output shows per-epoch generator (G) and discriminator (D) losses.
model.fit(train_data)

Epoch 1, Loss G:  4.8260,Loss D:  0.0024
Epoch 2, Loss G:  4.8250,Loss D: -0.0026
Epoch 3, Loss G:  4.7827,Loss D:  0.0010
Epoch 4, Loss G:  4.7948,Loss D:  0.0017
Epoch 5, Loss G:  4.7395,Loss D: -0.0120
Epoch 6, Loss G:  4.7073,Loss D: -0.0136
Epoch 7, Loss G:  4.7086,Loss D: -0.0141
Epoch 8, Loss G:  4.6863,Loss D: -0.0087
Epoch 9, Loss G:  4.6682,Loss D: -0.0128
Epoch 10, Loss G:  4.6556,Loss D: -0.0183
Epoch 11, Loss G:  4.6111,Loss D: -0.0050
Epoch 12, Loss G:  4.5838,Loss D:  0.0055
Epoch 13, Loss G:  4.5839,Loss D: -0.0146
Epoch 14, Loss G:  4.5740,Loss D: -0.0084
Epoch 15, Loss G:  4.5585,Loss D:  0.0080
Epoch 16, Loss G:  4.5215,Loss D: -0.0139
Epoch 17, Loss G:  4.5209,Loss D:  0.0145
Epoch 18, Loss G:  4.4911,Loss D:  0.0196
Epoch 19, Loss G:  4.4448,Loss D:  0.0250
Epoch 20, Loss G:  4.4355,Loss D:  0.0367
Epoch 21, Loss G:  4.3870,Loss D:  0.0865
Epoch 22, Loss G:  4.4498,Loss D:  0.0581
Epoch 23, Loss G:  4.4186,Loss D:  0.0410
Epoch 24, Loss G:  4.4437,Loss D:  0.0213
E

In [39]:
# Persist the fitted model under the data-set directory. The target folder is
# created first so that saving does not fail with FileNotFoundError when
# "100_models/" does not exist yet.
model_dir = date_set_path + "100_models/"
os.makedirs(model_dir, exist_ok=True)
model.save(model_dir + "procedures_icd_model.pkl")

In [24]:
# model = CTGAN.load(("/content/drive/MyDrive/MSc Project/procedures_icd_model.pkl"))

In [41]:
# Draw as many synthetic rows as there are training rows.
# NOTE(review): no random seed is set in this notebook, so the sample
# presumably differs between runs — TODO confirm.
sample = model.sample(num_rows=len(train_data))

Sampling rows: 100%|██████████| 490/490 [00:00<00:00, 12699.80it/s]


In [42]:
# Check the synthetic sample for missing values.
nan_count(sample)

Total columns: 4
Total rows: 490
--------------
subject_id    0
hadm_id       0
seq_num       0
icd9_code     0
dtype: int64


In [43]:
# Inspect the synthetic sample's dtypes for comparison with the training frame.
sample.dtypes

subject_id    int64
hadm_id       int64
seq_num       int64
icd9_code     int64
dtype: object

In [44]:
# Score the synthetic sample against the training data using only the
# ContinuousKLDivergence metric.
evaluate(sample, train_data, metrics=['ContinuousKLDivergence'])

0.489854202279242

In [45]:
# Score the synthetic sample using only the DiscreteKLDivergence metric.
# NOTE(review): the recorded result is nan — presumably because every column
# is int64 and the metric finds no categorical/boolean columns to compare;
# TODO confirm, and cast the id/code columns if a discrete score is wanted.
evaluate(sample, train_data, metrics=['DiscreteKLDivergence'])

nan