In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# ! pip install sdv

In [3]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np

In [5]:
# read csv file as dataframe, and drop ROW_ID column
def read_csv_no_rowid(file_path):
    df = pd.read_csv(file_path)
    df.drop(["Unnamed: 0", 'row_id'], axis=1, inplace=True)

    return df

In [6]:
# check NaN value

def nan_count(df):
    print("Total columns: " + str(len(df.columns)))
    print("Total rows: " + str(len(df)))
    print("--------------")
    print(df.isnull().sum())

In [7]:
date_set_path = "../temp_sets_100/"

In [8]:
procedures_icd_sample_df = read_csv_no_rowid(date_set_path + "procedures_icd.csv")

In [9]:
procedures_icd_sample_df

Unnamed: 0,subject_id,hadm_id,seq_num,icd9_code
0,569,116412,1,66
1,569,116412,2,3607
2,569,116412,3,46
3,569,116412,4,41
4,569,116412,5,3723
...,...,...,...,...
485,26884,123835,2,8851
486,49024,182358,1,9910
487,15440,158614,1,3511
488,15440,158614,2,3844


In [10]:
procedures_icd_sample_df.dropna(subset=['hadm_id'], inplace=True, axis=0)

In [11]:
nan_count(procedures_icd_sample_df)

Total columns: 4
Total rows: 490
--------------
subject_id    0
hadm_id       0
seq_num       0
icd9_code     0
dtype: int64


In [12]:
nan_count(procedures_icd_sample_df)

Total columns: 4
Total rows: 490
--------------
subject_id    0
hadm_id       0
seq_num       0
icd9_code     0
dtype: int64


In [13]:
# Set columns' type
# procedures_icd_sample_df['subject_id'] = procedures_icd_sample_df['subject_id'].astype(str)
# procedures_icd_sample_df['hadm_id'] = procedures_icd_sample_df['hadm_id'].astype(str)

# procedures_icd_sample_df['icd9_code'] = procedures_icd_sample_df['icd9_code'].astype(str)

In [14]:
procedures_icd_sample_df.dtypes

subject_id    int64
hadm_id       int64
seq_num       int64
icd9_code     int64
dtype: object

In [15]:
len(procedures_icd_sample_df)

490

---

## Build model

In [16]:
from sdv.tabular import CTGAN
from sdv.evaluation import evaluate
from sdv.constraints import FixedCombinations

In [17]:
procedures_icd_sample_df.columns

Index(['subject_id', 'hadm_id', 'seq_num', 'icd9_code'], dtype='object')

In [18]:
fixed_subject_hadm_icustay_constraint = FixedCombinations(
    column_names=['subject_id', 'hadm_id']
)

In [19]:
procedures_icd_constraints = [fixed_subject_hadm_icustay_constraint]

In [20]:
model = CTGAN(
    constraints=procedures_icd_constraints, 
    generator_lr=0.0005,
    batch_size=1000,
    cuda=True, 
    verbose=True, 
    epochs=150)

In [21]:
len(procedures_icd_sample_df)

490

In [22]:
train_data = procedures_icd_sample_df
model.fit(train_data)

Epoch 1, Loss G:  4.7871,Loss D: -0.0024
Epoch 2, Loss G:  4.7621,Loss D: -0.0018
Epoch 3, Loss G:  4.7448,Loss D: -0.0035
Epoch 4, Loss G:  4.6804,Loss D: -0.0065
Epoch 5, Loss G:  4.6998,Loss D: -0.0085
Epoch 6, Loss G:  4.6838,Loss D: -0.0144
Epoch 7, Loss G:  4.6469,Loss D: -0.0096
Epoch 8, Loss G:  4.6291,Loss D: -0.0054
Epoch 9, Loss G:  4.6166,Loss D:  0.0046
Epoch 10, Loss G:  4.5862,Loss D:  0.0135
Epoch 11, Loss G:  4.5859,Loss D:  0.0036
Epoch 12, Loss G:  4.5477,Loss D:  0.0356
Epoch 13, Loss G:  4.5561,Loss D:  0.0041
Epoch 14, Loss G:  4.5378,Loss D:  0.0214
Epoch 15, Loss G:  4.5235,Loss D:  0.0293
Epoch 16, Loss G:  4.5102,Loss D:  0.0347
Epoch 17, Loss G:  4.5133,Loss D:  0.0085
Epoch 18, Loss G:  4.4982,Loss D:  0.0287
Epoch 19, Loss G:  4.4983,Loss D:  0.0388
Epoch 20, Loss G:  4.5052,Loss D:  0.0232
Epoch 21, Loss G:  4.4459,Loss D:  0.0039
Epoch 22, Loss G:  4.4578,Loss D:  0.0225
Epoch 23, Loss G:  4.4790,Loss D: -0.0236
Epoch 24, Loss G:  4.4823,Loss D: -0.0027
E

In [23]:
model.save(date_set_path + "100_models/" + "procedures_icd_model.pkl")

In [24]:
# model = CTGAN.load(("/content/drive/MyDrive/MSc Project/procedures_icd_model.pkl"))

In [25]:
sample = model.sample(num_rows=len(train_data))

Sampling rows: 100%|██████████| 490/490 [00:00<00:00, 13168.17it/s]


In [35]:
sample.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd9_code
0,49024,182358,12,3472
1,97158,152158,8,2825
2,31440,157693,10,2679
3,71297,111213,1,2817
4,19815,139461,10,2698


In [26]:
nan_count(sample)

Total columns: 4
Total rows: 490
--------------
subject_id    0
hadm_id       0
seq_num       0
icd9_code     0
dtype: int64


In [27]:
sample.dtypes

subject_id    int64
hadm_id       int64
seq_num       int64
icd9_code     int64
dtype: object

In [28]:
evaluate(sample, train_data, metrics=['ContinuousKLDivergence'])

0.33342425715442175

In [29]:
evaluate(sample, train_data, metrics=['DiscreteKLDivergence'])

nan

In [30]:
from sdv.sampling import Condition

In [36]:
admission_condition = Condition({
    'seq_num': 12
}, num_rows=10)

In [37]:
model.sample_conditions(conditions=admission_condition)

TypeError: reduce() arg 2 must support iteration