In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# ! pip install sdv

In [3]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np

In [5]:
# read csv file as dataframe, and drop ROW_ID column
def read_csv_no_rowid(file_path):
    df = pd.read_csv(file_path)
    df.drop(["Unnamed: 0", 'row_id'], axis=1, inplace=True)

    return df

In [6]:
# check NaN value

def nan_count(df):
    print("Total columns: " + str(len(df.columns)))
    print("Total rows: " + str(len(df)))
    print("--------------")
    print(df.isnull().sum())

In [7]:
date_set_path = "../temp_sets/"

In [8]:
outputevents_sample_df = read_csv_no_rowid(date_set_path + "outputevents.csv")

In [9]:
outputevents_sample_df.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,itemid,value,valueuom,storetime,cgid,stopped,newbottle,iserror
0,11290,137961.0,264377.0,2173-11-26 00:30:00,43175,26.0,ml,2173-11-26 00:35:00,17206,,,
1,11290,137961.0,264377.0,2173-11-20 02:00:00,43175,3.0,ml,2173-11-20 03:13:00,17581,,,
2,11290,137961.0,264377.0,2173-11-20 08:00:00,43175,28.0,ml,2173-11-20 11:38:00,14489,,,
3,11290,137961.0,264377.0,2173-11-20 12:30:00,43175,12.0,ml,2173-11-20 14:38:00,14489,,,
4,11290,137961.0,264377.0,2173-11-20 20:30:00,43175,11.0,ml,2173-11-20 21:46:00,18125,,,


In [10]:
outputevents_sample_df.dropna(subset=['hadm_id'], inplace=True, axis=0)
outputevents_sample_df.dropna(subset=['icustay_id'], inplace=True, axis=0)

In [11]:
outputevents_sample_df.drop(['valueuom', 'storetime', 'cgid', 'stopped', 'newbottle', 'iserror'], axis=1, inplace=True)

In [12]:
nan_count(outputevents_sample_df)

Total columns: 6
Total rows: 100914
--------------
subject_id       0
hadm_id          0
icustay_id       0
charttime        0
itemid           0
value         1840
dtype: int64


In [13]:
outputevents_sample_df['value'].fillna(value=0, inplace=True)

In [14]:
nan_count(outputevents_sample_df)

Total columns: 6
Total rows: 100914
--------------
subject_id    0
hadm_id       0
icustay_id    0
charttime     0
itemid        0
value         0
dtype: int64


In [15]:
outputevents_sample_df['hadm_id'] = outputevents_sample_df['hadm_id'].astype(int)
outputevents_sample_df['icustay_id'] = outputevents_sample_df['icustay_id'].astype(int)
outputevents_sample_df['charttime'] = pd.to_datetime(outputevents_sample_df['charttime'])

In [16]:
outputevents_sample_df.dtypes

subject_id             int64
hadm_id                int32
icustay_id             int32
charttime     datetime64[ns]
itemid                 int64
value                float64
dtype: object

In [17]:
len(outputevents_sample_df)

100914

---

## Build model

In [18]:
from sdv.tabular import CTGAN
from sdv.evaluation import evaluate
from sdv.constraints import FixedCombinations

In [25]:
outputevents_sample_df.columns

Index(['subject_id', 'hadm_id', 'icustay_id', 'charttime', 'itemid', 'value'], dtype='object')

In [27]:
fixed_subject_hadm_icustay_constraint = FixedCombinations(
    column_names=['subject_id', 'hadm_id', 'icustay_id'],
    handling_strategy='transform'
)

fixed_item_amount_orderid_linkorderid_constraint = FixedCombinations(
    column_names=['itemid', 'value'],
    handling_strategy='transform'
)

In [28]:
outputevents_constraints = [fixed_subject_hadm_icustay_constraint, fixed_item_amount_orderid_linkorderid_constraint]

In [29]:
model = CTGAN(
    constraints=outputevents_constraints, 
    batch_size=1000,
    cuda=True, 
    verbose=True, 
    epochs=100)

In [30]:
len(outputevents_sample_df)

100914

In [31]:
train_data = outputevents_sample_df.sample(n=10000)
model.fit(train_data)

Epoch 1, Loss G:  7.1368,Loss D: -0.0201
Epoch 2, Loss G:  7.0501,Loss D: -0.0212
Epoch 3, Loss G:  7.0214,Loss D: -0.0348
Epoch 4, Loss G:  6.9985,Loss D: -0.0287
Epoch 5, Loss G:  6.9790,Loss D: -0.0599
Epoch 6, Loss G:  6.9255,Loss D: -0.0535
Epoch 7, Loss G:  6.9385,Loss D: -0.1220
Epoch 8, Loss G:  6.8626,Loss D: -0.0700
Epoch 9, Loss G:  6.8663,Loss D: -0.0889
Epoch 10, Loss G:  6.8527,Loss D: -0.1074
Epoch 11, Loss G:  6.7276,Loss D: -0.0311
Epoch 12, Loss G:  6.8236,Loss D: -0.1052
Epoch 13, Loss G:  6.7147,Loss D: -0.0888
Epoch 14, Loss G:  6.6588,Loss D: -0.0279
Epoch 15, Loss G:  6.6909,Loss D: -0.0680
Epoch 16, Loss G:  6.5692,Loss D: -0.0224
Epoch 17, Loss G:  6.4311,Loss D: -0.0086
Epoch 18, Loss G:  6.3279,Loss D: -0.0534
Epoch 19, Loss G:  6.1744,Loss D: -0.0160
Epoch 20, Loss G:  5.9774,Loss D: -0.0190
Epoch 21, Loss G:  5.9288,Loss D: -0.0533
Epoch 22, Loss G:  5.6505,Loss D: -0.0184
Epoch 23, Loss G:  5.4267,Loss D: -0.0312
Epoch 24, Loss G:  5.3009,Loss D: -0.0573
E

In [37]:
model.save("../outputevents_model.pkl")

In [25]:
# model = CTGAN.load(("/content/drive/MyDrive/MSc Project/outputevents_model.pkl"))

In [32]:
sample = model.sample(num_rows=10000)

In [33]:
nan_count(sample)

Total columns: 6
Total rows: 10000
--------------
subject_id    0
hadm_id       0
icustay_id    0
charttime     0
itemid        0
value         0
dtype: int64


In [34]:
sample.dtypes

subject_id             int64
hadm_id                int32
icustay_id             int32
charttime     datetime64[ns]
itemid                 int64
value                float64
dtype: object

In [29]:
evaluate(sample.loc[:, ['subject_id', 'hadm_id', 'icustay_id']], train_data.loc[:, ['subject_id', 'hadm_id', 'icustay_id']], metrics=['ContinuousKLDivergence'])

0.8563774666790959

In [30]:
evaluate(sample.loc[:, ['amount', 'originalamount']], train_data.loc[:, ['amount', 'originalamount']], metrics=['ContinuousKLDivergence'])

0.9968915106957409

In [31]:
evaluate(sample, train_data, metrics=['DiscreteKLDivergence'])

nan

In [35]:
evaluate(sample, outputevents_sample_df.sample(n=10000), aggregate=False)

Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,BNLogLikelihood,BayesianNetwork Log Likelihood,,,-inf,0.0,MAXIMIZE,"Cannot find fields of types ('categorical', 'b..."
1,LogisticDetection,LogisticRegression Detection,0.945988,0.9459876,0.0,1.0,MAXIMIZE,
2,SVCDetection,SVC Detection,0.482996,0.4829958,0.0,1.0,MAXIMIZE,
3,BinaryDecisionTreeClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
4,BinaryAdaBoostClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
5,BinaryLogisticRegression,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
6,BinaryMLPClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
7,MulticlassDecisionTreeClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
8,MulticlassMLPClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
9,LinearRegression,,,,-inf,1.0,MAXIMIZE,`target` must be passed either directly or ins...
