In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Directory that holds the input CSV files, relative to the notebook's working
# directory. Keep the trailing slash: file names are appended to this value by
# plain string concatenation.
date_set_path = "../temp_sets/"

## Read temp datasets

In [5]:
# Read a CSV file as a DataFrame, lower-case the column names, and drop the
# index-like bookkeeping columns (ROW_ID and leftover "Unnamed: 0*" columns).
def read_csv_no_rowid(file_path):
    """Load a CSV file and strip its bookkeeping columns.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents with every column name lower-cased and the columns
        ``unnamed: 0.1``, ``unnamed: 0`` and ``row_id`` removed when present.
    """
    df = pd.read_csv(file_path)
    df.columns = [name.lower() for name in df.columns]

    # errors='ignore': a file that lacks one of these columns must still load
    # instead of failing with KeyError.
    return df.drop(columns=['unnamed: 0.1', 'unnamed: 0', 'row_id'], errors='ignore')

In [6]:
# check NaN value

def nan_count(df):
    """Print the column count, the row count and the per-column null totals of ``df``."""
    n_rows, n_cols = df.shape
    print(f"Total columns: {n_cols}")
    print(f"Total rows: {n_rows}")
    print("--------------")
    missing_per_column = df.isna().sum()
    print(missing_per_column)

### Chartevents

In [7]:
# Load the CHARTEVENTS random sample with its bookkeeping columns removed.
chartevents_sample_df = read_csv_no_rowid(
    f"{date_set_path}CHARTEVENTS_random_sample_1.csv"
)

In [8]:
# Preview the first rows to confirm the load and the lower-cased column names.
chartevents_sample_df.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,itemid,charttime,storetime,cgid,value,valuenum,valueuom,warning,error,resultstatus,stopped
0,10694,138159,294193.0,220210,2153-08-21 10:34:00,,,27.0,27.0,insp/min,0.0,0.0,,
1,1459,172420,212644.0,224162,2195-11-10 11:56:00,2195-11-10 11:57:00,15047.0,8.0,8.0,insp/min,0.0,0.0,,
2,8492,118470,225777.0,220210,2117-07-03 14:00:00,2117-07-03 15:01:00,19593.0,19.0,19.0,insp/min,0.0,0.0,,
3,10694,138159,294193.0,220293,2153-08-14 06:13:00,,,18.0,18.0,L/min,0.0,0.0,,
4,12831,119477,295273.0,224697,2195-09-19 12:02:00,,,11.0,11.0,cmH2O,0.0,0.0,,


In [9]:
# Drop the columns that are not used further in this notebook
chartevents_sample_df.drop(
    columns=[
        'storetime', 'cgid', 'valuenum', 'valueuom',
        'warning', 'error', 'resultstatus', 'stopped',
    ],
    inplace=True,
)

In [10]:
# Remove the rows that have no ICU stay identifier
chartevents_sample_df.dropna(axis='index', subset=['icustay_id'], inplace=True)

In [11]:
# Sanity check: no row should still be missing icustay_id (expect an empty frame)
chartevents_sample_df[chartevents_sample_df['icustay_id'].isna()]

Unnamed: 0,subject_id,hadm_id,icustay_id,itemid,charttime,value


In [12]:
# Convert data types: identifier columns become strings, charttime a timestamp
chartevents_sample_df['subject_id'] = chartevents_sample_df['subject_id'].astype(str)
chartevents_sample_df['hadm_id'] = chartevents_sample_df['hadm_id'].astype(str)
# icustay_id is loaded as float (e.g. 294193.0), so a direct str cast would
# produce '294193.0'. Go through int64 first; the rows with a missing
# icustay_id were already dropped, so the integer cast cannot hit NaN.
chartevents_sample_df['icustay_id'] = chartevents_sample_df['icustay_id'].astype('int64').astype(str)

chartevents_sample_df['itemid'] = chartevents_sample_df['itemid'].astype(str)

chartevents_sample_df['charttime'] = pd.to_datetime(chartevents_sample_df['charttime'])

In [13]:
# Confirm the casts.
# NOTE(review): the saved output below still lists integer dtypes for the id
# columns, which does not match the str casts in the previous cell — it looks
# stale; re-run after a kernel restart to refresh it.
chartevents_sample_df.dtypes

subject_id             int64
hadm_id                int64
icustay_id             int32
itemid                 int64
charttime     datetime64[ns]
value                 object
dtype: object

In [14]:
# Count the missing values per column before the 'value' column is filled.
nan_count(chartevents_sample_df)

Total columns: 6
Total rows: 825983
--------------
subject_id       0
hadm_id          0
icustay_id       0
itemid           0
charttime        0
value         5164
dtype: int64


In [15]:
# Replace missing chart values with the placeholder string 'Na'.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: that form is chained assignment, which pandas deprecates and which
# leaves the frame unchanged once Copy-on-Write is active.
chartevents_sample_df['value'] = chartevents_sample_df['value'].fillna('Na')

In [16]:
# Re-check: the 'value' column should now report zero missing entries.
nan_count(chartevents_sample_df)

Total columns: 6
Total rows: 825983
--------------
subject_id    0
hadm_id       0
icustay_id    0
itemid        0
charttime     0
value         0
dtype: int64


In [18]:
# Final dtype check before the frame is handed to the synthesizer.
chartevents_sample_df.dtypes

subject_id            object
hadm_id               object
icustay_id            object
itemid                object
charttime     datetime64[ns]
value                 object
dtype: object

##  Build Network

---

### CTGAN

In [19]:
from sdv.tabular import CTGAN
from sdv.evaluation import evaluate
from sdv.constraints import FixedCombinations

In [20]:
# Constraint over the patient / admission / ICU-stay identifier triple.
# Presumably restricts synthetic rows to combinations seen in the training
# data — confirm against the SDV FixedCombinations documentation.
fixed_subject_hadm_icustay_constraint = FixedCombinations(column_names=['subject_id', 'hadm_id', 'icustay_id'])

# Same kind of constraint for the (itemid, value) pair.
fixed_item_value_constraint = FixedCombinations(column_names=['itemid', 'value'])

In [21]:
# Both constraints are passed to the synthesizer together.
chartevents_constraints = [fixed_subject_hadm_icustay_constraint, fixed_item_value_constraint]

In [22]:
# CTGAN synthesizer: 300 epochs, verbose training log, CUDA requested.
model = CTGAN(
    epochs=300,
    verbose=True,
    cuda=True,
    constraints=chartevents_constraints,
)

In [23]:
# Number of rows left after cleaning.
len(chartevents_sample_df)

825983

In [24]:
# Fit on a 1,000-row random subset of the cleaned events.
# random_state pins the draw so a re-run trains and evaluates on the same rows.
train_data = chartevents_sample_df.sample(n=1000, random_state=42)
model.fit(train_data)

Epoch 1, Loss G:  6.7569,Loss D: -0.0011
Epoch 2, Loss G:  6.7536,Loss D: -0.0013
Epoch 3, Loss G:  6.7645,Loss D:  0.0017
Epoch 4, Loss G:  6.7405,Loss D: -0.0083
Epoch 5, Loss G:  6.7443,Loss D: -0.0083
Epoch 6, Loss G:  6.7227,Loss D: -0.0114
Epoch 7, Loss G:  6.7362,Loss D:  0.0015
Epoch 8, Loss G:  6.7101,Loss D: -0.0011
Epoch 9, Loss G:  6.7032,Loss D:  0.0044
Epoch 10, Loss G:  6.7014,Loss D: -0.0025
Epoch 11, Loss G:  6.7294,Loss D:  0.0053
Epoch 12, Loss G:  6.6997,Loss D: -0.0120
Epoch 13, Loss G:  6.6963,Loss D:  0.0016
Epoch 14, Loss G:  6.7035,Loss D: -0.0071
Epoch 15, Loss G:  6.6980,Loss D: -0.0005
Epoch 16, Loss G:  6.6509,Loss D:  0.0078
Epoch 17, Loss G:  6.7037,Loss D:  0.0204
Epoch 18, Loss G:  6.6613,Loss D: -0.0013
Epoch 19, Loss G:  6.7197,Loss D: -0.0078
Epoch 20, Loss G:  6.6938,Loss D: -0.0056
Epoch 21, Loss G:  6.6820,Loss D: -0.0163
Epoch 22, Loss G:  6.7046,Loss D: -0.0126
Epoch 23, Loss G:  6.6801,Loss D: -0.0076
Epoch 24, Loss G:  6.6943,Loss D:  0.0048
E

In [25]:
# Generate as many synthetic rows as there are training rows, so the size stays
# in step with train_data if the training subset is ever resized.
sample = model.sample(num_rows=len(train_data))

In [39]:
# NOTE(review): this cell carries execution count 39 but sits between cells 25
# and 26, so it was run out of order. train_data was already drawn from the
# frame in an earlier cell, so on a top-to-bottom run this cast does not reach
# train_data — confirm whether it belongs with the other dtype casts.
chartevents_sample_df['value'] = chartevents_sample_df['value'].astype(str)

In [26]:
# Score the synthetic rows against the training rows with ContinuousKLDivergence.
# NOTE(review): the saved result is nan — presumably because no numeric columns
# remain after the str casts; confirm.
evaluate(sample, train_data, metrics=['ContinuousKLDivergence'])

nan

In [27]:
# Score the synthetic rows against the training rows with DiscreteKLDivergence.
evaluate(sample, train_data, metrics=['DiscreteKLDivergence'])

0.2878326468072839

In [28]:
# Per-metric report (aggregate=False returns one row per metric).
# Compare against train_data, like the other evaluate calls in this notebook,
# rather than a label slice of the full frame: .loc[0:1000] picks the first
# rows of the file, not the rows the model was fitted on.
evaluate(sample, train_data, aggregate=False)

Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,BNLogLikelihood,BayesianNetwork Log Likelihood,-18.420681,1e-08,-inf,0.0,MAXIMIZE,
1,LogisticDetection,LogisticRegression Detection,,,0.0,1.0,MAXIMIZE,DetectionMetric: Unable to be fit with error C...
2,SVCDetection,SVC Detection,,,0.0,1.0,MAXIMIZE,DetectionMetric: Unable to be fit with error C...
3,BinaryDecisionTreeClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
4,BinaryAdaBoostClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
5,BinaryLogisticRegression,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
6,BinaryMLPClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
7,MulticlassDecisionTreeClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
8,MulticlassMLPClassifier,,,,0.0,1.0,MAXIMIZE,`target` must be passed either directly or ins...
9,LinearRegression,,,,-inf,1.0,MAXIMIZE,`target` must be passed either directly or ins...


### Timeseries table (error)

In [29]:
# from sdv.timeseries import PAR
# from sdv.constraints import FixedCombinations

In [30]:
# entity_columns = ['subject_id', 'hadm_id', 'icustay_id']
# context_columns = []
# sequence_index = 'charttime'

In [31]:
# fixed_itemid_value_constraint = FixedCombinations(column_names=['itemid', 'value'], handling_strategy='transform')

# constraints = [fixed_itemid_value_constraint]

In [32]:
# model = PAR(entity_columns=entity_columns,context_columns=context_columns,sequence_index=sequence_index, constraints=constraints)

In [33]:
# model.fit(chartevents_sample_df.loc[18170:19000, :])

In [34]:
# model.sample(num_sequences=2)

In [35]:
# # Check whether any entity has a sequence of length 1; that case raises an error (bug)
# sequences = chartevents_sample_df[['subject_id', 'hadm_id', 'icustay_id', 'itemid']].groupby(['subject_id', 'hadm_id', 'icustay_id', 'itemid']).size().reset_index().rename(columns={0: 'sequence_length'})
# sequences[sequences['sequence_length'] == 1]