In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ydata_synthetic.synthesizers import ModelParameters
from ydata_synthetic.preprocessing.timeseries.timeseries_processor import TimeSeriesDataProcessor
from ydata_synthetic.synthesizers.timeseries import TimeGAN

In [2]:
# Load the training recordings and split them into one DataFrame per case.
df = pd.read_csv('../data/tum_synthetic_training_data/train.csv')
df_arr = [df[df['case']==case] for case in df['case'].unique()]

# Global heart-rate range, used for min-max scaling of the training windows and
# for mapping generated samples back to the original scale.
# Use the vectorized Series reductions: the builtin min()/max() loop over the
# Series in Python and give order-dependent results when NaN is present,
# whereas Series.min()/max() skip NaN.
min_hr = df['heart_rate'].min()
max_hr = df['heart_rate'].max()

In [3]:
# --- Layout of the generated dataset ---
num_ts = 100     # number of synthetic series to generate
num_days = 50    # days per synthetic series
seq_len = 288    # samples per window (one day at the 5-minute step used for generated timestamps)

# --- TimeGAN architecture (passed straight to the TimeGAN constructor) ---
n_seq = 1        # presumably the number of features per time step -- TODO confirm against ydata-synthetic
hidden_dim = 24
gamma = 1

# --- Generic GAN parameters (bundled into ModelParameters below) ---
noise_dim = 100
dim = 64
batch_size = 16
learning_rate = 1e-3

# --- Training control ---
log_step = 100
retrain = True   # True: train and save new models; False: load previously saved models
epochs = 300     # passed to TimeGAN.train as train_steps
samples = 16
assert samples % batch_size == 0

gan_args = ModelParameters(
    batch_size=batch_size,
    lr=learning_rate,
    noise_dim=noise_dim,
    layers_dim=dim,
)


In [4]:
# Build one training set per day offset relative to symptom onset:
# data[0] collects the window starting at the first symptom-onset sample,
# data[i] the window starting i * seq_len samples earlier (i = 1..5).
# Each window is min-max scaled with the global heart-rate range.
#
# Two fixes compared with the previous version:
#   * `6*[[]]` repeats a reference to ONE list, so all six entries were the same
#     object; independent lists are created instead.
#   * the window is sliced from `idx_start` (the offset position) rather than
#     from `idx_symptom`, which extracted the onset window for every offset.
data = [[] for _ in range(6)]
timestamps = df_arr[0]['timestamp'][:seq_len].to_numpy()
for df_case in df_arr:
    hr = df_case['heart_rate'].to_numpy()
    so = df_case['symptom_onset'].to_numpy()
    symptom_day = np.argwhere(so==1)
    if len(symptom_day) == 0:
        # Case without a symptom-onset marker: nothing to anchor the windows on.
        continue
    idx_symptom = symptom_day[0][0]

    for i in range(6):
        idx_start = idx_symptom - i * seq_len
        if idx_start < 0:
            # Not enough history before onset for this (and any larger) offset.
            break
        hr_t = (hr[idx_start:idx_start+seq_len] - min_hr) / (max_hr - min_hr)
        # Keep complete windows only; a recording that ends early yields a short slice.
        if len(hr_t) == seq_len:
            data[i].append(hr_t)


In [5]:
# One TimeGAN per entry of `data`. With `retrain` set, each model is trained on
# data[i] and saved to disk; otherwise the previously saved model is loaded.
synth = []
for i in range(6):
    model_path = f'synthesizer_{i}.pkl'
    if retrain:
        model = TimeGAN(
            model_parameters=gan_args,
            hidden_dim=hidden_dim,
            seq_len=seq_len,
            n_seq=n_seq,
            gamma=gamma,
        )
        synth.append(model)
        model.train(data[i], train_steps=epochs)
        model.save(model_path)
    else:
        synth.append(TimeGAN.load(model_path))

2022-10-01 19:10:30.662672: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-01 19:10:31.227133: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10794 MB memory:  -> device: 0, name: Tesla K80, pci bus id: 0001:00:00.0, compute capability: 3.7
Emddeding network training:   0%|          | 0/300 [00:00<?, ?it/s]2022-10-01 19:10:45.374628: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8201
Emddeding network training: 100%|██████████| 300/300 [01:02<00:00,  4.78it/s]
Supervised network training: 100%|██████████| 300/300 [00:37<00:00,  8.07it/s]
Joint networks training: 100%|██████████| 300/300 [15:41<00:00,  3.14s/it]
Emdde

In [6]:
# Assemble `num_ts` synthetic series of `num_days` days each.
# Models 0-4 supply a five-day block whose last day is labelled as symptom onset;
# model 5 supplies all remaining days. The block is inserted after a randomly
# chosen number of leading days.
n_sick_days = 5
total_hr, total_so, total_ts, total_cs = [], [], [], []
for k in range(num_ts):
    # Draw one window from each of models 0..4 (in that order), then reverse so
    # that the window from model 0 ends up last in the block.
    sick_windows = [synth[i].sample(1)[0] for i in range(n_sick_days)]
    sick_days = np.stack(sick_windows[::-1], axis=0)

    # The result is truncated to exactly n_healthy windows, in case sample()
    # returns more than requested.
    n_healthy = num_days - n_sick_days
    healthy_days = synth[5].sample(n_healthy)[:n_healthy]

    # Number of days placed before the five-day block.
    idx = np.random.randint(10, num_days - 10)
    hr_data = np.concatenate(
        (healthy_days[:idx], sick_days, healthy_days[idx:]), axis=0
    ).reshape(-1)
    # Undo the min-max scaling to get back to the original heart-rate range.
    hr_data = hr_data * (max_hr - min_hr) + min_hr

    # Label vector: 1 for every sample of the last day of the block, 0 elsewhere.
    onset_day = idx + n_sick_days - 1
    so_data = np.zeros(num_days * seq_len, dtype=int)
    so_data[onset_day * seq_len:(onset_day + 1) * seq_len] = 1

    timestamps = pd.date_range('2020-10-1', periods=len(hr_data), freq='5min').to_numpy()
    case = np.full(len(hr_data), f'synthetic_{k}')

    total_hr.append(hr_data)
    total_so.append(so_data)
    total_ts.append(timestamps)
    total_cs.append(case)

# Flatten the per-series arrays into single long columns.
tt_hr = np.concatenate(total_hr)
tt_so = np.concatenate(total_so)
tt_ts = np.concatenate(total_ts)
tt_cs = np.concatenate(total_cs)

Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.56it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.53it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.71it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.84it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.89it/s]
Synthetic data generation: 100%|██████████| 3/3 [00:00<00:00,  7.75it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.73it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.80it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.43it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.75it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.77it/s]
Synthetic data generation: 100%|██████████| 3/3 [00:00<00:00,  7.77it/s]
Synthetic data generation: 100%|██████████| 1/1 [00:00<00:00,  7.91it/s]
Synthetic data generation: 100%|██████████| 1/1 [00

In [7]:
# Combine the generated columns into one long-format table and write it to CSV
# without the index column.
synthetic_columns = {
    'case': tt_cs,
    'timestamp': tt_ts,
    'heart_rate': tt_hr,
    'symptom_onset': tt_so,
}
new_df = pd.DataFrame(synthetic_columns)
new_df.to_csv('../data/tum_synthetic_generated_data/synthetic.csv', index=False)