In [None]:
#Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tqdm import tqdm

from keras.models import Sequential, Model
from keras.layers import GRU, Dense, Input
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy, MeanSquaredError

#### TimeGAN Synthetic Data generation

In [5]:
#getting the data
real_data = pd.read_csv('../Data/GOOG.csv')   
real_data['Date'] = pd.to_datetime(real_data['Date']) #making datetime
real_data['Date'] = real_data['Date'].values.astype(float) #converting to float for model

In [6]:
#setting parameters
seq_len = 30
n_seq = 22 
batch_size = 128 

#TimeGAN model parameters
hidden_dim = 24
num_layers = 3

In [7]:
#normalising the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(real_data).astype(np.float32)

In [8]:
#creating rolling windows
data = []
for i in range(len(real_data) - seq_len):
    data.append(scaled_data[i:i + seq_len])

n_windows = len(data)

In [9]:
#creating tf.data.Dataset
real_series = (tf.data.Dataset
               .from_tensor_slices(data)
               .shuffle(buffer_size=n_windows)
               .batch(batch_size))
real_series_iter = iter(real_series.repeat())

In [10]:
#setting up random series generator
def make_random_data():
    while True:
        yield np.random.uniform(low=0, high=1, size=(seq_len, n_seq))

random_series = iter(tf.data.Dataset
                     .from_generator(make_random_data, output_types=tf.float32)
                     .batch(batch_size)
                     .repeat())

In [11]:
#input placeholders
X = Input(shape=[seq_len, n_seq], name='RealData')
Z = Input(shape=[seq_len, n_seq], name='RandomData')

In [12]:
#RNN creation function
def make_rnn(n_layers, hidden_units, output_units, name):
    return Sequential([GRU(units=hidden_units,
                           return_sequences=True,
                           name=f'GRU_{i + 1}') for i in range(n_layers)] +
                      [Dense(units=output_units,
                             activation='sigmoid',
                             name='OUT')], name=name)

#creating the RNN's
#embedder
embedder = make_rnn(n_layers=3, 
                    hidden_units=hidden_dim, 
                    output_units=hidden_dim, 
                    name='Embedder')

#recovery
recovery = make_rnn(n_layers=3, 
                    hidden_units=hidden_dim, 
                    output_units=n_seq, 
                    name='Recovery')

#generator and discriminator
generator = make_rnn(n_layers=3, 
                     hidden_units=hidden_dim, 
                     output_units=hidden_dim, 
                     name='Generator')
discriminator = make_rnn(n_layers=3, 
                         hidden_units=hidden_dim, 
                         output_units=1, 
                         name='Discriminator')
supervisor = make_rnn(n_layers=2, 
                      hidden_units=hidden_dim, 
                      output_units=hidden_dim, 
                      name='Supervisor')

In [13]:
#training settings
train_steps = 10000
gamma = 1

#generic loss functions
mse = MeanSquaredError()
bce = BinaryCrossentropy()

In [14]:
#autoencoder training
H = embedder(X)
X_tilde = recovery(H)
autoencoder = Model(inputs=X,
                    outputs=X_tilde,
                    name='Autoencoder')
print(f'Autoencoder summary: {autoencoder.summary()}')

#autoencoder optimisation
autoencoder_optimizer = Adam()

#autoencoder training step
@tf.function
def train_autoencoder_init(x):
    with tf.GradientTape() as tape:
        x_tilde = autoencoder(x)
        embedding_loss_t0 = mse(x, x_tilde)
        e_loss_0 = 10 * tf.sqrt(embedding_loss_t0)

    var_list = embedder.trainable_variables + recovery.trainable_variables
    gradients = tape.gradient(e_loss_0, var_list)
    autoencoder_optimizer.apply_gradients(zip(gradients, var_list))
    return tf.sqrt(embedding_loss_t0)

#autoencoder trzining loop
for step in tqdm(range(train_steps)):
    X_ = next(real_series_iter)
    step_e_loss_t0 = train_autoencoder_init(X_)


Model: "Autoencoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 RealData (InputLayer)       [(None, 30, 22)]          0         
                                                                 
 Embedder (Sequential)       (None, 30, 24)            11256     
                                                                 
 Recovery (Sequential)       (None, 30, 22)            11350     
                                                                 
Total params: 22,606
Trainable params: 22,606
Non-trainable params: 0
_________________________________________________________________
Autoencoder summary: None


100%|██████████| 10000/10000 [10:59<00:00, 15.15it/s]


In [15]:
#Supervised training

#training optimiser
supervisor_optimizer = Adam()

#training step
@tf.function
def train_supervisor(x):
    with tf.GradientTape() as tape:
        h = embedder(x)
        h_hat_supervised = supervisor(h)
        g_loss_s = mse(h[:, 1:, :], h_hat_supervised[:, 1:, :])

    var_list = supervisor.trainable_variables
    gradients = tape.gradient(g_loss_s, var_list)
    supervisor_optimizer.apply_gradients(zip(gradients, var_list))
    return g_loss_s

#training loop
for step in tqdm(range(train_steps)):
    X_ = next(real_series_iter)
    step_g_loss_s = train_supervisor(X_)

100%|██████████| 10000/10000 [05:31<00:00, 30.17it/s]


In [16]:
#joint training: Generator adversarial architecture - supervised
E_hat = generator(Z)
H_hat = supervisor(E_hat)
Y_fake = discriminator(H_hat)
adversarial_supervised = Model(inputs=Z,
                               outputs=Y_fake,
                               name='AdversarialNetSupervised')

#plot_model(adversarial_supervised, show_shapes=True)
print(f'Adverserial supervised summary: {adversarial_supervised.summary()}')

#adverserial architecture in latent space
Y_fake_e = discriminator(E_hat)
adversarial_emb = Model(inputs=Z,
                    outputs=Y_fake_e,
                    name='AdversarialNet')

#plot_model(adversarial_emb, show_shapes=True)
print(f'Adverserial emb summary: {adversarial_emb.summary()}')  

Model: "AdversarialNetSupervised"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 RandomData (InputLayer)     [(None, 30, 22)]          0         
                                                                 
 Generator (Sequential)      (None, 30, 24)            11256     
                                                                 
 Supervisor (Sequential)     (None, 30, 24)            7800      
                                                                 
 Discriminator (Sequential)  (None, 30, 1)             10825     
                                                                 
Total params: 29,881
Trainable params: 29,881
Non-trainable params: 0
_________________________________________________________________
Adverserial supervised summary: None
Model: "AdversarialNet"
_________________________________________________________________
 Layer (type)                Output Shape  

In [17]:
#mean and variance loss
X_hat = recovery(H_hat)
synthetic_data = Model(inputs=Z,
                       outputs=X_hat,
                       name='SyntheticData')
#plot_model(synthetic_data, show_shapes=True)
print(f'Sythnetic data summary: {synthetic_data.summary()}')    


def get_generator_moment_loss(y_true, y_pred):
    y_true_mean, y_true_var = tf.nn.moments(x=y_true, axes=[0])
    y_pred_mean, y_pred_var = tf.nn.moments(x=y_pred, axes=[0])
    g_loss_mean = tf.reduce_mean(tf.abs(y_true_mean - y_pred_mean))
    g_loss_var = tf.reduce_mean(tf.abs(tf.sqrt(y_true_var + 1e-6) - tf.sqrt(y_pred_var + 1e-6)))
    return g_loss_mean + g_loss_var

Model: "SyntheticData"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 RandomData (InputLayer)     [(None, 30, 22)]          0         
                                                                 
 Generator (Sequential)      (None, 30, 24)            11256     
                                                                 
 Supervisor (Sequential)     (None, 30, 24)            7800      
                                                                 
 Recovery (Sequential)       (None, 30, 22)            11350     
                                                                 
Total params: 30,406
Trainable params: 30,406
Non-trainable params: 0
_________________________________________________________________
Sythnetic data summary: None


In [18]:
#discriminator
#real data architecture
Y_real = discriminator(H)
discriminator_model = Model(inputs=X,
                            outputs=Y_real,
                            name='DiscriminatorReal')

#plot_model(discriminator_model, show_shapes=True)
print(f'Discriminator models summary: {discriminator_model.summary()}')

Model: "DiscriminatorReal"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 RealData (InputLayer)       [(None, 30, 22)]          0         
                                                                 
 Embedder (Sequential)       (None, 30, 24)            11256     
                                                                 
 Discriminator (Sequential)  (None, 30, 1)             10825     
                                                                 
Total params: 22,081
Trainable params: 22,081
Non-trainable params: 0
_________________________________________________________________
Discriminator models summary: None


In [19]:
#optimisers
generator_optimizer = Adam()
discriminator_optimizer = Adam()
embedding_optimizer = Adam()

In [20]:
#generator step training
@tf.function
def train_generator(x, z):
    with tf.GradientTape() as tape:
        y_fake = adversarial_supervised(z)
        generator_loss_unsupervised = bce(y_true=tf.ones_like(y_fake),
                                          y_pred=y_fake)
        y_fake_e = adversarial_emb(z)
        generator_loss_unsupervised_e = bce(y_true=tf.ones_like(y_fake_e),
                                            y_pred=y_fake_e)
        h = embedder(x)
        h_hat_supervised = supervisor(h)
        generator_loss_supervised = mse(h[:, 1:, :], h_hat_supervised[:, 1:, :])
        x_hat = synthetic_data(z)
        generator_moment_loss = get_generator_moment_loss(x, x_hat)
        generator_loss = (generator_loss_unsupervised +
                          generator_loss_unsupervised_e +
                          100 * tf.sqrt(generator_loss_supervised) +
                          100 * generator_moment_loss)
    var_list = generator.trainable_variables + supervisor.trainable_variables
    gradients = tape.gradient(generator_loss, var_list)
    generator_optimizer.apply_gradients(zip(gradients, var_list))
    return generator_loss_unsupervised, generator_loss_supervised, generator_moment_loss

#embedding train step
@tf.function
def train_embedder(x):
    with tf.GradientTape() as tape:
        h = embedder(x)
        h_hat_supervised = supervisor(h)
        generator_loss_supervised = mse(h[:, 1:, :], h_hat_supervised[:, 1:, :])

        x_tilde = autoencoder(x)
        embedding_loss_t0 = mse(x, x_tilde)
        e_loss = 10 * tf.sqrt(embedding_loss_t0) + 0.1 * generator_loss_supervised
    var_list = embedder.trainable_variables + recovery.trainable_variables
    gradients = tape.gradient(e_loss, var_list)
    embedding_optimizer.apply_gradients(zip(gradients, var_list))
    return tf.sqrt(embedding_loss_t0)

#discriminator train step
@tf.function
def get_discriminator_loss(x, z):
    y_real = discriminator_model(x)
    discriminator_loss_real = bce(y_true=tf.ones_like(y_real),
                                  y_pred=y_real)
    y_fake = adversarial_supervised(z)
    discriminator_loss_fake = bce(y_true=tf.zeros_like(y_fake),
                                  y_pred=y_fake)
    y_fake_e = adversarial_emb(z)
    discriminator_loss_fake_e = bce(y_true=tf.zeros_like(y_fake_e),
                                    y_pred=y_fake_e)
    return (discriminator_loss_real +
            discriminator_loss_fake +
            gamma * discriminator_loss_fake_e)

@tf.function
def train_discriminator(x, z):
    with tf.GradientTape() as tape:
        discriminator_loss = get_discriminator_loss(x, z)
    var_list = discriminator.trainable_variables
    gradients = tape.gradient(discriminator_loss, var_list)
    discriminator_optimizer.apply_gradients(zip(gradients, var_list))
    return discriminator_loss

In [21]:
#training loop
step_g_loss_u = step_g_loss_s = step_g_loss_v = step_e_loss_t0 = step_d_loss = 0
for step in range(train_steps):
    # Train generator (twice as often as discriminator)
    for kk in range(2):
        X_ = next(real_series_iter)
        Z_ = next(random_series)
        # Train generator
        step_g_loss_u, step_g_loss_s, step_g_loss_v = train_generator(X_, Z_)
        # Train embedder
        step_e_loss_t0 = train_embedder(X_)
    X_ = next(real_series_iter)
    Z_ = next(random_series)
    step_d_loss = get_discriminator_loss(X_, Z_)
    if step_d_loss > 0.15:
        step_d_loss = train_discriminator(X_, Z_)
    if step % 1000 == 0:
        print(f'{step:6,.0f} | d_loss: {step_d_loss:6.4f} | g_loss_u: {step_g_loss_u:6.4f} | '
              f'g_loss_s: {step_g_loss_s:6.4f} | g_loss_v: {step_g_loss_v:6.4f} | e_loss_t0: {step_e_loss_t0:6.4f}')

#saving synthisizer
synthetic_data.save('timegan_synthetic_data_synthisizer.h5')

     0 | d_loss: 2.0710 | g_loss_u: 0.6819 | g_loss_s: 0.0005 | g_loss_v: 0.3343 | e_loss_t0: 0.0567
 1,000 | d_loss: 1.6145 | g_loss_u: 1.0555 | g_loss_s: 0.0000 | g_loss_v: 0.0505 | e_loss_t0: 0.0119
 2,000 | d_loss: 1.7458 | g_loss_u: 0.9766 | g_loss_s: 0.0001 | g_loss_v: 0.0243 | e_loss_t0: 0.0126
 3,000 | d_loss: 1.4082 | g_loss_u: 1.1271 | g_loss_s: 0.0002 | g_loss_v: 0.0941 | e_loss_t0: 0.0111
 4,000 | d_loss: 1.6551 | g_loss_u: 1.4664 | g_loss_s: 0.0001 | g_loss_v: 0.0907 | e_loss_t0: 0.0109
 5,000 | d_loss: 1.5184 | g_loss_u: 1.4200 | g_loss_s: 0.0001 | g_loss_v: 0.0381 | e_loss_t0: 0.0108
 6,000 | d_loss: 1.6997 | g_loss_u: 1.2203 | g_loss_s: 0.0000 | g_loss_v: 0.0315 | e_loss_t0: 0.0106
 7,000 | d_loss: 1.7667 | g_loss_u: 1.3464 | g_loss_s: 0.0001 | g_loss_v: 0.0396 | e_loss_t0: 0.0107
 8,000 | d_loss: 1.8113 | g_loss_u: 1.2142 | g_loss_s: 0.0001 | g_loss_v: 0.0178 | e_loss_t0: 0.0100
 9,000 | d_loss: 1.7873 | g_loss_u: 1.2495 | g_loss_s: 0.0000 | g_loss_v: 0.0263 | e_loss_t

In [22]:
#generate synthetic data
generated_data = []
for i in range(int(n_windows / batch_size)):
    Z_ = next(random_series)
    d = synthetic_data(Z_)
    generated_data.append(d)

generated_data = np.array(np.vstack(generated_data))


In [23]:
#rescaling
generated_data = (scaler.inverse_transform(generated_data
                                           .reshape(-1, n_seq))
                  .reshape(-1, seq_len, n_seq))

#getting generated data in same format as orig
cols = real_data.columns
m,n,r = generated_data.shape
out_arr = np.column_stack((np.repeat(np.arange(m),n),generated_data.reshape(m*n,-1)))
out_df = pd.DataFrame(out_arr)
out_df.drop(out_df.columns[0],axis=1,inplace=True)
out_df.columns = cols
TG_generated_data = out_df
TG_generated_data.head()


In [30]:
#formatting the data
TG_generated_data['Date'] = pd.to_datetime(TG_generated_data['Date']).dt.normalize() #changing Date to datetime
TG_generated_data = TG_generated_data.sort_values(by='Date') #ordering by date
TG_generated_data.drop(['Unnamed: 0'], axis=1, inplace=True) #dropping unnamed column 

#saving
TG_generated_data.to_csv("../Data/TimeGAN_synth_data.csv")