In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os
import tensorflow as tf
from pathlib import Path
from tensorflow.keras.layers import GRU, Dense, RNN, GRUCell, Input
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.losses import BinaryCrossentropy, MeanSquaredError
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Enable on-demand GPU memory growth on every visible GPU (TensorFlow expects
# the setting to be the same across GPUs); otherwise run on the CPU.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    print('Using GPU')
    for gpu_device in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu_device, True)
else:
    print('Using CPU')

Using GPU


In [None]:
# Run index; formatted into the log directory name (experiment_00, experiment_01, ...).
experiment = 0

In [None]:
# Root folder for what this notebook writes (experiment folders, HDF5 store).
results_path = Path('time_gan')
# exist_ok makes the cell safe to re-run without a separate existence check.
results_path.mkdir(exist_ok=True)

In [None]:
# Per-experiment folder (e.g. time_gan/experiment_00) that receives the
# summary logs and the models/arrays saved by this run.
log_dir = results_path / f'experiment_{experiment:02}'
# parents/exist_ok: create missing ancestors and tolerate re-runs.
log_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# HDF5 file that later receives the synthetic series (written via pd.HDFStore).
hdf_store = results_path / 'TimeSeriesGAN.h5'

In [None]:
# Window length in time steps (reassigned to 1920 further down, before the windows are built).
seq_len = 10
# Number of series, i.e. feature columns per time step.
n_seq = 6
# Number of windows per batch.
batch_size = 128

In [None]:
# Labels of the six series; used as plot titles and as HDF5 column names.
tickers = ['s1', 's2', 's3', 's4', 's5', 's6']

In [None]:
# Colab-only: mount Google Drive so the data files under drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Roland and Luca
# Load the training CSV from the mounted Drive (presumably Roland's and Luca's
# folder layout — other users need to adjust the path).
df_train = pd.read_csv("./drive/MyDrive/genhack/data/df_train.csv")

In [None]:
# Pierre
# df_train = pd.read_csv("./drive/MyDrive/Master Data Science - Polytechnique/MCMC/GenHack/data/df_train.csv")

In [None]:
#df_train = pd.read_csv(os.getcwd()+'/the_ginger_elephants-master/data/df_train.csv')
# Drop the 'dates' column; the remaining columns are what gets scaled and
# modelled. The `columns=` keyword replaces the deprecated positional axis
# argument, which newer pandas versions reject.
df = df_train.drop(columns='dates')

  df = df_train.drop('dates',1)


In [None]:
# Ordered split without shuffling: the first 80% of rows train, the last 20% test.
train, test = train_test_split(df, test_size=0.2, shuffle=False)

In [None]:
# Fit the min-max scaler on the training split only and keep the result as float32.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(train).astype(np.float32)

In [None]:
# Sanity check: (rows, columns) of the scaled training array.
scaled_data.shape

(7694, 6)

In [None]:
# Overrides the earlier seq_len = 10: every window built below spans 1920 rows.
seq_len = 1920

In [None]:
# Slice the scaled series into overlapping windows of seq_len consecutive
# rows, one window per start offset.
data = [scaled_data[start:start + seq_len]
        for start in range(len(train) - seq_len)]

n_windows = len(data)

In [None]:
# Input pipeline for the real windows: shuffle across all windows, then batch.
real_series = tf.data.Dataset.from_tensor_slices(data)
real_series = real_series.shuffle(buffer_size=n_windows)
real_series = real_series.batch(batch_size)
# Endless iterator used by every training loop.
real_series_iter = iter(real_series.repeat())

In [None]:
# Roland and Luca
# Load the noise array from the mounted Drive.
# NOTE(review): the saved output of this cell is a FileNotFoundError — confirm the path before running.
noise = np.load("./drive/MyDrive/genhack/data/noise.npy")

FileNotFoundError: ignored

In [None]:
# Pierre
##noise = np.load(os.getcwd()+'/the_ginger_elephants-master/data/noise.npy')

#noise = np.load("./drive/MyDrive/Master Data Science - Polytechnique/MCMC/GenHack/data/noise.npy")

In [None]:
# Repeat the loaded noise 192 times (for a 2-D array: along the rows), then
# keep only the first 10 columns, the latent width declared for the `Z` input.
# NOTE(review): 192 looks like seq_len / 10, i.e. it assumes the file holds 10
# rows — confirm. Re-running this cell tiles the already-tiled array again.
noise = np.tile(noise,(192,1))
noise = noise[:,:10]


def make_random_data():
    """Endlessly yield the module-level `noise` array.

    Every item is the same array, so the stream carries no fresh randomness.
    NOTE(review): presumably intentional (fixed latent noise loaded from a
    file) — confirm, since GAN inputs are usually resampled per item.
    """
    while True:
        yield noise
        
# Infinite stream of noise batches: each batch stacks batch_size items
# produced by make_random_data.
random_series = iter(tf.data.Dataset
                     .from_generator(make_random_data, output_types=tf.float32)
                     .batch(batch_size)
                     .repeat())

In [None]:
# Peek at the shape of one noise batch (this consumes a batch from the iterator).
next(random_series).shape

In [None]:
# Width of the latent space: GRU units of every network and size of the embeddings.
hidden_dim = 5
# Intended depth (stacked GRU layers) of the networks.
num_layers = 3

In [None]:
# Summary writer that logs the training scalars into the experiment folder.
writer = tf.summary.create_file_writer(log_dir.as_posix())

In [None]:
# Symbolic model inputs. Real windows are (seq_len, n_seq); noise windows are
# (seq_len, 10), matching the 10 columns kept from `noise`.
# (An earlier variant used the fixed shapes [10, n_seq] and [10, 50].)

X = Input(shape=[seq_len, n_seq], name='RealData')
Z = Input(shape=[seq_len, 10], name='RandomData')

In [None]:
# Display both symbolic inputs to check their shapes.
X,Z

In [None]:
def make_rnn(n_layers, hidden_units, output_units, name):
    """Build a stack of GRU layers topped by a sigmoid Dense layer.

    Args:
        n_layers: number of GRU layers; they are named GRU_1 .. GRU_n.
        hidden_units: units of every GRU layer.
        output_units: units of the final Dense layer named 'OUT'.
        name: name given to the resulting Sequential model.

    Returns:
        A Sequential model whose GRU layers all return full sequences, so the
        Dense layer is applied at every time step.
    """
    layers = []
    for layer_no in range(1, n_layers + 1):
        layers.append(GRU(units=hidden_units,
                          return_sequences=True,
                          name=f'GRU_{layer_no}'))
    layers.append(Dense(units=output_units,
                        activation='sigmoid',
                        name='OUT'))
    return Sequential(layers, name=name)

In [None]:
# Autoencoder halves. Depth comes from the shared `num_layers` setting instead
# of a repeated literal.
# Embedder: real features -> latent space of width hidden_dim.
embedder = make_rnn(n_layers=num_layers,
                    hidden_units=hidden_dim,
                    output_units=hidden_dim,
                    name='Embedder')
# Recovery: latent space -> n_seq features.
recovery = make_rnn(n_layers=num_layers,
                    hidden_units=hidden_dim,
                    output_units=n_seq,
                    name='Recovery')

In [None]:
# Adversarial components. Depth comes from the shared `num_layers` setting
# instead of repeated literals.
# Generator: noise -> latent space of width hidden_dim.
generator = make_rnn(n_layers=num_layers,
                     hidden_units=hidden_dim,
                     output_units=hidden_dim,
                     name='Generator')
# Discriminator: latent sequence -> one sigmoid score per time step.
discriminator = make_rnn(n_layers=num_layers,
                         hidden_units=hidden_dim,
                         output_units=1,
                         name='Discriminator')
# Supervisor: latent sequence -> latent sequence, one layer shallower than the
# other networks (as in the TimeGAN reference implementation).
supervisor = make_rnn(n_layers=num_layers - 1,
                      hidden_units=hidden_dim,
                      output_units=hidden_dim,
                      name='Supervisor')

In [None]:
# Number of iterations used by each training loop (autoencoder, supervisor, joint).
train_steps = 1000
# Weight of the discriminator's loss term on raw generator embeddings.
gamma = 1

In [None]:
# Loss objects shared by all training-step functions.
mse = MeanSquaredError()
bce = BinaryCrossentropy()

In [None]:
# Autoencoder graph: X -> embedder -> H (latent) -> recovery -> X_tilde.
H = embedder(X)
X_tilde = recovery(H)

autoencoder = Model(inputs=X,
                    outputs=X_tilde,
                    name='Autoencoder')

In [None]:
# Print the layer/parameter overview of the autoencoder.
autoencoder.summary()

In [None]:
# Adam with default settings for the autoencoder pre-training phase.
autoencoder_optimizer = Adam()

In [None]:
@tf.function
def train_autoencoder_init(x):
    """Run one pre-training step of the autoencoder on a batch of real windows.

    The optimised loss is 10 * RMSE between `x` and its reconstruction;
    gradients are applied to the embedder and recovery weights.

    Args:
        x: batch of real sequences.

    Returns:
        The reconstruction RMSE (unscaled) for this batch.
    """
    with tf.GradientTape() as tape:
        reconstruction_rmse = tf.sqrt(mse(x, autoencoder(x)))
        scaled_loss = 10 * reconstruction_rmse

    weights = embedder.trainable_variables + recovery.trainable_variables
    grads = tape.gradient(scaled_loss, weights)
    autoencoder_optimizer.apply_gradients(zip(grads, weights))
    return reconstruction_rmse

In [None]:
# Phase 1: pre-train the autoencoder (embedder + recovery) on real windows.
for step in tqdm(range(train_steps)):
    X_ = next(real_series_iter)
    step_e_loss_t0 = train_autoencoder_init(X_)
    # Summary logging is disabled for this phase:
    #with writer.as_default():
     #   tf.summary.scalar('Loss Autoencoder Init', step_e_loss_t0, step=step)

In [None]:
# Persist the pre-trained autoencoder inside the experiment folder.
autoencoder.save(log_dir / 'autoencoder')

In [None]:
# Reference only: pasted Keras save/load snippets kept as a string literal, so
# none of the statements inside are executed.
'''# Load weights
model.load_weights(checkpoint_path)

# Create a new model instance
model = create_model()

# Save the weights using the `checkpoint_path` format
model.save_weights(checkpoint_path.format(epoch=0))

######## ######### ######## ######### ######## #########

# Save the weights
model.save_weights('./checkpoints/my_checkpoint')

# Create a new model instance
model = create_model()

# Restore the weights
model.load_weights('./checkpoints/my_checkpoint')

######## ######### ######## ######### ######## #########

# Create and train a new model instance.
model = create_model()
model.fit(train_images, train_labels, epochs=5)

# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('saved_model/my_model')

# my_model directory
ls saved_model

# Contains an assets folder, saved_model.pb, and variables folder.
ls saved_model/my_model

new_model = tf.keras.models.load_model('saved_model/my_model')

# Check its architecture
new_model.summary()'''

In [None]:
# Adam with default settings for the supervisor pre-training phase.
supervisor_optimizer = Adam()

In [None]:
@tf.function
def train_supervisor(x):
    """Run one pre-training step of the supervisor on a batch of real windows.

    The batch is embedded and passed through the supervisor; the supervisor
    output at steps 0..T-2 is compared (MSE) with the embedding at steps
    1..T-1. Only the supervisor's weights are updated.

    Args:
        x: batch of real sequences.

    Returns:
        The supervised MSE loss for this batch.
    """
    with tf.GradientTape() as tape:
        latent = embedder(x)
        next_step_guess = supervisor(latent)
        supervised_loss = mse(latent[:, 1:, :], next_step_guess[:, :-1, :])

    # Read the weights only after the forward pass above has run.
    supervisor_weights = supervisor.trainable_variables
    grads = tape.gradient(supervised_loss, supervisor_weights)
    supervisor_optimizer.apply_gradients(zip(grads, supervisor_weights))
    return supervised_loss

In [None]:
# Phase 2: pre-train the supervisor on embeddings of real windows and log its loss.
for step in tqdm(range(train_steps)):
    X_ = next(real_series_iter)
    step_g_loss_s = train_supervisor(X_)
    with writer.as_default():
        tf.summary.scalar('Loss Generator Supervised Init', step_g_loss_s, step=step)

In [None]:
# Persist the pre-trained supervisor inside the experiment folder.
supervisor.save(log_dir / 'supervisor')

In [None]:
# Supervised adversarial path: Z -> generator -> supervisor -> discriminator.
E_hat = generator(Z)
H_hat = supervisor(E_hat)
Y_fake = discriminator(H_hat)

adversarial_supervised = Model(inputs=Z,
                               outputs=Y_fake,
                               name='AdversarialNetSupervised')

In [None]:
# Print the layer/parameter overview of the supervised adversarial model.
adversarial_supervised.summary()

In [None]:
# Direct adversarial path: Z -> generator -> discriminator (supervisor skipped).
Y_fake_e = discriminator(E_hat)

adversarial_emb = Model(inputs=Z,
                    outputs=Y_fake_e,
                    name='AdversarialNet')

In [None]:
# Print the layer/parameter overview of the direct adversarial model.
adversarial_emb.summary()

In [None]:
# Sampling path: Z -> generator -> supervisor -> recovery gives synthetic series.
X_hat = recovery(H_hat)
synthetic_data = Model(inputs=Z,
                       outputs=X_hat,
                       name='SyntheticData')

In [None]:
# Print the layer/parameter overview of the synthetic-data model.
synthetic_data.summary()

In [None]:
def get_generator_moment_loss(y_true, y_pred):
    """Moment-matching loss between two batches.

    Mean and variance are taken over axis 0 of each input. The loss is the
    mean absolute difference of the means plus the mean absolute difference
    of the standard deviations (each variance gets 1e-6 added before the
    square root).

    Args:
        y_true: reference tensor.
        y_pred: tensor compared against the reference.

    Returns:
        Scalar tensor: mean-gap term plus standard-deviation-gap term.
    """
    true_mean, true_var = tf.nn.moments(x=y_true, axes=[0])
    pred_mean, pred_var = tf.nn.moments(x=y_pred, axes=[0])

    mean_gap = tf.abs(true_mean - pred_mean)
    std_gap = tf.abs(tf.sqrt(true_var + 1e-6) - tf.sqrt(pred_var + 1e-6))
    return tf.reduce_mean(mean_gap) + tf.reduce_mean(std_gap)

In [None]:
# Discriminator on real data: X -> embedder -> discriminator.
Y_real = discriminator(H)
discriminator_model = Model(inputs=X,
                            outputs=Y_real,
                            name='DiscriminatorReal')

In [None]:
# Print the layer/parameter overview of the real-data discriminator model.
discriminator_model.summary()

In [None]:
# One Adam optimizer (default settings) per component updated during joint training.
generator_optimizer = Adam()
discriminator_optimizer = Adam()
embedding_optimizer = Adam()

In [None]:
@tf.function
def train_generator(x, z):
    """Run one joint-training step for the generator and the supervisor.

    The optimised loss is the sum of:
      * the adversarial loss on the supervised path (generator -> supervisor
        -> discriminator) against targets of one,
      * the adversarial loss on the direct path (generator -> discriminator)
        against targets of one,
      * 100 * sqrt(next-step supervised MSE in latent space),
      * 100 * the moment-matching loss between `x` and the synthetic data.

    Args:
        x: batch of real sequences.
        z: batch of noise sequences.

    Returns:
        Tuple (adversarial loss on the supervised path, supervised MSE,
        moment loss). The direct-path adversarial loss is part of the
        optimised total but is not returned.
    """
    with tf.GradientTape() as tape:
        y_fake = adversarial_supervised(z)
        generator_loss_unsupervised = bce(y_true=tf.ones_like(y_fake),
                                          y_pred=y_fake)

        y_fake_e = adversarial_emb(z)
        generator_loss_unsupervised_e = bce(y_true=tf.ones_like(y_fake_e),
                                            y_pred=y_fake_e)
        h = embedder(x)
        h_hat_supervised = supervisor(h)
        # Next-step objective: the supervisor output at step t is compared
        # with the embedding at step t + 1, the same alignment that
        # train_supervisor uses. Slicing both tensors with 1: would compare
        # identical time steps instead.
        generator_loss_supervised = mse(h[:, 1:, :], h_hat_supervised[:, :-1, :])

        x_hat = synthetic_data(z)
        generator_moment_loss = get_generator_moment_loss(x, x_hat)

        generator_loss = (generator_loss_unsupervised +
                          generator_loss_unsupervised_e +
                          100 * tf.sqrt(generator_loss_supervised) +
                          100 * generator_moment_loss)

    # Only the generator and supervisor weights are updated here.
    var_list = generator.trainable_variables + supervisor.trainable_variables
    gradients = tape.gradient(generator_loss, var_list)
    generator_optimizer.apply_gradients(zip(gradients, var_list))
    return generator_loss_unsupervised, generator_loss_supervised, generator_moment_loss


In [None]:
@tf.function
def train_embedder(x):
    """Run one joint-training step for the embedder and recovery networks.

    The optimised loss is 10 * reconstruction RMSE plus 0.1 * the supervisor's
    next-step MSE in latent space. Gradients are applied to the embedder and
    recovery weights only.

    Args:
        x: batch of real sequences.

    Returns:
        The reconstruction RMSE between `x` and its reconstruction.
    """
    with tf.GradientTape() as tape:
        h = embedder(x)
        h_hat_supervised = supervisor(h)
        # Next-step objective: the supervisor output at step t is compared
        # with the embedding at step t + 1, the same alignment that
        # train_supervisor uses. Slicing both tensors with 1: would compare
        # identical time steps instead.
        generator_loss_supervised = mse(h[:, 1:, :], h_hat_supervised[:, :-1, :])

        x_tilde = autoencoder(x)
        embedding_loss_t0 = mse(x, x_tilde)
        e_loss = 10 * tf.sqrt(embedding_loss_t0) + 0.1 * generator_loss_supervised

    var_list = embedder.trainable_variables + recovery.trainable_variables
    gradients = tape.gradient(e_loss, var_list)
    embedding_optimizer.apply_gradients(zip(gradients, var_list))
    return tf.sqrt(embedding_loss_t0)

In [None]:
@tf.function
def get_discriminator_loss(x, z):
    """Compute the discriminator loss for one real batch and one noise batch.

    Real sequences are scored against targets of one; both synthetic paths
    (with and without the supervisor) are scored against targets of zero.
    The direct-path term is weighted by the module-level `gamma`.

    Args:
        x: batch of real sequences.
        z: batch of noise sequences.

    Returns:
        Scalar tensor: real loss + supervised-path fake loss
        + gamma * direct-path fake loss.
    """
    real_scores = discriminator_model(x)
    fake_scores = adversarial_supervised(z)
    fake_scores_e = adversarial_emb(z)

    loss_real = bce(y_true=tf.ones_like(real_scores), y_pred=real_scores)
    loss_fake = bce(y_true=tf.zeros_like(fake_scores), y_pred=fake_scores)
    loss_fake_e = bce(y_true=tf.zeros_like(fake_scores_e), y_pred=fake_scores_e)
    return loss_real + loss_fake + gamma * loss_fake_e

In [None]:
@tf.function
def train_discriminator(x, z):
    """Run one optimisation step of the discriminator.

    Args:
        x: batch of real sequences.
        z: batch of noise sequences.

    Returns:
        The discriminator loss computed before the weight update.
    """
    with tf.GradientTape() as tape:
        d_loss = get_discriminator_loss(x, z)

    # Only the discriminator's own weights are updated.
    d_weights = discriminator.trainable_variables
    d_grads = tape.gradient(d_loss, d_weights)
    discriminator_optimizer.apply_gradients(zip(d_grads, d_weights))
    return d_loss

In [None]:
# Peek at the shape of one noise batch (this consumes a batch from the iterator).
next(random_series).shape

In [None]:
# Phase 3: joint training of generator/supervisor, embedder/recovery and
# discriminator. The loss trackers start at 0 so they are defined before the
# first assignment inside the loop.
step_g_loss_u = step_g_loss_s = step_g_loss_v = step_e_loss_t0 = step_d_loss = 0
for step in range(train_steps):
    # Train generator (twice as often as discriminator)
    for kk in range(2):
        X_ = next(real_series_iter)
        Z_ = next(random_series)

        # Train generator
        step_g_loss_u, step_g_loss_s, step_g_loss_v = train_generator(X_, Z_)
        # Train embedder
        step_e_loss_t0 = train_embedder(X_)

    # Evaluate the discriminator on a fresh batch and update it only while its
    # loss is above 0.15.
    X_ = next(real_series_iter)
    Z_ = next(random_series)
    step_d_loss = get_discriminator_loss(X_, Z_)
    if step_d_loss > 0.15:
        step_d_loss = train_discriminator(X_, Z_)

    # Print a progress line every 10 steps.
    if step % 10 == 0:
        print(f'{step:6,.0f} | d_loss: {step_d_loss:6.4f} | g_loss_u: {step_g_loss_u:6.4f} | '
              f'g_loss_s: {step_g_loss_s:6.4f} | g_loss_v: {step_g_loss_v:6.4f} | e_loss_t0: {step_e_loss_t0:6.4f}')

    # Log every loss of this step to the summary writer.
    with writer.as_default():
        tf.summary.scalar('G Loss S', step_g_loss_s, step=step)
        tf.summary.scalar('G Loss U', step_g_loss_u, step=step)
        tf.summary.scalar('G Loss V', step_g_loss_v, step=step)
        tf.summary.scalar('E Loss T0', step_e_loss_t0, step=step)
        tf.summary.scalar('D Loss', step_d_loss, step=step)

     0 | d_loss: 2.0569 | g_loss_u: 0.7401 | g_loss_s: 0.0004 | g_loss_v: 0.2473 | e_loss_t0: 0.0819
    10 | d_loss: 1.9444 | g_loss_u: 0.9059 | g_loss_s: 0.0017 | g_loss_v: 0.1513 | e_loss_t0: 0.0782
    20 | d_loss: 1.9052 | g_loss_u: 1.0809 | g_loss_s: 0.0007 | g_loss_v: 0.1491 | e_loss_t0: 0.0790
    30 | d_loss: 1.9020 | g_loss_u: 1.1693 | g_loss_s: 0.0004 | g_loss_v: 0.1481 | e_loss_t0: 0.0787
    40 | d_loss: 1.8945 | g_loss_u: 1.1321 | g_loss_s: 0.0003 | g_loss_v: 0.1475 | e_loss_t0: 0.0788
    50 | d_loss: 1.8883 | g_loss_u: 1.0792 | g_loss_s: 0.0003 | g_loss_v: 0.1475 | e_loss_t0: 0.0788
    60 | d_loss: 1.8809 | g_loss_u: 1.0651 | g_loss_s: 0.0003 | g_loss_v: 0.1475 | e_loss_t0: 0.0788
    70 | d_loss: 1.8700 | g_loss_u: 1.0706 | g_loss_s: 0.0003 | g_loss_v: 0.1471 | e_loss_t0: 0.0784
    80 | d_loss: 1.8572 | g_loss_u: 1.0603 | g_loss_s: 0.0002 | g_loss_v: 0.1451 | e_loss_t0: 0.0781
    90 | d_loss: 1.8404 | g_loss_u: 1.0308 | g_loss_s: 0.0002 | g_loss_v: 0.1453 | e_loss_t

In [None]:
# Persist the trained synthetic-data model inside the experiment folder.
synthetic_data.save(log_dir / 'synthetic_data')

In [None]:
# Generate synthetic windows batch by batch: one noise batch for every full
# batch that fits into n_windows.
generated_data = [synthetic_data(next(random_series))
                  for _ in range(int(n_windows / batch_size))]

In [None]:
# Concatenate the per-batch outputs along the first axis into one array.
generated_data = np.vstack(generated_data)
generated_data.shape

In [None]:
# Save the generated array (still in the scaler's output range) to the experiment folder.
np.save(log_dir / 'generated_data.npy', generated_data)

In [None]:
# Undo the min-max scaling: flatten to (rows, n_seq) for the scaler, then
# restore the (windows, seq_len, n_seq) layout.
# NOTE: this rebinds generated_data, so re-running the cell applies the inverse transform twice.
generated_data = (scaler.inverse_transform(generated_data.reshape(-1, n_seq)).reshape(-1, seq_len, n_seq))
generated_data.shape

In [None]:
# Write the synthetic series, flattened to one row per time step, to the HDF5
# store under the key 'data/synthetic'.
with pd.HDFStore(hdf_store) as store:
    store.put('data/synthetic', pd.DataFrame(generated_data.reshape(-1, n_seq),
                                             columns=tickers))

In [None]:
# Compare one random real window with one random synthetic window, one panel
# per series.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 7))
axes = axes.flatten()

# Index against the array that was actually generated: it holds only whole
# batches, so it can contain fewer than n_windows windows and sampling from
# range(n_windows) could raise an IndexError.
synthetic = generated_data[np.random.randint(len(generated_data))]

# Real window of the same length, taken from the unscaled training frame.
idx = np.random.randint(len(train) - seq_len)
real = train.iloc[idx: idx + seq_len]

# Real values use the primary y-axis; synthetic values use a secondary one.
for j, ticker in enumerate(tickers):
    (pd.DataFrame({'Real': real.iloc[:, j].values,
                   'Synthetic': synthetic[:, j]})
     .plot(ax=axes[j],
           title=ticker,
           secondary_y='Synthetic', style=['-', '--'],
           lw=1))
sns.despine()
fig.tight_layout()

In [None]:
def AndersonDarling(data, predictions):
    """Average Anderson-Darling-type distance between data and predictions.

    For every column ("station") the predictions are sorted and, for the i-th
    smallest prediction, u_i = (#{data values < prediction} + 1) / (N + 2) is
    computed. The column statistic is

        W = -N - (1/N) * sum_i (2i - 1) * (ln u_i + ln(1 - u_{N+1-i}))

    and the function returns the mean of W over all columns.

    Args:
        data: 2-D array of shape (N, P) with the reference observations.
        predictions: 2-D array with the same number of rows N and at least
            P columns.

    Returns:
        The statistic averaged over the P columns.

    Raises:
        ValueError: if `predictions` does not have N rows.
    """
    N, P = data.shape
    if predictions.shape[0] != N:
        raise ValueError(
            f'predictions must have {N} rows like data, got {predictions.shape[0]}')

    idx = np.arange(1, N + 1)  # ranks 1..N, identical for every column
    total = 0.0
    for station in range(P):
        temp_predictions = predictions[:, station].reshape(-1)
        temp_data = data[:, station].reshape(-1)
        sorted_array = np.sort(temp_predictions)
        # The +1 / (N + 2) keeps u strictly inside (0, 1), so both logs are finite.
        count = (1/(N+2)) * np.array([(temp_data < order).sum()+1 for order in sorted_array])
        weighted = (2*idx - 1) * (np.log(count) + np.log(1-count[::-1]))
        # Accumulate per column; overwriting here would keep only the last column.
        total += - N - np.sum(weighted)/N
    return total/P

In [None]:
# Score one randomly chosen synthetic window against held-out data. The metric
# needs both arrays to have the same number of rows, so take seq_len test rows;
# the index is drawn from the windows that were actually generated.
AndersonDarling(np.array(test[:seq_len]), generated_data[np.random.randint(len(generated_data))])

In [None]:
# Repeat the check with another random synthetic window. The metric needs both
# arrays to have the same number of rows, so take seq_len test rows; the index
# is drawn from the windows that were actually generated.
AndersonDarling(np.array(test[:seq_len]), generated_data[np.random.randint(len(generated_data))])

In [None]:
# Inspect the values of one synthetic window (index 1).
generated_data[1]

In [None]:
# Reference notes on saving/loading Keras models, kept for documentation only.
# They are commented out because `model`, `checkpoint_path` and `create_model`
# are not defined in this notebook, so running the cell would raise a NameError.
#
# # Load weights
# model.load_weights(checkpoint_path)
#
# # Create a new model instance
# model = create_model()
#
# # Save the weights using the `checkpoint_path` format
# model.save_weights(checkpoint_path.format(epoch=0))
#
# ######## ######### ######## ######### ######## #########
#
# # Save the weights
# model.save_weights('./checkpoints/my_checkpoint')
#
# # Create a new model instance
# model = create_model()
#
# # Restore the weights
# model.load_weights('./checkpoints/my_checkpoint')
#
# ######## ######### ######## ######### ######## #########
#
# # Create and train a new model instance.
# model = create_model()
# model.fit(train_images, train_labels, epochs=5)
#
# # Save the entire model as a SavedModel.
# !mkdir -p saved_model
# model.save('saved_model/my_model')
#
# # my_model directory
# ls saved_model
#
# # Contains an assets folder, saved_model.pb, and variables folder.
# ls saved_model/my_model
#
# new_model = tf.keras.models.load_model('saved_model/my_model')
#
# # Check its architecture
# new_model.summary()

In [None]:
# Create the target folder (shell command) and save the synthetic-data model
# to saved_model/my_model.
!mkdir -p saved_model
synthetic_data.save('saved_model/my_model')

In [None]:
# Load the model back from disk; the cells below generate data with this copy.
new_model = tf.keras.models.load_model('saved_model/my_model')

In [None]:
# Generate synthetic windows with the reloaded model: one noise batch for
# every full batch that fits into n_windows.
generated_data = [new_model(next(random_series))
                  for _ in range(int(n_windows / batch_size))]

In [None]:
# Concatenate the per-batch outputs along the first axis into one array.
generated_data = np.vstack(generated_data)
generated_data.shape

In [None]:
# Undo the min-max scaling: flatten to (rows, n_seq) for the scaler, then
# restore the (windows, seq_len, n_seq) layout.
# NOTE: this rebinds generated_data, so re-running the cell applies the inverse transform twice.
generated_data = (scaler.inverse_transform(generated_data.reshape(-1, n_seq)).reshape(-1, seq_len, n_seq))
generated_data.shape

In [None]:
# Score one random window produced by the reloaded model against held-out
# data. The metric needs both arrays to have the same number of rows, so take
# seq_len test rows; the index is drawn from the windows actually generated.
AndersonDarling(np.array(test[:seq_len]), generated_data[np.random.randint(len(generated_data))])

In [None]:
# Compare one random real window with one random window produced by the
# reloaded model, one panel per series.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 7))
axes = axes.flatten()

# Index against the array that was actually generated: it holds only whole
# batches, so it can contain fewer than n_windows windows and sampling from
# range(n_windows) could raise an IndexError.
synthetic = generated_data[np.random.randint(len(generated_data))]

# Real window of the same length, taken from the unscaled training frame.
idx = np.random.randint(len(train) - seq_len)
real = train.iloc[idx: idx + seq_len]

# Real values use the primary y-axis; synthetic values use a secondary one.
for j, ticker in enumerate(tickers):
    (pd.DataFrame({'Real': real.iloc[:, j].values,
                   'Synthetic': synthetic[:, j]})
     .plot(ax=axes[j],
           title=ticker,
           secondary_y='Synthetic', style=['-', '--'],
           lw=1))
sns.despine()
fig.tight_layout()