In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
# !pip3.8 install tensorflow-addons

In [3]:
import warnings
warnings.filterwarnings("ignore")

import random
from copy import deepcopy
from time import time

import sklearn
import numpy as np
import pandas as pd
import tensorflow as tf

import seaborn as sns
import matplotlib.pyplot as plt

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.metrics import MeanSquaredError
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.layers import Input, Dense, Layer, InputSpec, Lambda, Add, Multiply, LeakyReLU, ReLU
from tensorflow.keras import regularizers, activations, initializers, constraints, Sequential, layers, optimizers
from tensorflow.keras.regularizers import l1_l2 #l2, l1
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.constraints import UnitNorm, Constraint
from tensorflow.keras.callbacks import EarlyStopping, Callback, ModelCheckpoint, CallbackList
from tensorflow.keras.utils import plot_model

from tensorboard.plugins.hparams import api as hp

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import scipy.io as scio
from scipy.linalg import orth

In [4]:
# %load_ext tensorboard

In [5]:
# !rm -rf ./logs/hparam_tuning_0
# !mkdir ./logs/

# # !rm -rf ./checkpoints/
# # !mkdir ./checkpoints/

In [19]:

from pairs_trading_package.pairs_trading_backtester import (
    SeriesAnalyser, DataProcessor
)

# Helper objects from the pairs-trading backtester package.
series_analyser = SeriesAnalyser()
data_processor = DataProcessor()

# intraday prices, loaded from a local pickle
df_prices = pd.read_pickle('../data_folder/original/commodity_ETFs_from_2014_complete.pickle')

# Each entry holds (train period, test period, a third date string);
# SPLIT_IDX selects which entry this notebook run uses.
splits = [
    [('01-01-2012', '31-12-2014'), ('01-01-2015', '31-12-2015'), '2014-01-01'],
    [('01-01-2013', '31-12-2015'), ('01-01-2016', '31-12-2016'), '2015-01-01'],
    [('01-01-2014', '31-12-2016'), ('01-01-2017', '31-12-2017'), '2016-01-01'],
]
SPLIT_IDX = 2

# split data in training and test
train_period = splits[SPLIT_IDX][0]
test_period = splits[SPLIT_IDX][1]
df_prices_train, df_prices_test = data_processor.split_data(
    df_prices, train_period, test_period, remove_nan=True
)

df_train_returns = data_processor.get_return_series(df_prices_train)

# The first 70% of the training-period returns feed the model; the remaining
# 30% (still inside the training period) is held back as df_test_set.
n_return_rows = len(df_train_returns)
n_fit_rows = int(n_return_rows * 0.7)
df_training_set = df_train_returns[:n_fit_rows]
df_test_set = df_train_returns[n_fit_rows:n_return_rows]

Total of 116 tickers
Total of 116 tickers after removing tickers with Nan values


In [20]:
# Model input width: one feature per column of the training returns.
INPUT_SHAPE = df_training_set.shape[1]
# Upper bound on training epochs per trial.
epochs = 1000

# Hyper-parameter search space shown in the TensorBoard HParams dashboard.
HP_BATCH_SIZE = hp.HParam('batch_size', hp.RealInterval(10.0, float(INPUT_SHAPE)))
HP_L2 = hp.HParam('l2', hp.RealInterval(0.001, 0.5))
HP_NOISE_FACTOR = hp.HParam('noise_factor', hp.RealInterval(0.1, 0.5))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd', 'rmsprop']))
HP_DEPTH = hp.HParam('depth', hp.Discrete([1, 3, 6]))

# Scalar summary tags written once per epoch during training.
METRIC_1 = 'train_loss'
METRIC_2 = 'test_loss'

_tuned_hparams = [HP_BATCH_SIZE, HP_L2, HP_NOISE_FACTOR, HP_OPTIMIZER, HP_DEPTH]
_tracked_metrics = [
    hp.Metric(METRIC_1, display_name='Train Loss'),
    hp.Metric(METRIC_2, display_name='Test Loss'),
]

# Register the experiment-level configuration in the log directory of this split.
_experiment_writer = tf.summary.create_file_writer('logs/hparam_tuning_' + str(SPLIT_IDX))
with _experiment_writer.as_default():
    hp.hparams_config(hparams=_tuned_hparams, metrics=_tracked_metrics)

In [22]:

# StandardScaler is not imported by any of the visible import cells
# (`import sklearn` alone does not bind it), so import it here to keep the
# cell runnable on a fresh kernel.
from sklearn.preprocessing import StandardScaler

# Column-by-column cross-product matrices (X^T X) of the return series:
# one for the fitting subset, one for the held-back subset.
train_X_cov = pd.DataFrame(np.dot(df_training_set.T, df_training_set))
refinement_X_cov = pd.DataFrame(np.dot(df_test_set.T, df_test_set))

# Fit the scaler on the fitting subset only, then apply the same
# transformation to both matrices so the held-back data does not influence
# the scaling statistics.
cov_scaler = StandardScaler()
cov_scaler.fit(train_X_cov)

train_X_cov = pd.DataFrame(cov_scaler.transform(train_X_cov))
refinement_X_cov = pd.DataFrame(cov_scaler.transform(refinement_X_cov))



In [23]:
from time import time
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import InputLayer, Layer, InputSpec, Dense, Dropout, LeakyReLU, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

from tensorflow.keras.constraints import UnitNorm, Constraint

def func_api_get_autoencoder(dims, embedding_dim=10, act='relu', kernel_regularizer=None):
    """
    Fully connected, symmetric auto-encoder built with the Keras functional API.

    Arguments:
        dims: list of layer widths. dims[0] is the input (and output) dimension;
            dims[1:] are the widths of the encoder layers placed before the
            embedding layer. The decoder mirrors dims[1:] in reverse order.
            With a single-element list the input feeds the embedding directly.
        embedding_dim: number of units in the bottleneck layer named 'embedding'.
        act: activation passed to every Dense layer, including the embedding
            and the output layer.
        kernel_regularizer: regularizer attached to every Dense layer except
            the output layer ('decoder_0').
    return:
        A single Model mapping the input to its reconstruction. No separate
        encoder model is returned; the bottleneck can be reached through the
        layer named 'embedding'.
    """
    n_stacks = len(dims) - 1

    # input
    x = Input(shape=(dims[0],), name='input')
    h = x

    # internal layers in encoder (the loop is simply empty when len(dims) == 1)
    for i in range(n_stacks):
        h = Dense(dims[i + 1], activation=act, kernel_regularizer=kernel_regularizer, name='encoder_%d' % i)(h)
        h = BatchNormalization()(h)

    # hidden layer, features are extracted from here
    h = Dense(embedding_dim, activation=act, kernel_regularizer=kernel_regularizer, name='embedding')(h)
    h = BatchNormalization()(h)

    y = h
    # internal layers in decoder, mirroring the encoder widths
    for i in range(n_stacks, 0, -1):
        y = Dense(dims[i], activation=act, kernel_regularizer=kernel_regularizer, name='decoder_%d' % i)(y)
        y = BatchNormalization()(y)

    # output
    # NOTE(review): the output layer also uses `act`; with the default 'relu'
    # the reconstruction can never be negative. Confirm this is intended if
    # the targets can take negative values (e.g. standardized inputs).
    y = Dense(dims[0], activation=act, name='decoder_0')(y)

    return Model(inputs=x, outputs=y)

In [24]:
def train_test_model(hparams, run_dir, summary_writer):
    """
    Train one denoising auto-encoder for a single hyper-parameter combination.

    The model is fit on noisy versions of the module-level `train_X_cov` and
    validated on noisy versions of `refinement_X_cov`, using a hand-written
    training loop that drives EarlyStopping and ModelCheckpoint through a
    CallbackList. Per-epoch losses are written with `tf.summary.scalar` under
    the tags METRIC_1 / METRIC_2, so a default summary writer must be active
    when this function is called.

    Arguments:
        hparams: dict keyed by the HP_* HParam objects (batch size, l2,
            noise factor, optimizer name, depth).
        run_dir: log directory of this trial; weights of the best epoch are
            checkpointed to run_dir + '.ckpt'.
        summary_writer: accepted for interface compatibility; not used here.
    return:
        Mean squared reconstruction error of the final model on the
        un-noised `train_X_cov`, as a NumPy scalar.
    raises:
        ValueError: if hparams[HP_OPTIMIZER] is not 'adam', 'sgd' or 'rmsprop'.
    """
    batch_size = hparams[HP_BATCH_SIZE]

    train_dataset = tf.data.Dataset.from_tensor_slices((train_X_cov, train_X_cov))
    train_dataset = train_dataset.shuffle(buffer_size=25).batch(batch_size, drop_remainder=True)

    test_dataset = tf.data.Dataset.from_tensor_slices((refinement_X_cov, refinement_X_cov))
    test_dataset = test_dataset.shuffle(buffer_size=25).batch(batch_size, drop_remainder=True)

    logs = {}
    verbose = False
    noise_factor = hparams[HP_NOISE_FACTOR]

    loss_fn = tf.keras.losses.MeanSquaredError('auto')

    train_loss_tracker = tf.keras.metrics.MeanSquaredError('train_loss')
    val_loss_tracker = tf.keras.metrics.MeanSquaredError(name='val_loss')

    # Layer widths shrink as INPUT_SHAPE // 1, INPUT_SHAPE // 2, ... up to the requested depth.
    model = func_api_get_autoencoder([INPUT_SHAPE//depth_factor for depth_factor in range(1, hparams[HP_DEPTH]+1)],
                                     kernel_regularizer=l2(hparams[HP_L2]))

    optimizer_name = hparams[HP_OPTIMIZER]
    if optimizer_name == 'adam':
        optimizer = Adam()
    elif optimizer_name == 'sgd':
        optimizer = SGD()
    elif optimizer_name == 'rmsprop':
        optimizer = RMSprop()
    else:
        # Fail early and clearly instead of hitting an unbound `optimizer` later.
        raise ValueError("Unsupported optimizer: %r" % (optimizer_name,))

    @tf.function
    def train_step(x, y):
        with tf.GradientTape() as tape:
            logits = model(x, training=True)
            loss_value = loss_fn(y, logits)
            # In a custom training loop the layer regularization penalties are
            # not added automatically; without this the l2 hyper-parameter
            # would have no effect on the gradients.
            if model.losses:
                loss_value += tf.add_n(model.losses)

        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        # The tracked metric stays the plain reconstruction MSE (no penalty term).
        train_loss_tracker.update_state(y, logits)

        return loss_value

    @tf.function
    def test_step(x, y):
        val_logits = model(x, training=False)
        val_loss_tracker.update_state(y, val_logits)

    _callbacks = [EarlyStopping(monitor='val_loss', mode='min', patience=100, restore_best_weights=True),
                  ModelCheckpoint(verbose=0, filepath=run_dir + '.ckpt', save_weights_only=True, monitor='val_loss', mode='min', save_best_only=True)]

    callbacks = CallbackList(_callbacks, add_history=True, model=model)

    callbacks.on_train_begin(logs=logs)

    for epoch in range(epochs):
        callbacks.on_epoch_begin(epoch, logs=logs)

        if verbose:
            print("\nStart of epoch %d" % (epoch,))

        # Iterate over the batches of the training dataset.
        # NOTE(review): CallbackList.on_batch_begin/on_batch_end appear to be
        # aliases of the train-batch hooks, so calling both may be redundant;
        # kept as in the original loop. Confirm before removing.
        for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
            callbacks.on_batch_begin(step, logs=logs)
            callbacks.on_train_batch_begin(step, logs=logs)

            # Add noise to the input to feed to Denoising encoder model.
            x_noisy_train = x_batch_train + noise_factor*tf.random.normal(shape=tf.shape(x_batch_train), mean=0.0, stddev=1.0, dtype=tf.float64)
            # NOTE(review): clipping to [0, 1] discards every value outside
            # that range; confirm this is intended for inputs that are not
            # scaled to [0, 1].
            x_noisy_train = tf.clip_by_value(x_noisy_train, 0.0, 1.0)

            train_step(x_noisy_train, y_batch_train)

            callbacks.on_train_batch_end(step, logs=logs)
            callbacks.on_batch_end(step, logs=logs)

        tf.summary.scalar(METRIC_1, train_loss_tracker.result(), step=epoch)

        # Display metrics at the end of each epoch.
        train_loss = train_loss_tracker.result()

        if verbose:
            print("Training acc over epoch: %.4f" % (float(train_loss),))

        # Reset training metrics at the end of each epoch
        train_loss_tracker.reset_states()

        # Iterate over the batches of the validation dataset.
        for step, (x_batch_val, y_batch_val) in enumerate(test_dataset):
            callbacks.on_batch_begin(step, logs=logs)
            callbacks.on_test_batch_begin(step, logs=logs)

            # Add noise to the input to feed to Denoising encoder model.
            x_noisy_val = x_batch_val + noise_factor*tf.random.normal(shape=tf.shape(x_batch_val), mean=0.0, stddev=1.0, dtype=tf.float64)
            x_noisy_val = tf.clip_by_value(x_noisy_val, 0.0, 1.0)

            test_step(x_noisy_val, y_batch_val)

            callbacks.on_test_batch_end(step, logs=logs)
            callbacks.on_batch_end(step, logs=logs)

        tf.summary.scalar(METRIC_2, val_loss_tracker.result(), step=epoch)

        val_loss = val_loss_tracker.result()

        val_loss_tracker.reset_states()

        if verbose:
            print("Validation acc: %.4f" % (float(val_loss),))

        # EarlyStopping and ModelCheckpoint both monitor 'val_loss' from logs.
        logs['val_loss'] = val_loss

        callbacks.on_epoch_end(epoch, logs=logs)

        # Set by EarlyStopping once patience is exhausted.
        if model.stop_training:
            break

    callbacks.on_train_end(logs=logs)

    return loss_fn(train_X_cov, model.predict(train_X_cov)).numpy()


def run(run_dir, hparams):
    """
    Execute one hyper-parameter trial and log it under run_dir.

    Arguments:
        run_dir: directory that receives the summaries of this trial.
        hparams: dict keyed by HParam objects with the values for this trial.
    """
    trial_writer = tf.summary.create_file_writer(run_dir)
    with trial_writer.as_default():
        # record the values used in this trial
        hp.hparams(hparams)
        train_test_model(hparams, run_dir, trial_writer)

In [1]:

def _iter_trial_hparams():
    """
    Lazily yield one hparams dict per trial.

    Three batch sizes are drawn once; three l2 values are re-drawn for every
    batch size and three noise factors for every l2 value. Each sampled
    triple is combined with every discrete depth and optimizer choice.
    """
    batch_domain = HP_BATCH_SIZE.domain
    l2_domain = HP_L2.domain
    noise_domain = HP_NOISE_FACTOR.domain

    for sampled_batch in np.random.randint(batch_domain.min_value, batch_domain.max_value, 3):
        for sampled_l2 in np.random.uniform(l2_domain.min_value, l2_domain.max_value, 3):
            for sampled_noise in np.random.uniform(noise_domain.min_value, noise_domain.max_value, 3):
                for depth_choice in HP_DEPTH.domain.values:
                    for optimizer_choice in HP_OPTIMIZER.domain.values:
                        yield {
                            HP_BATCH_SIZE: sampled_batch,
                            HP_L2: sampled_l2,
                            HP_NOISE_FACTOR: sampled_noise,
                            HP_OPTIMIZER: optimizer_choice,
                            HP_DEPTH: depth_choice,
                        }


# All trials of this split are logged below one common directory.
sweep_root = 'logs/hparam_tuning_' + str(SPLIT_IDX) + '/'

session_num = 0
for hparams in _iter_trial_hparams():
    run_name = "run-%d" % session_num
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})
    run(sweep_root + run_name, hparams)
    session_num += 1

In [11]:
# !ls /tmp/.tensorboard-info/
# !rm /tmp/.tensorboard-info/pid-21090.info
# %tensorboard --logdir logs/hparam_tuning_0