In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q tensorflow
!pip install -q tensorflow_ranking
!pip install tensorboard-plugin-profile


[K     |████████████████████████████████| 462 kB 5.6 MB/s 
[K     |████████████████████████████████| 141 kB 5.4 MB/s 
[?25hCollecting tensorboard-plugin-profile
  Downloading tensorboard_plugin_profile-2.8.0-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 7.8 MB/s 
Collecting gviz-api>=1.9.0
  Downloading gviz_api-1.10.0-py2.py3-none-any.whl (13 kB)
Installing collected packages: gviz-api, tensorboard-plugin-profile
Successfully installed gviz-api-1.10.0 tensorboard-plugin-profile-2.8.0


In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import os

#PATH = os.getcwd()
PATH = '/content/drive/Shareddrives/Master Tesis/Tesis/'

In [None]:
import tensorflow_ranking as tfr
import tensorflow as tf
from tensorflow_serving.apis import input_pb2
from tensorboard.plugins.hparams import api as hp

final_table = pd.read_csv(os.path.join(PATH, 'Tables', 'final_table.csv'))
final_table['Query'] = pd.to_datetime(final_table['Query']).dt.date
final_table

Unnamed: 0,Query,Ticker,big_log_ret,big_RCV,big_RVT,big_positivePartscr,big_negativePartscr,big_splogscr,big_linscr
0,2012-01-08,AAL,0.063980,8.476000,0.000280,0.020240,0.013340,0.010760,54.733580
1,2012-01-08,AAPL,-0.006151,13.162167,0.003400,0.016050,0.015650,0.020333,48.380133
2,2012-01-08,ABC,-0.020684,0.404000,0.000760,0.027820,0.012980,0.017000,65.547120
3,2012-01-08,ABT,0.000863,-24.607200,0.000320,0.010820,0.018540,-0.012980,27.375300
4,2012-01-08,AMZN,-0.023212,-3.370000,0.001100,0.016180,0.020000,-0.024640,40.688020
...,...,...,...,...,...,...,...,...,...
41819,2021-11-28,UPS,-0.022512,-42.160333,0.000317,0.018200,0.024717,-0.008817,46.677617
41820,2021-11-28,USB,-0.033782,-28.514833,0.000483,0.020517,0.010367,0.037033,68.229333
41821,2021-11-28,VZ,-0.007363,-22.016000,0.000686,0.018957,0.036629,-0.066443,28.958114
41822,2021-11-28,WFC,-0.014140,-37.779000,0.000983,0.004283,0.017333,-0.061550,15.077883


## FIXED VARIABLES FOR THE NEURAL NETWORK

In [None]:
# Store the paths to files containing training and test instances.
_TRAIN_DATA_PATH = PATH +"Dataset-tfrecords/70v30Split/train.tfrecord"
_VALID_DATA_PATH =  PATH +"Dataset-tfrecords/70v30Split/test.tfrecord"

# The maximum number of documents per query in the dataset.
# Document lists are padded or truncated to this size.
_LIST_SIZE = final_table.groupby("Query").count().max()[0] #107 documents

# The document relevance label in the tf-records.
_LABEL_FEATURE_NAME = "rel"
_NUM_FEATURES = final_table.shape[1] - 3
_NAME_FEATURES = list(final_table.columns[3:]) #Name of the doc features ("doc id" and "rel" are not features)

# Padding labels are set negative so that the corresponding examples can be
# ignored in loss and metrics.
_PADDING_LABEL = -1

# Parameters to the scoring function.
_BATCH_SIZE = 16

#Loss function for listwise 
_loss_obj = tfr.keras.losses.get(
    tfr.losses.RankingLossKey.LIST_MLE_LOSS)

# Location of model directory and number of training steps.
_MODEL_DIR = f"./Models/model_{dt.datetime.now().strftime('%m-%d-%Y_%H-%M-%S')}"

#Location of the log of the hyperparameter tuning DO NOT CHANGE
logdir = f"/content/logs/hparam_tuning/"

# setting as shell env for tensorboard stuff
os.environ["models_dir"] = _MODEL_DIR

In [None]:
'''Specifying Features via Feature Columns: (see https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html)

Feature Columns are TensorFlow abstractions that are used to capture rich information about each feature.
It allows for easy transformations for a diverse range of raw features and for interfacing with Estimators.

Consistent with our input formats for ranking, such as ELWC format, we create feature columns for context features 
and example features.
'''

def create_feature_columns():
    # We dont have context featuresin in our datasets (query id is not a feature)
    context_feature_columns = {}
    
    feature_names = _NAME_FEATURES
    example_feature_columns = {
        name:
        tf.feature_column.numeric_column(name, shape=(1,), default_value=0.0)
        for name in feature_names}
    
    return context_feature_columns, example_feature_columns

In [None]:
def create_dataset_from_tfrecords(input_path:str,
                                  batch_sz:int,
                                  list_sz:int,
                                  shuffle:bool = True,
                                  num_epochs:int = None,
                                  data_format:str = "ELWC",
                                  compression_type:str = ''):

    context_feature_columns, example_feature_columns = create_feature_columns()


    context_feature_spec = tf.feature_column.make_parse_example_spec(
      context_feature_columns.values())
    label_column = tf.feature_column.numeric_column(
      _LABEL_FEATURE_NAME, dtype=tf.float32, default_value=_PADDING_LABEL)
    example_feature_spec = tf.feature_column.make_parse_example_spec(
      list(example_feature_columns.values()) + [label_column])

    _reader_arg_list = []
    if compression_type:
        assert compression_type in ["", "GZIP","ZLIB"]
        _reader_arg_list = [compression_type]


    dataset = tfr.data.build_ranking_dataset(
      file_pattern=input_path,
      data_format=tfr.data.ELWC,
      batch_size=batch_sz,
      list_size=list_sz,
      context_feature_spec=context_feature_spec,
      example_feature_spec=example_feature_spec,
      reader=tf.data.TFRecordDataset,
      reader_args= _reader_arg_list,
      shuffle=shuffle,
      num_epochs=num_epochs,
      )
    
    def _log1p_transform(features):
        '''
        computes elementwise log_e(|x|)*sign(x)
        '''
        transformed_feats = {
            f:tf.math.multiply(
                tf.math.log1p(
                    tf.math.abs(features[f])
                    ),
                tf.math.sign(features[f])
                )
            for f in features}
        return transformed_feats

    def _split_label_and_transform_features(features):
        label = tf.squeeze(features.pop(_LABEL_FEATURE_NAME), axis=2)
        label = tf.cast(label, tf.float32)
        features = features #_log1p_transform(features)

        return features, label

    dataset = dataset.map(_split_label_and_transform_features)
    return dataset

In [None]:
#assert tf.test.gpu_device_name() != '', "GPU not detected, training is much faster GPU/TPU instance of colab"

train_dataset = create_dataset_from_tfrecords(_TRAIN_DATA_PATH,
                                              _BATCH_SIZE,
                                              _LIST_SIZE,
                                              compression_type="")

vali_dataset = create_dataset_from_tfrecords(_VALID_DATA_PATH,
                                             _BATCH_SIZE,
                                             _LIST_SIZE,
                                             shuffle=False,
                                             num_epochs=1, 
                                             compression_type="")

## VARIABLES TO CROSS VALIDATE

In [19]:
#Number of hidden nodes
HP_NUM_UNITS_DIM = hp.HParam('num_units_Dim', hp.Discrete([5, 10,15,20]))
#Learning rate
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([0.001, 0.005, 0.01]))
#Number of steps per epoch
HP_STEPS = hp.HParam('steps', hp.Discrete([100,200]))
#Number of epochs
HP_EPOCHS = hp.HParam('epochs', hp.Discrete([50, 100, 200]))
METRIC = 'ngdc'

#Creates the file in the log directory of the configuration of the hyperparameters
# and the metrics
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_NUM_UNITS_DIM, HP_LEARNING_RATE, HP_STEPS, HP_EPOCHS],
    metrics=[hp.Metric(METRIC, display_name='NGDCMetric')],
  )

In [28]:
def train_test_model(hparams):
    '''
    Creates the scorer function (the neural net), compiles and fits it. It 
    allows us to use this function in a loop to iterate over all combinations
    of hyperparameters. 

    It runs one model at a time, looping is requiered. This format also allows 
    for the hyperparameters to be stored in a way as to be able to use 
    tensorboard.

    '''


    context_feature_columns, example_feature_columns = create_feature_columns()
    # Using a Canned Network
    ranking_network = tfr.keras.canned.DNNRankingNetwork(
      context_feature_columns=context_feature_columns,
      example_feature_columns=example_feature_columns,
      hidden_layer_dims=[hparams[HP_NUM_UNITS_DIM]],
      activation=tf.nn.relu,
      )
    
    #Compiles the model with the fixed loss and metric but with variable learning rate
    ranker = tfr.keras.model.create_keras_model(network=ranking_network,
                                            loss=_loss_obj,
                                            metrics=[tfr.keras.metrics.NDCGMetric(topn = 27)],
                                            optimizer=tf.keras.optimizers.Adagrad(learning_rate=hparams[HP_LEARNING_RATE]),
                                            size_feature_name=None)
    
    #Fits the complied model with validation set (test set in our case) while
    #allowing for variable epochs and steps. Callbacks to the tensorboard directory
    hist = ranker.fit(train_dataset,
            validation_data=vali_dataset,
            steps_per_epoch=hparams[HP_STEPS],
            epochs=hparams[HP_EPOCHS],
            validation_steps=1,
            callbacks=[
        tf.keras.callbacks.TensorBoard(logdir),  # log metrics
        hp.KerasCallback(logdir, hparams),  # log hparams
    ],)
    
    # this will print a dictionary object, now you need to grab the metrics / score you're looking for
    print(hist.history) 
    # Obtains the validation metric which we will use to select model. It is stored
    #in a dictionary where we extract the value vector (one value per epoch) and
    # we return the mas 
    loss = next(v for k,v in hist.history.items() if k.startswith('val_lo'))
    ndcg = next(v for k,v in hist.history.items() if k.startswith('val_ndcg'))
    for idx, val in enumerate(loss):
        if val == min(loss):
            best_epoch = idx +1



    return min(loss), best_epoch, ndcg[best_epoch]



## Evaluate model performance

In [26]:
session_num = 0
# Iterates over all combinations and stores the in the log directory where it can
# later be accessed by the tensorboard
cv = {}
for num_units in HP_NUM_UNITS_DIM.domain.values:
    for lr in HP_LEARNING_RATE.domain.values:
        for step in HP_STEPS.domain.values:
            for epoch in HP_EPOCHS.domain.values:
                hparams = {
                    HP_NUM_UNITS_DIM: num_units,
                    HP_LEARNING_RATE: lr,
                    HP_STEPS: step,
                    HP_EPOCHS: epoch,
                }
                run_name = "run-%d" % session_num
                loss, best_epoch, ndcg = train_test_model(hparams)
                cv[run_name] = {h.name: hparams[h] for h in hparams}
                cv[run_name]["ndcg"] = ndcg
                cv[run_name]["val_loss"] = loss
                cv[run_name]["best_epoch"] = best_epoch
                print('--- Starting trial: %s' % run_name)
                print({h.name: hparams[h] for h in hparams})
                #run('/logs/hparam_tuning/' + run_name, hparams)
                session_num += 1

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
{'loss': [350.29608154296875, 344.2957458496094, 342.247314453125, 340.7732849121094, 340.2157287597656, 339.1365661621094, 339.20758056640625, 338.4549865722656, 337.604248046875, 336.6279296875, 337.3997802734375, 337.1488037109375, 336.8110046386719, 336.51678466796875, 336.581787109375, 337.0479431152344, 336.3515930175781, 335.9701843261719, 335.4295349121094, 336.0412902832031, 334.8956298828125, 335

In [27]:
pd.DataFrame(cv)

Unnamed: 0,run-0
best_epoch,15.0
epochs,50.0
learning_rate,0.004
ndcg,0.559631
num_units_Dim,15.0
steps,100.0
val_loss,334.352051
