In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q tensorflow
!pip install -q tensorflow_ranking

In [181]:
import numpy as np
import pandas as pd
import datetime as dt
import os

PATH = os.getcwd()
#PATH = '/content/drive/Shareddrives/Master Tesis/Tesis'

In [182]:
import tensorflow_ranking as tfr
import tensorflow as tf
from tensorflow_serving.apis import input_pb2

final_table = pd.read_csv(os.path.join(PATH, 'Tables', 'final_table.csv'))
final_table['Query'] = pd.to_datetime(final_table['Query']).dt.date
final_table

Unnamed: 0,Query,Ticker,big_log_ret,big_RCV,big_RVT,big_positivePartscr,big_negativePartscr,big_splogscr,big_linscr,big_lag1_log_ret,big_lag4_log_ret,big_lag1_month_log_ret
0,2012-01-08,AAL,0.063980,8.476000,0.000280,0.020240,0.013340,0.010760,54.733580,0.099426,0.094986,-0.003565
1,2012-01-08,AAPL,-0.006150,13.162167,0.003400,0.016050,0.015650,0.020333,48.380133,0.042066,-0.032534,0.070567
2,2012-01-08,ABC,-0.020683,0.404000,0.000760,0.027820,0.012980,0.017000,65.547120,0.036953,0.010394,0.059249
3,2012-01-08,ABT,0.000864,-24.607200,0.000320,0.010820,0.018540,-0.012980,27.375300,-0.006602,0.005847,0.023364
4,2012-01-08,AMZN,-0.023212,-3.370000,0.001100,0.016180,0.020000,-0.024640,40.688020,0.053483,-0.062913,-0.055493
...,...,...,...,...,...,...,...,...,...,...,...,...
42050,2021-11-28,UPS,-0.022512,-42.160333,0.000317,0.018200,0.024717,-0.008817,46.677617,-0.015323,-0.024901,-0.034095
42051,2021-11-28,USB,-0.033782,-28.514833,0.000483,0.020517,0.010367,0.037033,68.229333,-0.001564,0.005287,-0.048707
42052,2021-11-28,VZ,-0.007363,-22.016000,0.000686,0.018957,0.036629,-0.066443,28.958114,0.018313,-0.014255,-0.022713
42053,2021-11-28,WFC,-0.014140,-37.779000,0.000983,0.004283,0.017333,-0.061550,15.077883,-0.008021,-0.010320,-0.050985


In [264]:
# Store the paths to files containing training and test instances.
_TRAIN_DATA_PATH = "./Dataset-tfrecords/70v30Split/train.tfrecord"
_VALID_DATA_PATH =  "./Dataset-tfrecords/70v30Split/test.tfrecord"

# The maximum number of documents per query in the dataset.
# Document lists are padded or truncated to this size.
_LIST_SIZE = 108 #For each query we will be taking the max number of docs/companies - 108

# The document relevance label in the tf-records.
_LABEL_FEATURE_NAME = "rel"
_NAME_FEATURES = ['big_RCV', 'big_RVT', 'big_positivePartscr', 'big_negativePartscr',
                  'big_splogscr', 'big_linscr', 'big_lag1_log_ret', 'big_lag4_log_ret',
                  'big_lag1_month_log_ret'] #Name of the doc features ("doc id" and "rel" are not features)
_NUM_FEATURES = len(_NAME_FEATURES)

# Padding labels are set negative so that the corresponding examples can be
# ignored in loss and metrics.
_PADDING_LABEL = -1

# Learning rate for optimizer.
_LEARNING_RATE = 0.05

# Parameters to the scoring function.
_BATCH_SIZE = 32
_DROPOUT_RATE = 0.5

### Creating input pipeline

In [453]:
def create_feature_columns():
    '''
    This function specifies Features via Feature Columns: (see https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html)

    Feature Columns are TensorFlow abstractions that are used to capture rich information about each feature.
    It allows for easy transformations for a diverse range of raw features and for interfacing with Estimators.
    Specifying the feature columns (and optionally, their transformaitons) is necessary to provide the parsing specifications.

    Consistent with our input formats for ranking, such as ELWC format, we create feature columns for context features 
    and example features.
    '''
    # We dont have context features in in our datasets (query id is not a feature)
    context_feature_columns = {}
    
    #Specifying the features that we will take from our ELWC, and their transformations (in our case all are numeric, so no transformations)
    feature_names = _NAME_FEATURES
    example_feature_columns = {
        name:
        tf.feature_column.numeric_column(name, shape=(1,), default_value=0.0)
        for name in feature_names}
    
    return context_feature_columns, example_feature_columns


def create_dataset_from_tfrecords(input_path:str,
                                  batch_sz:int,
                                  list_sz:int,
                                  shuffle:bool = False,
                                  num_epochs:int = None,
                                  data_format:str = "ELWC",
                                  compression_type:str = ''):
    '''
    Function to read ELWC tfrecords and convert them into a Ranking Dataset according to the parsing specs described in the next lines
    '''
    
    #Specify the TensorFlow abstractions of each feature inside context and example dictionaries
    context_feature_columns, example_feature_columns = create_feature_columns()

    #Create parsing spec dictionary from input feature_columns. The returned dictionary can be used as arg in
    #tfr.dataset.build_ranking_dataset and specifies how it should parse the document in "file_pattern" argument
    context_feature_spec = tf.feature_column.make_parse_example_spec(context_feature_columns.values())
    label_column = tf.feature_column.numeric_column(_LABEL_FEATURE_NAME, dtype=tf.float32, default_value=_PADDING_LABEL)
    example_feature_spec = tf.feature_column.make_parse_example_spec(list(example_feature_columns.values()) + [label_column])
    
    #Define the "reader_args" that we'll pass to tfr.dataset.build_ranking_dataset to specify the compression type
    _reader_arg_list = []
    if compression_type:
        assert compression_type in ["", "GZIP","ZLIB"]
        _reader_arg_list = [compression_type]

    #Build Ranking Dataset
    dataset = tfr.data.build_ranking_dataset(
      file_pattern=input_path,
      data_format=tfr.data.ELWC,
      batch_size=batch_sz, #Rate at which we read our data into our model from file
      list_size=list_sz, #Amount of docs we will take for each query - in our case we take all companies
      context_feature_spec=context_feature_spec,
      example_feature_spec=example_feature_spec,
      reader=tf.data.TFRecordDataset,
      reader_args= _reader_arg_list,
      shuffle=shuffle, #Whether to shuffle the examples before taking them
      num_epochs=num_epochs,
      )
    
    #Define additional transformations we might want to apply to our features
    def _log1p_transform(features):
        '''
        computes elementwise log_e(|x|)*sign(x)
        '''
        transformed_feats = {
            f:tf.math.multiply(
                tf.math.log1p(
                    tf.math.abs(features[f])
                    ),
                tf.math.sign(features[f])
                )
            for f in features}
        return transformed_feats

    def _split_label_and_transform_features(features):
        label = tf.squeeze(features.pop(_LABEL_FEATURE_NAME), axis=2)
        label = tf.cast(label, tf.float32)
        features = features #_log1p_transform(features)

        return features, label

    dataset = dataset.map(_split_label_and_transform_features)
    return dataset

In [452]:
train_dataset = create_dataset_from_tfrecords(_TRAIN_DATA_PATH,
                                              _BATCH_SIZE,
                                              _LIST_SIZE,
                                              compression_type="")

vali_dataset = create_dataset_from_tfrecords(_VALID_DATA_PATH,
                                             _BATCH_SIZE,
                                             _LIST_SIZE,
                                             shuffle=False,
                                             num_epochs=1, 
                                             compression_type="")

### Defining Loss function and evaluation Metrics

In [None]:
#This loss and the evaluation functions will be passed to Keras when we build our Model

#Loss:
#Different Losses knowing that the softmax loss is representative of the ListNet apporach
_loss_obj = tfr.keras.losses.get(tfr.losses.RankingLossKey.SOFTMAX_LOSS) # Contains all ranking metrics, including NDCG @ {1, 3, 5, 10}.
#_loss_obj = tfr.keras.losses.get(tfr.losses.RankingLossKey.UNIQUE_SOFTMAX_LOSS)
#_loss_obj = tfr.keras.losses.get(tfr.losses.RankingLossKey.LIST_MLE_LOSS)

#Evaluation metrics:
def _make_eval_metric_fns():
    """Returns a list of ranking metrics for the keras ranker"""
    metric_fns = [tfr.keras.metrics.get(**kwargs) 
                        for kwargs in [dict(key="ndcg", topn=topn, 
                                        name="metric/ndcg_{}".format(topn)) 
                                            for topn in [1, 3, 5, 10]]
                ]
    return metric_fns

default_metrics = _make_eval_metric_fns()

### Create TensorFlow model - Using pre-made (canned) Estimator

In [225]:
# tf.feature_columns specifications can be passed to the "feature_columns" argument of our DNN Estimator, when we instanciate it 
context_feature_columns, example_feature_columns = create_feature_columns()

# Using a Canned Network - (See to know about Estimators: https://developers.googleblog.com/2017/09/introducing-tensorflow-datasets.html)
ranking_network = tfr.keras.canned.DNNRankingNetwork(
      context_feature_columns=context_feature_columns,
      example_feature_columns=example_feature_columns,
      hidden_layer_dims=[64, 24, 10],
      activation=tf.nn.relu,
      dropout=_DROPOUT_RATE,
      use_batch_norm=True,
      batch_norm_moment=0.4)


### Putting It All Together in a Model Builder

In [236]:
# Build ranker as a Functional Keras model.
ranker = tfr.keras.model.create_keras_model(network=ranking_network,
                                            loss=_loss_obj,
                                            metrics=default_metrics,
                                            optimizer=tf.keras.optimizers.Adagrad(learning_rate=_LEARNING_RATE),
                                            size_feature_name=None)


### Additional run config parameters for our Estimator

In [237]:
#Not necessary unless you want to create an Estimator out of the model we will create in the next cell by using tfr.keras.estimator.model_to_estimator
run_config = tf.estimator.RunConfig(
      model_dir=_MODEL_DIR,
      keep_checkpoint_max=10,
      save_checkpoints_secs=200)

In [243]:
# Directory where we save the log records of the training and validation.
_MODEL_DIR = f"./Models/model_{dt.datetime.now().strftime('%Y-%m-%d_%H-%M')}"


# setting as shell env for tensorboard stuff
os.environ["models_dir"] = _MODEL_DIR

In [245]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(_MODEL_DIR)

ranker.fit(train_dataset,
           validation_data=vali_dataset,
           steps_per_epoch=100,
           epochs=100,
           validation_steps=1,
           callbacks=[tensorboard_callback],
           verbose='auto')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1f05d4eb070>

## Evaluate model performance

#### TensorBoard for Train & Eval tracking

In [242]:
%load_ext tensorboard
%tensorboard --logdir=./Models #--port 25952

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 27076), started 1 day, 13:55:20 ago. (Use '!kill 27076' to kill it.)

In [146]:
#The tensorboard extension is already loaded. To reload it, use:
%reload_ext tensorboard

### Testing the ranking model

The model returns as predictions an array of size (num_queries,list_size), where each row contains the scores of each document in the same order they were inputted by the data pipeline (note that if the num_docs < list_size, the scores of the last docs are dummy docs that were padded).
I order to associate each score with the correct doc (company), we will parse the TFRecords, and obtain the list of companies for each query (date), in the same order that were inputted to the predict() method.

In [459]:
#Let's parse the TFRecords in order to obtain the id of each query + the list of ids of the documents in each query
raw_dataset = tf.data.TFRecordDataset([_VALID_DATA_PATH])

query_docs_list = [] #A list that will be populated with tuples (query_id, list_of_docs)
for raw_record in raw_dataset.take(-1): #take(-1) takes all the records in raw_dataset
    ELWC = input_pb2.ExampleListWithContext()
    v = ELWC.FromString(raw_record.numpy()) #v is composed by v.context and a set of examples in v.examples
    
    query_id = v.context.features.feature['qid'].bytes_list.value[0].decode('UTF-8')
    docs_list = []
    for e in v.examples:
        doc_id = e.features.feature['doc'].bytes_list.value[0].decode('UTF-8')
        docs_list.append(doc_id)
        
    query_docs_list.append((query_id, docs_list))

In [463]:
predictions = ranker.predict(vali_dataset,
                             batch_size=None,
                             verbose='auto',
                             steps=None,
                             callbacks=None)

predictions_record = [] #List that will contain  the prediction tuples (date, ranking_of_companies)
queries = []
docs_per_query = []
for idx, query_docs in enumerate(query_docs_list):
    query, docs = query_docs
    scores = list(predictions[idx])
    dict_doc_scr = dict(zip(docs, scores))
    sorted_dict = dict(sorted(dict_doc_scr.items(), key=lambda item: item[1], reverse=True))
    queries.append(query)
    docs_per_query.append(list(sorted_dict.keys()))

In [467]:
result_dict = {'date': queries, 'ranking': docs_per_query}

df = pd.DataFrame.from_dict(result_dict, orient='index')
df = df.transpose()
df.to_csv('ranking_results.csv', index=False, header=True,  encoding='utf-8')