# Session-based Recs with Transformers4Rec: Transformer

This notebook follows the step-by-step Transformers4Rec tutorial:
https://nvidia-merlin.github.io/Transformers4Rec/main/examples/tutorial/index.html

## Imports

In [1]:
import os
import glob
import pandas as pd
import numpy as np

from transformers4rec import tf as tr
import tensorflow as tf
from transformers4rec.tf.ranking_metric import NDCGAt, RecallAt

## Instantiates Schema object from schema file

In [2]:
# Directory holding the schema file; overridable through the INPUT_DATA_DIR environment variable.
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", '../data/')
# Directory holding the per-day session folders; overridable through the OUTPUT_DIR
# environment variable. The training loop later reads its parquet files from here.
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "../data/sessions_by_day")
# Names of the schema features the model is trained on.
chosen_features = ['product_id-list_seq']
from merlin_standard_lib import Schema
# Load the schema from INPUT_DATA_DIR/schema.pb and keep only the chosen features;
# the resulting object is passed to the TabularSequenceFeatures class.
SCHEMA_PATH = os.path.join(INPUT_DATA_DIR, 'schema.pb')
schema = Schema().from_proto_text(SCHEMA_PATH)
schema = schema.select_by_name(chosen_features)

## Define Input Block

Use masked language modeling (MLM) as the training method.

In [3]:
# Input dimensions: maximum session length and the width of the input block's output.
sequence_length, d_model = 20, 192
# Build the input module from the schema. It processes the tabular sequence
# features and prepares masked inputs using the 'mlm' masking scheme.
inputs = tr.TabularSequenceFeatures.from_schema(
    schema,
    max_sequence_length = sequence_length,
    d_output = d_model,
    masking = 'mlm'
)

2021-12-22 13:00:58.159201: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Build Transformer Block

In [55]:
class NextItemPredictionTask_temp(tr.NextItemPredictionTask):
    """Next-item prediction task that prints the masked targets while computing metrics.

    Debugging variant of ``tr.NextItemPredictionTask``: ``calculate_metrics`` is
    overridden so the masked target tensor is printed on every call before the
    metric states are updated.
    """

    def calculate_metrics(
        self, predictions, targets=None, sample_weight=None, forward=True, loss=None
    ):
        """Update every metric attached to the task and return the update ops.

        Args:
            predictions: Task inputs; passed through ``self(...)`` when ``forward``
                is true, otherwise used as predictions directly.
            targets: Labels, or a dict of labels from which ``self.target_name``
                is selected when that name is set.
            sample_weight: Optional weights forwarded to each ``update_state`` call.
            forward: When true, run the task's forward pass and derive the flattened,
                padding-free targets (taken from the masking module if there is one).
            loss: Optional precomputed loss; computed with ``self.loss`` when omitted.

        Returns:
            A list with the result of every ``update_state`` call, in the order
            eval, prediction, label and loss metrics.
        """
        if isinstance(targets, dict) and self.target_name:
            targets = targets[self.target_name]

        if forward:
            predictions = self(predictions)
            # retrieve labels from masking
            if self.masking:
                targets = self.masking.masked_targets
            # flatten labels and remove padding index
            targets = tf.reshape(targets, -1)
            non_pad_mask = targets != self.padding_idx
            targets = tf.boolean_mask(targets, non_pad_mask)

        # Debug output: show the masked targets of the current batch. Guarded because
        # the code above treats `self.masking` as optional; printing unconditionally
        # would raise AttributeError when no masking module is configured.
        if self.masking:
            print(self.masking.masked_targets)

        update_ops = []

        for metric in self.eval_metrics:
            update_ops.append(
                metric.update_state(y_true=targets, y_pred=predictions, sample_weight=sample_weight)
            )

        for metric in self.prediction_metrics:
            update_ops.append(metric.update_state(predictions, sample_weight=sample_weight))

        for metric in self.label_metrics:
            update_ops.append(metric.update_state(targets, sample_weight=sample_weight))

        for metric in self.loss_metrics:
            # Compare against None explicitly: truth-testing a tensor would treat a
            # legitimate zero loss as "missing" and is not allowed on symbolic tensors.
            if loss is None:
                loss = self.loss(y_true=targets, y_pred=predictions, sample_weight=sample_weight)
            update_ops.append(metric.update_state(loss, sample_weight=sample_weight))

        return update_ops

In [56]:
# Build the XLNet configuration (Hugging Face XLNet config) with the model width,
# number of attention heads, number of layers and total sequence length.
transformer_config = tr.XLNetConfig.build(
    d_model = d_model, n_head=4, n_layer=2, total_seq_length=sequence_length
)

# Define the model body: the input block (with its masking), an MLP projection
# and the transformer block. The MLP width 192 equals d_model defined with the inputs.

body = tr.SequentialBlock(
    [inputs,
    tr.MLPBlock([192]),
    tr.TransformerBlock(transformer_config, masking=inputs.masking)]
)

# Define the head for the next-item prediction task, using the debugging task
# subclass and NDCG / Recall metrics at cut-offs 10 and 20.

head = tr.Head(
    body,
    NextItemPredictionTask_temp(
        weight_tying=True, 
        # hf_format=True, 
        metrics=[NDCGAt(top_ks=[10, 20], labels_onehot=True),RecallAt(top_ks=[10, 20], labels_onehot=True)],
        # loss = tf.keras.losses.CategoricalCrossentropy)
))
# Earlier variant kept for reference (uses the stock task class):
# head = tr.Head(
#     body,
#     tr.NextItemPredictionTask(
#         weight_tying=True, 
#         # hf_format=True, 
#         metrics=[NDCGAt_temp(top_ks=[10, 20], labels_onehot=False)]
#         # loss = tf.keras.losses.CategoricalCrossentropy)
# ))

# Wrap the head in the end-to-end Model class.

model = tr.Model(head)

## Build Datasets

In [57]:
def pad(
    df: pd.DataFrame,
    maxlen: int = 20
):
    """Pad or truncate every list-valued column of ``df`` to a fixed length.

    Each column is converted to a list of sequences and passed to Keras
    ``pad_sequences`` with padding and truncation both applied at the front
    ('pre').

    Args:
        df: DataFrame whose cells are sequences.
        maxlen: Length every sequence is padded/truncated to. Defaults to 20,
            the value that used to be hard-coded here.

    Returns:
        A new DataFrame with the same column names, a fresh default index and
        each cell replaced by a Python list of length ``maxlen``.
    """
    padded_columns = {}
    for column in df.columns:
        sequences = df[column].to_list()
        padded = tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=maxlen, padding='pre', truncating='pre'
        )
        # Store plain Python lists so every cell holds one full sequence.
        padded_columns[column] = padded.tolist()
    # Build the frame once instead of growing it column by column.
    return pd.DataFrame(padded_columns)

In [58]:
def iterate_over_df(
    df: pd.DataFrame
):
    """Build the generator callable used as input to ``tf.data.Dataset.from_generator``.

    Args:
        df: DataFrame containing a 'product_id-list_seq' column.

    Returns:
        A zero-argument callable. Each call returns a fresh generator that yields
        one ``(sequence, [])`` tuple per row: the row's 'product_id-list_seq'
        value paired with an empty list as placeholder target.

    Raises:
        KeyError: If ``df`` has no 'product_id-list_seq' column.
    """
    # Read the column directly; the input frame is left untouched (no helper
    # 'empty_list' column is added to the caller's DataFrame).
    sequences = df['product_id-list_seq']

    def caller():
        for sequence in sequences:
            # A new empty list per row, so rows never share the same object.
            yield sequence, []
    return caller

In [59]:
def ds_from_df(
    df: pd.DataFrame
):
    """Turn a DataFrame into a TensorFlow dataset of (features, target) pairs.

    The rows are produced by the generator callable from ``iterate_over_df``;
    both elements of each pair are declared as int32 vectors of unknown length.
    """
    variable_length = tf.TensorShape([None,])
    row_generator = iterate_over_df(df)
    return tf.data.Dataset.from_generator(
        row_generator,
        output_types=(tf.int32, tf.int32),
        output_shapes=(variable_length, tf.TensorShape([None,])),
    )

In [60]:
# def pad_dataset(
#         ## pad dataset so all session sequence data have length 20
#         df,
#         batch_size: int,
# ):
#         df = df.shuffle(5)
#         df = df.padded_batch(batch_size, padded_shapes = (([20,]),[0,]), padding_values = ((0),0),drop_remainder=True)
#         df = df.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

#         return df

In [61]:
def data_to_dict(
    df_list: list,
    df_len: int,
    chosen_features: list
):
    """Stack per-row tensors into the feature/target dictionaries fed to the model.

    Args:
        df_list: Rows of the dataset; each row is indexable, with the features at
            position 0 and the target at position -1.
        df_len: Number of rows to take from the start of ``df_list``.
        chosen_features: Feature names used as keys of the feature dictionary.

    Returns:
        ``(dict_features, dict_targets)``. ``dict_targets`` holds the stacked
        targets under the key 'target'. ``dict_features`` maps each chosen
        feature name to its stacked tensor.

    Note:
        With more than one feature, each row's first element is assumed to be a
        sequence holding one tensor per feature, in ``chosen_features`` order.
        TODO confirm against the generator that produces the rows.
    """
    row_indices = range(df_len)
    # Targets are stacked the same way regardless of the number of features.
    dict_targets = {'target': tf.stack([df_list[i][-1] for i in row_indices])}

    if len(chosen_features) == 1:
        dict_features = {
            chosen_features[0]: tf.stack([df_list[i][0] for i in row_indices])
        }
    else:
        # Stack each feature separately. The previous loop used
        # range(len(...), -1), which is empty, so nothing was ever filled in.
        dict_features = {
            name: tf.stack([df_list[i][0][position] for i in row_indices])
            for position, name in enumerate(chosen_features)
        }
    return (dict_features, dict_targets)

In [62]:
def get_dataset(
    df,
    batch_size,
    df_len
):
    """Run the whole preprocessing chain and return a batched dataset.

    The frame is padded, converted to a generator-backed dataset, materialised
    into feature/target dictionaries (using the notebook-level
    ``chosen_features``) and finally sliced into batches of ``batch_size``.

    Returns:
        ``(ds, steps)`` where ``ds`` is the batched dataset and ``steps`` is
        ``df_len / batch_size`` rounded down.
    """
    padded_frame = pad(df)
    row_dataset = ds_from_df(padded_frame)
    model_inputs = data_to_dict(list(row_dataset), df_len, chosen_features)
    batched = tf.data.Dataset.from_tensor_slices(model_inputs).batch(batch_size)
    full_batches = int(np.floor(df_len / batch_size))
    return batched, full_batches

## Daily Fine-tuning: Training over a time window


### Train the model

In [63]:
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.python.keras.engine import data_adapter

def make_print_data_and_train_step(keras_model):
    """Wrap ``keras_model.train_step`` so each batch is printed before training on it.

    The returned function unpacks the batch, runs an extra forward pass to obtain
    predictions, prints inputs, labels and predictions, and then delegates to the
    train step that was attached to the model when this factory was called.
    """
    wrapped_train_step = keras_model.train_step

    def print_data_and_train_step(original_data):
        expanded = data_adapter.expand_1d(original_data)
        features, labels, _sample_weight = data_adapter.unpack_x_y_sample_weight(expanded)
        predictions = keras_model(features, training=True)
        # The sample weight is unpacked but intentionally not printed.
        K.print_tensor(features, "Batch input (x) =")
        tf.print(labels, "Batch output (y_true) =")
        K.print_tensor(predictions, "Prediction (y_pred) =")
        return wrapped_train_step(original_data)

    return print_data_and_train_step

In [64]:
# model.train_step = make_print_data_and_train_step(model)

In [65]:
# Compile with the Adam optimizer; run_eagerly=True asks Keras to run the
# train/eval steps eagerly instead of compiling them into a graph.
model.compile(optimizer='adam',run_eagerly=True)

In [66]:
# Batch sizes passed to get_dataset for the training and evaluation data.
train_batch_size = 256
eval_batch_size = 32

In [67]:
%%time
# window
# Time window: the loop below uses range(start, final), so the final index
# itself is not trained on (it is only used as the last evaluation day).
start_time_window_index = 1
final_time_window_index = 3
# Iterate over the days in the window: train on day N, evaluate on day N + 1.
for time_index in range(start_time_window_index, final_time_window_index):
    # Resolve the parquet paths for the training day and the following evaluation day
    time_index_train = time_index
    time_index_eval = time_index + 1
    train_paths = os.path.join(OUTPUT_DIR, f"{time_index_train}/train.parquet")
    eval_paths = os.path.join(OUTPUT_DIR, f"{time_index_eval}/valid.parquet")

    # Load data, keeping only the item-id sequence column
    train_df = pd.read_parquet(train_paths)
    train_df = train_df[['product_id-list_seq']]
    eval_df = pd.read_parquet(eval_paths)
    eval_df = eval_df[['product_id-list_seq']]

    # find length of dataframes for argument into `get_dataset`
    train_len = len(train_df)
    eval_len = len(eval_df)

    # Build the batched datasets (the returned step counts are not used below)
    train_dataset, train_steps = get_dataset(train_df, train_batch_size,train_len)
    eval_dataset, eval_steps = get_dataset(eval_df, eval_batch_size,eval_len)
    
    # Train on the day related to time_index
    print('*'*20)
    print("Launch training for day %s are:" %time_index)
    print('*'*20 + '\n')
    losses = model.fit(train_dataset, epochs=5)
    # Clear metric state accumulated during training before evaluating
    model.reset_metrics()
    # Evaluate on the following day
    eval_metrics = model.evaluate(eval_dataset, return_dict=True)
    print('*'*20)
    print("Eval results for day %s are:\t" %time_index_eval)
    print('\n' + '*'*20 + '\n')
    # Print the evaluation metrics in alphabetical order of their names
    for key in sorted(eval_metrics.keys()):
        print(" %s = %s" % (key, str(eval_metrics[key])))

Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x164a76a30>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/autograph/operators/control_flow.py", line 529, in aug_body
    iterate_index += 1  File "/var/folders/4j/6q5w3p6s7zb16_63n6np2z1c0000gr/T/__autograph_generated_file4e5x68ik.py", line 22, in loop_body
    gather_indices = ag__.converted_call(ag__.ld(gather_indices).write, (ag__.ld(i), ag__.converted_call(ag__.ld(tf).concat, ([ag__.ld(i) * ag__.converted_call(ag__.ld(tf).ones, ((ag__.ld(max_k), 1), ag__.ld(tf).int32), None, fscope), ag__.converted_call(ag__.ld(tf).expand_dims, (ag__.ld(indices)[ag__.ld(i), :], -1), None, fscope)],), dict(axis=1), fscope)), None, fscope)  File "/usr/local/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 337, i

Projecting inputs of NextItemPredictionTask to'64' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '64'


<tf.Variable 'Variable:0' shape=(None, None) dtype=int32, numpy=
array([[     0,      0,      0, ...,      0,   6048,      0],
       [     0,      0,      0, ...,      0,      0,   9519],
       [     0,      0,      0, ...,      0,      0,      0],
       ...,
       [     0,      0,      0, ..., 105848,      0,      0],
       [     0,      0,      0, ...,      0,    785,      0],
       [     0,      0,      0, ...,      0,   1061,   1054]], dtype=int32)>
  1/438 [..............................] - ETA: 21:39 - ndcg_at_4: 0.0000e+00 - recall_at_4: 0.0000e+00 - loss: 11.9023 - regularization_loss: 0.0000e+00 - total_loss: 11.9023<tf.Variable 'Variable:0' shape=(None, None) dtype=int32, numpy=
array([[    0,     0, 57517, ...,     0,     0,     0],
       [    0,     0,     0, ...,     0, 13010,     0],
       [    0,     0, 14165, ...,  1256,     0,     0],
       ...,
       [    0,     0,     0, ...,     0,  2488,     0],
       [    0,     0,     0, ..., 75691, 75789,     0],
    

KeyboardInterrupt: 

In [None]:
# Scratch inspection cell (never executed in this saved notebook).
# NOTE(review): nothing visible in this notebook defines `masked_sequence` on the model — confirm the attribute exists.
model.masked_sequence