# Session-based Recs with Transformers4Rec: Transformer - Causal Language Model

This notebook follows the step-by-step Transformers4Rec tutorial:
https://nvidia-merlin.github.io/Transformers4Rec/main/examples/tutorial/index.html

## Imports

In [1]:
import os
import glob
import pandas as pd
import numpy as np

from transformers4rec import tf as tr
import tensorflow as tf
from transformers4rec.tf.ranking_metric import NDCGAt, RecallAt
from typing import Optional

## Instantiates Schema object from schema file

In [2]:
from merlin_standard_lib import Schema

# Directory holding the input data (including schema.pb); can be overridden
# through the INPUT_DATA_DIR environment variable.
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", '../data/')
# Directory holding the per-day session folders; can be overridden through
# the OUTPUT_DIR environment variable.
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "../data/sessions_by_day")

# Names of the schema columns used as model features.
chosen_features = ['product_id-list_seq']

# Load the schema from its protobuf text file and keep only the chosen
# features; the result is passed to the TabularSequenceFeatures class.
SCHEMA_PATH = os.path.join(INPUT_DATA_DIR, 'schema.pb')
schema = Schema().from_proto_text(SCHEMA_PATH).select_by_name(chosen_features)

## Define Input Block

Use causal language modeling (CLM) as the training method (`masking = 'clm'`).

In [3]:
# Maximum session length handled by the input module, and the output width
# of the input module (also reused as the transformer's d_model).
sequence_length = 20
d_model = 192

# Input module built from the schema: processes the tabular sequence
# features and applies causal language modeling ('clm') masking.
inputs = tr.TabularSequenceFeatures.from_schema(
    schema,
    max_sequence_length=sequence_length,
    d_output=d_model,
    masking='clm',
)

## Build Transformer Block

In [4]:
# Build the XLNet configuration: model width `d_model`, 4 attention heads,
# 2 layers, and a total sequence length matching the input module.
transformer_config = tr.XLNetConfig.build(
    d_model = d_model, n_head=4, n_layer=2, total_seq_length=sequence_length
)

# Define the model body as a sequence of: the input module, an MLP
# projection, and the transformer block. The transformer block receives the
# masking module created by the input module.

body = tr.SequentialBlock(
    [inputs,
    tr.MLPBlock([192]),
    tr.TransformerBlock(transformer_config, masking=inputs.masking)]
)

# Define the head for the next-item prediction task, with weight tying
# enabled and NDCG / Recall at cut-offs 10 and 20 as metrics.

head = tr.Head(
    body,
    tr.NextItemPredictionTask(
        weight_tying=True, 
        # hf_format=True, 
        metrics=[NDCGAt(top_ks=[10, 20], labels_onehot=True),RecallAt(top_ks=[10, 20], labels_onehot=True)],
        # loss = tf.keras.losses.CategoricalCrossentropy)
))
# Alternative head kept for reference (uses a custom `NDCGAt_temp` metric
# that is not defined in this part of the notebook):
# head = tr.Head(
#     body,
#     tr.NextItemPredictionTask(
#         weight_tying=True, 
#         # hf_format=True, 
#         metrics=[NDCGAt_temp(top_ks=[10, 20], labels_onehot=False)]
#         # loss = tf.keras.losses.CategoricalCrossentropy)
# ))

# Wrap the head into the end-to-end Model class.

model = tr.Model(head)

## Build Datasets

In [5]:
def iterate_over_df(
    df: pd.DataFrame
):
    """Build a generator function suitable for `tf.data.Dataset.from_generator`.

    Args:
        df: DataFrame with a 'product_id-list_seq' column holding one item-id
            sequence per row.

    Returns:
        A zero-argument callable. Each call returns a fresh generator that
        yields one `(sequence, [])` tuple per row of `df`, where the second
        element is an empty placeholder list.

    Raises:
        KeyError: when the generator is first iterated, if `df` has no
            'product_id-list_seq' column.
    """
    def caller():
        # Iterate the column directly: this avoids adding a helper column to
        # the caller's DataFrame and the per-row Series built by `iterrows`.
        for sequence in df['product_id-list_seq']:
            yield sequence, []
    return caller

In [6]:
def ds_from_df(
    df: pd.DataFrame
):
    """Create a TensorFlow dataset from a DataFrame of sessions.

    The dataset is built with `tf.data.Dataset.from_generator` from the
    generator function returned by `iterate_over_df(df)`. Each element is a
    pair of int32 tensors, both declared as 1-D with unknown length.

    Args:
        df: DataFrame passed on to `iterate_over_df`.

    Returns:
        The resulting `tf.data.Dataset`.
    """
    # Both components are variable-length 1-D tensors.
    variable_length = tf.TensorShape([None])
    return tf.data.Dataset.from_generator(
        iterate_over_df(df),
        output_types=(tf.int32, tf.int32),
        output_shapes=(variable_length, variable_length),
    )

In [7]:
def pad_dataset(
    df,
    batch_size: int,
    max_sequence_length: int = 20,
):
    """Shuffle, pad and batch a dataset of `(sequence, labels)` pairs.

    Args:
        df: Dataset whose elements are `(sequence, labels)` pairs of 1-D
            tensors (as produced by `ds_from_df`).
        batch_size: Number of elements per padded batch.
        max_sequence_length: Length every sequence is padded to. Defaults to
            20, the value that was previously hard-coded here.

    Returns:
        The shuffled, padded, batched and prefetched dataset. Sequences are
        padded with 0 up to `max_sequence_length`; the labels component is
        padded to length 0. The final incomplete batch is dropped.
    """
    # Shuffle with a buffer of 5 elements.
    df = df.shuffle(5)
    df = df.padded_batch(
        batch_size,
        padded_shapes=([max_sequence_length], [0]),
        padding_values=(0, 0),
        drop_remainder=True,
    )
    return df.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [8]:
def data_to_dict(
    df_list: list,
    chosen_features: list
):
    """Turn the first batch of a materialised dataset into model input dicts.

    Only the first entry of `df_list` is used. Its first item holds the
    features and its last item holds the labels.

    Args:
        df_list: List of batches, each a `(features, labels)` pair.
        chosen_features: Names of the features, in the order in which they
            appear in the batch.

    Returns:
        A `(features_dict, labels_dict)` tuple. With exactly one chosen
        feature, the features item is stored as-is under that name and
        `labels_dict` stays empty. Otherwise the features item is indexed
        once per feature name and `labels_dict` holds the batch's last item
        under the key 'labels'.
    """
    first_batch = df_list[0]

    if len(chosen_features) == 1:
        # NOTE(review): labels are not populated in this branch, unlike the
        # multi-feature branch below -- confirm this is intended.
        return {chosen_features[0]: first_batch[0]}, {}

    features = {
        name: first_batch[0][position]
        for position, name in enumerate(chosen_features)
    }
    labels = {'labels': first_batch[-1]}
    return (features, labels)

In [9]:
def get_dataset(
    df,
    batch_size,
    df_len
):
    """Run the full pipeline that turns a DataFrame into a model-ready dataset.

    Chains `ds_from_df`, `pad_dataset` and `data_to_dict`, then re-batches
    the resulting tensors. Reads the module-level `chosen_features`.

    Args:
        df: DataFrame of sessions, passed to `ds_from_df`.
        batch_size: Batch size of the returned dataset.
        df_len: Number of rows in `df`; used as the padding batch size and to
            compute the number of steps.

    Returns:
        A `(dataset, steps)` tuple, where `dataset` yields `(features, [])`
        pairs and `steps` is `floor(df_len / batch_size)`.
    """
    raw_ds = ds_from_df(df)
    # Pad with a batch size of `df_len`, then materialise the batches.
    padded_ds = pad_dataset(raw_ds, df_len)
    tensors = data_to_dict(list(padded_ds), chosen_features)

    sliced = tf.data.Dataset.from_tensor_slices(tensors)
    ds = sliced.batch(batch_size)
    steps = int(np.floor(df_len / batch_size))

    # Replace the second component of every element with an empty list.
    ds = ds.map(lambda features, _: (features, []))
    return ds, steps

## Daily Fine-tuning: Training over a time window


### Train the model

In [10]:
# Compile the model with the Adam optimizer; `run_eagerly=True` asks Keras to
# run the training/evaluation steps eagerly.
model.compile(
    optimizer='adam',
    run_eagerly=True,
)

In [11]:
# Batch sizes passed to `get_dataset` for the training and evaluation data.
train_batch_size, eval_batch_size = 256, 32

In [12]:
%%time
# Time window: indices of the day folders under OUTPUT_DIR to iterate over.
# The end index is exclusive (it is used as the `range` stop value).
start_time_window_index = 1
final_time_window_index = 3
# Iterate over the days in the window: train on day `time_index`, then
# evaluate on the following day (`time_index + 1`).
for time_index in range(start_time_window_index, final_time_window_index):
    # Select the training day and the evaluation day, and build their paths
    time_index_train = time_index
    time_index_eval = time_index + 1
    train_paths = os.path.join(OUTPUT_DIR, f"{time_index_train}/train.parquet")
    eval_paths = os.path.join(OUTPUT_DIR, f"{time_index_eval}/valid.parquet")

    # Load data, keeping only the item-id sequence column
    train_df = pd.read_parquet(train_paths)
    train_df = train_df[['product_id-list_seq']]
    eval_df = pd.read_parquet(eval_paths)
    eval_df = eval_df[['product_id-list_seq']]

    # find length of dataframes for argument into `get_dataset`
    train_len = len(train_df)
    eval_len = len(eval_df)

    # get datasets (the returned step counts are not used below)
    train_dataset, train_steps = get_dataset(train_df, train_batch_size,train_len)
    eval_dataset, eval_steps = get_dataset(eval_df, eval_batch_size,eval_len)
    
    # Train for one epoch on the day related to time_index
    print('*'*20)
    print("Launch training for day %s are:" %time_index)
    print('*'*20 + '\n')
    losses = model.fit(train_dataset, epochs=1)
    # Reset the metric state so training values do not leak into evaluation
    model.reset_metrics()
    # Evaluate on the following day
    eval_metrics = model.evaluate(eval_dataset, return_dict=True)
    print('*'*20)
    print("Eval results for day %s are:\t" %time_index_eval)
    print('\n' + '*'*20 + '\n')
    # Print the evaluation metrics in alphabetical order of their names
    for key in sorted(eval_metrics.keys()):
        print(" %s = %s" % (key, str(eval_metrics[key])))

********************
Launch training for day 1 are:
********************



Projecting inputs of NextItemPredictionTask to'64' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '64'


********************
Eval results for day 2 are:	

********************

 eval_ndcg@10 = 0.7777777910232544
 eval_ndcg@20 = 0.7870370149612427
 eval_recall@10 = 0.7777777910232544
 eval_recall@20 = 0.8148148059844971
 loss = 1.9266173839569092
 regularization_loss = 0
 total_loss = 1.9266173839569092
********************
Launch training for day 2 are:
********************

********************
Eval results for day 3 are:	

********************

 eval_ndcg@10 = 0.6198317408561707
 eval_ndcg@20 = 0.6273594498634338
 eval_recall@10 = 0.6307692527770996
 eval_recall@20 = 0.6615384817123413
 loss = 3.389691114425659
 regularization_loss = 0
 total_loss = 3.389691114425659
CPU times: user 1h 26min 9s, sys: 23.5 s, total: 1h 26min 32s
Wall time: 1h 28min 44s
