In [1]:
import os
import glob
import pandas as pd
import numpy as np

from transformers4rec import tf as tr
import tensorflow as tf
from transformers4rec.tf.ranking_metric import NDCGAt, RecallAt
from transformers4rec.tf.utils import testing_utils as test_utils

In [2]:
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", '../data/')
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "../data/sessions_by_day")

In [3]:
from merlin_standard_lib import Schema
# define schema object to pass it to the TabularSeqeunceFeatures class
SCHEMA_PATH = os.path.join(INPUT_DATA_DIR, 'schema_test.pb')
schema = Schema().from_proto_text(SCHEMA_PATH)

In [4]:
def tf_yoochoose_like():
    return tr.data.tabular_sequence_testing_data.tf_synthetic_data(
    num_rows=100, min_session_length=5, max_session_length=20
    )
df = tf_yoochoose_like()

2021-12-20 13:57:14.341879: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
def tf_masking_inputs():
    # fixed parameters for tests
    NUM_EXAMPLES = 20
    MAX_LEN = 10
    PAD_TOKEN = 0
    NUM_EXAMPLES = 1000
    MAX_CARDINALITY = 100
    hidden_dim = 16
    features = {}
    # generate random tensors for test
    features["input_tensor"] = tf.convert_to_tensor(
        np.random.uniform(0, 1, (NUM_EXAMPLES, MAX_LEN, hidden_dim))
    )
    # create sequences
    labels = np.random.randint(1, MAX_CARDINALITY, (NUM_EXAMPLES, MAX_LEN))
    # replace last 2 items by zeros to mimic padding
    labels[:, MAX_LEN - 2 :] = 0
    labels = tf.convert_to_tensor(labels)
    features["labels"] = labels
    features["padding_idx"] = PAD_TOKEN
    features["vocab_size"] = MAX_CARDINALITY

    return features

In [6]:
tf_masking_inputs = tf_masking_inputs()

In [9]:
tf_masking_inputs['padding_idx']

0

In [10]:
### test_task_output_shape
lm = tr.masking.masking_registry['causal'](padding_idx=tf_masking_inputs["padding_idx"])
out = lm(tf_masking_inputs["input_tensor"], tf_masking_inputs["labels"], training=True)
assert tf.shape(lm.masked_targets)[0] == tf_masking_inputs["input_tensor"].shape[0]
assert tf.shape(lm.masked_targets)[1] == tf_masking_inputs["input_tensor"].shape[1]
assert out.shape[2] == tf_masking_inputs["input_tensor"].shape[2]

In [12]:
### test serialization masking
lm = tr.masking.masking_registry['causal'](padding_idx=tf_masking_inputs["padding_idx"])
copy_layer = test_utils.assert_serialization(lm)
assert tf.shape(copy_layer.masked_targets)[0] == tf_masking_inputs["input_tensor"].shape[0]
# assert out.shape[2] == tf_masking_inputs["input_tensor"].shape[2]

ValueError: Attempt to convert a value (None) with an unsupported type (<class 'NoneType'>) to a Tensor.

In [23]:
### test eager + graph modes 
input_module = tr.TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=20,
        continuous_projection=64,
        d_output=64,
        masking='causal',
    )
body = tr.SequentialBlock([input_module, tr.MLPBlock([64])])
test_utils.assert_body_works_in_model(df, input_module, body, run_eagerly=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
### test only last item is masked when eval_on_last_item_seq_only
lm = tr.masking.masking_registry['causal'](
    padding_idx=tf_masking_inputs["padding_idx"], eval_on_last_item_seq_only=True
)
lm.compute_masked_targets(tf_masking_inputs["labels"], training=False)
# get non padded last items
non_padded_mask = tf_masking_inputs["labels"] != tf_masking_inputs["padding_idx"]
rows_ids = tf.range(tf_masking_inputs["labels"].shape[0], dtype=tf.int64)
last_item_sessions = tf.reduce_sum(tf.cast(non_padded_mask, tf.int64), axis=1) - 1
indices = tf.concat(
    [tf.expand_dims(rows_ids, 1), tf.expand_dims(last_item_sessions, 1)], axis=1
)
last_labels = tf.gather_nd(tf_masking_inputs["labels"], indices).numpy()
# get the last labels from output
trgt_pad = lm.masked_targets != tf_masking_inputs["padding_idx"]
out_last = tf.boolean_mask(lm.masked_targets, trgt_pad).numpy()

# check that only one item is masked for each session
assert (
    tf.reduce_sum(tf.cast(lm.mask_schema, tf.int32)).numpy()
    == tf_masking_inputs["input_tensor"].shape[0]
)

# check only the last non-paded item is masked
assert all(last_labels == out_last)

In [26]:
### test mask all next item for eval
lm = tr.masking.masking_registry['causal'](
    padding_idx=tf_masking_inputs["padding_idx"],
    eval_on_last_item_seq_only=False,
)
masking_info = lm.compute_masked_targets(tf_masking_inputs["labels"], training=False)
# get the labels from output
trgt_pad = masking_info.targets != tf_masking_inputs["padding_idx"]
labels = masking_info.targets[trgt_pad].numpy()
# get non padded items when shifting input sequence
shift_inputs = tf_masking_inputs["labels"][:, 1:]
non_padded_mask = shift_inputs != tf_masking_inputs["padding_idx"]
n_labels_sessions = non_padded_mask.numpy().sum(1)
all_labels = tf.boolean_mask(shift_inputs, non_padded_mask).numpy()

# check that number of labels per session matches
assert all(masking_info.schema.numpy().sum(1) == n_labels_sessions)
# check all next items are masked
assert all(all_labels == labels)

In [27]:
### test at least one item is masked when trained

lm = tr.masking.masking_registry['causal'](padding_idx=tf_masking_inputs["padding_idx"])
masking_info = lm.compute_masked_targets(tf_masking_inputs["labels"], training=True)
trgt_mask = tf.cast(masking_info.targets != tf_masking_inputs["padding_idx"], tf.int32)
assert all(tf.reduce_sum(trgt_mask, axis=1).numpy() > 0)

In [28]:
### check that not all items are masked when training

lm = tr.masking.masking_registry['causal'](padding_idx=tf_masking_inputs["padding_idx"])
lm.compute_masked_targets(tf_masking_inputs["labels"], training=True)
trgt_mask = lm.masked_targets != tf_masking_inputs["padding_idx"]
non_padded_mask = tf_masking_inputs["labels"] != tf_masking_inputs["padding_idx"]
assert all(trgt_mask.numpy().sum(axis=1) != non_padded_mask.numpy().sum(axis=1))

In [29]:
### check number of masked positions equal to number of targets

lm = tr.masking.masking_registry['causal'](padding_idx=tf_masking_inputs["padding_idx"])
lm.compute_masked_targets(tf_masking_inputs["labels"], training=True)
trgt_pad = lm.masked_targets != tf_masking_inputs["padding_idx"]
assert lm.mask_schema.numpy().sum() == trgt_pad.numpy().sum()

In [30]:
### Test only last item is masked when training clm on last item
lm = tr.masking.masking_registry["causal"](
    padding_idx=tf_masking_inputs["padding_idx"],
    train_on_last_item_seq_only=True,
)
lm.compute_masked_targets(tf_masking_inputs["labels"], training=True)
# get non padded last items
non_padded_mask = tf_masking_inputs["labels"] != tf_masking_inputs["padding_idx"]
rows_ids = tf.range(tf_masking_inputs["labels"].shape[0], dtype=tf.int64)
last_item_sessions = tf.reduce_sum(tf.cast(non_padded_mask, tf.int64), axis=1) - 1
indices = tf.concat(
    [tf.expand_dims(rows_ids, 1), tf.expand_dims(last_item_sessions, 1)], axis=1
)
last_labels = tf.gather_nd(tf_masking_inputs["labels"], indices).numpy()
# get the last labels from output
trgt_pad = lm.masked_targets != tf_masking_inputs["padding_idx"]
out_last = tf.boolean_mask(lm.masked_targets, trgt_pad).numpy()

# check that only one item is masked for each session
assert (
    tf.reduce_sum(tf.cast(lm.mask_schema, tf.int32)).numpy()
    == tf_masking_inputs["input_tensor"].shape[0]
)

# check only the last non-paded item is masked
assert all(last_labels == out_last)