In [1]:
import glob
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from merlin_standard_lib import Schema
from transformers4rec import tf as tr
from transformers4rec.tf.ranking_metric import NDCGAt, RecallAt
from transformers4rec.tf.utils import testing_utils as test_utils

### Get Data

In [2]:
# Data locations. Each one falls back to a relative default when the
# environment variable of the same name is not set.
INPUT_DATA_DIR = os.getenv("INPUT_DATA_DIR", "../data/")
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "../data/sessions_by_day")

In [3]:
# Load the schema object that is later passed to the TabularSequenceFeatures class.
# `Schema` is imported together with the other dependencies in the first cell.
SCHEMA_PATH = os.path.join(INPUT_DATA_DIR, 'schema_test.pb')
schema = Schema().from_proto_text(SCHEMA_PATH)

In [4]:
def tf_yoochoose_like():
    """Return synthetic data produced by `tf_synthetic_data`.

    The generator is called with num_rows=100, min_session_length=5 and
    max_session_length=20.
    """
    make_synthetic_data = tr.data.tabular_sequence_testing_data.tf_synthetic_data
    return make_synthetic_data(num_rows=100, min_session_length=5, max_session_length=20)


df = tf_yoochoose_like()

2021-12-22 14:07:01.103452: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
def tf_masking_inputs():
    """Build random inputs used to exercise the masking layers.

    Returns:
        dict with four entries:
            - "input_tensor": tensor of shape (NUM_EXAMPLES, MAX_LEN, hidden_dim)
              filled with values sampled uniformly from [0, 1).
            - "labels": integer tensor of shape (NUM_EXAMPLES, MAX_LEN) with
              values drawn from [1, MAX_CARDINALITY); the last two positions
              of every row are set to PAD_TOKEN to mimic padding.
            - "padding_idx": PAD_TOKEN (0).
            - "vocab_size": MAX_CARDINALITY (100).
    """
    # fixed parameters for tests
    NUM_EXAMPLES = 1000
    MAX_LEN = 10
    PAD_TOKEN = 0
    MAX_CARDINALITY = 100
    hidden_dim = 16
    features = {}
    # generate random tensors for test
    features["input_tensor"] = tf.convert_to_tensor(
        np.random.uniform(0, 1, (NUM_EXAMPLES, MAX_LEN, hidden_dim))
    )
    # create sequences; ids start at 1 so that PAD_TOKEN never appears as a real item
    labels = np.random.randint(1, MAX_CARDINALITY, (NUM_EXAMPLES, MAX_LEN))
    # replace last 2 items by the padding id to mimic padding
    labels[:, MAX_LEN - 2 :] = PAD_TOKEN
    labels = tf.convert_to_tensor(labels)
    features["labels"] = labels
    features["padding_idx"] = PAD_TOKEN
    features["vocab_size"] = MAX_CARDINALITY

    return features

In [6]:
# Materialise the fixture. The name `tf_masking_inputs` is rebound from the
# builder function to the dict it returns, so the guard keeps this cell
# re-runnable: once the name holds the dict (not callable) it is left untouched.
if callable(tf_masking_inputs):
    tf_masking_inputs = tf_masking_inputs()

In [7]:
# Display the generated fixture dict (input_tensor, labels, padding_idx, vocab_size).
tf_masking_inputs

{'input_tensor': <tf.Tensor: shape=(1000, 10, 16), dtype=float64, numpy=
 array([[[0.01431019, 0.28938477, 0.47462903, ..., 0.94949935,
          0.2554671 , 0.28995679],
         [0.35112655, 0.66784172, 0.40659763, ..., 0.11024972,
          0.16774864, 0.50167527],
         [0.78267262, 0.93375795, 0.19398625, ..., 0.46027355,
          0.28415953, 0.14766647],
         ...,
         [0.20773027, 0.15688596, 0.47617317, ..., 0.63368863,
          0.486057  , 0.25837122],
         [0.81021811, 0.01800621, 0.70325015, ..., 0.54219969,
          0.9969783 , 0.61658851],
         [0.76325322, 0.1749273 , 0.35946897, ..., 0.79285905,
          0.47163301, 0.70362971]],
 
        [[0.25288281, 0.78916138, 0.6658682 , ..., 0.01169676,
          0.86997157, 0.12544535],
         [0.84108847, 0.817121  , 0.21151408, ..., 0.47893457,
          0.01829097, 0.46402322],
         [0.84205325, 0.39426052, 0.18440333, ..., 0.81939748,
          0.22822109, 0.83757334],
         ...,
         [0.60

### test output shape and masking output of CLM

In [8]:
# Apply causal masking to the random inputs in training mode.
causal_masking_cls = tr.masking.masking_registry["causal"]
lm = causal_masking_cls(padding_idx=tf_masking_inputs["padding_idx"])
out = lm(tf_masking_inputs["input_tensor"], tf_masking_inputs["labels"], training=True)

# The masked targets must match the first two dimensions of the input tensor,
# and the masked output must keep the input's last dimension.
expected_shape = tf_masking_inputs["input_tensor"].shape
targets_shape = tf.shape(lm.masked_targets)
assert targets_shape[0] == expected_shape[0]
assert targets_shape[1] == expected_shape[1]
assert out.shape[2] == expected_shape[2]

In [9]:
# Print the raw label sequences of the first two rows.
for row_idx in range(2):
    print(tf_masking_inputs["labels"][row_idx])

tf.Tensor([ 5 83 94 58  6 43 89 44  0  0], shape=(10,), dtype=int64)
tf.Tensor([ 9 62 12 60 91  9 88 56  0  0], shape=(10,), dtype=int64)


In [10]:
# Print the masked targets computed by the causal masking for the same two rows.
for row_idx in range(2):
    print(lm.masked_targets[row_idx])

tf.Tensor([ 0  0  0  0  0  0 44  0  0  0], shape=(10,), dtype=int32)
tf.Tensor([ 0  0  0  0  0  0 56  0  0  0], shape=(10,), dtype=int32)


Causal masking shifts the true values to the left by one (the first label cannot be predicted because there is no preceding information) and then adds a zero vector as a masked item.

### test output shape and masking output of MLM

In [14]:
# Apply masked-language-model masking to the random inputs in training mode.
mlm_masking_cls = tr.masking.masking_registry["masked"]
lm = mlm_masking_cls(padding_idx=tf_masking_inputs["padding_idx"])
out = lm(tf_masking_inputs["input_tensor"], tf_masking_inputs["labels"], training=True)

# The masked targets must match the first two dimensions of the input tensor,
# and the masked output must keep the input's last dimension.
expected_shape = tf_masking_inputs["input_tensor"].shape
targets_shape = tf.shape(lm.masked_targets)
assert targets_shape[0] == expected_shape[0]
assert targets_shape[1] == expected_shape[1]
assert out.shape[2] == expected_shape[2]

In [15]:
# Print the raw label sequences of the first two rows again, for comparison with the MLM targets.
for row_idx in range(2):
    print(tf_masking_inputs["labels"][row_idx])

tf.Tensor([ 5 83 94 58  6 43 89 44  0  0], shape=(10,), dtype=int64)
tf.Tensor([ 9 62 12 60 91  9 88 56  0  0], shape=(10,), dtype=int64)


In [16]:
# Print the masked targets computed by the MLM masking for the same two rows.
for row_idx in range(2):
    print(lm.masked_targets[row_idx])

tf.Tensor([ 0  0  0  0  6 43  0  0  0  0], shape=(10,), dtype=int32)
tf.Tensor([ 0  0  0  0  0  0 88 56  0  0], shape=(10,), dtype=int32)


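MLM doesn't shift values: it randomly selects items to mask. In the output above, `masked_targets` keeps the labels at the selected positions and is zero everywhere else.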

### test class serialisation

In [16]:
### test serialization masking
# Build a causal masking layer and pass it through the serialization helper;
# the object returned by the helper is kept in `copy_layer`.
lm = tr.masking.masking_registry['causal'](padding_idx=tf_masking_inputs["padding_idx"])
copy_layer = test_utils.assert_serialization(lm)
# NOTE(review): the two shape assertions below are disabled in the notebook;
# confirm whether they should be restored or deleted.
# assert tf.shape(copy_layer.masked_targets)[0] == tf_masking_inputs["input_tensor"].shape[0]
# assert out.shape[2] == tf_masking_inputs["input_tensor"].shape[2]

```python
def assert_serialization(layer):
    copy_layer = layer.from_config(layer.get_config())

    assert isinstance(copy_layer, layer.__class__)

    return copy_layer
```

### test eager + graph modes

In [17]:
# Build sequence features with causal masking, stack an MLP block on top and
# run the helper that checks the body works inside a model.
# Only eager execution (run_eagerly=True) is exercised in this cell.
# NOTE(review): the section title also mentions graph mode; confirm whether a
# run_eagerly=False run is intended as well.
input_module = tr.TabularSequenceFeatures.from_schema(
    schema,
    max_sequence_length=20,
    continuous_projection=64,
    d_output=64,
    masking="causal",
)
mlp_block = tr.MLPBlock([64])
body = tr.SequentialBlock([input_module, mlp_block])
test_utils.assert_body_works_in_model(df, input_module, body, run_eagerly=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### test only last item is masked when eval_on_last_item_seq_only

In [18]:
# Evaluation with eval_on_last_item_seq_only=True: every session must get
# exactly one target, and that target must be its last non-padded label.
lm = tr.masking.masking_registry['causal'](
    padding_idx=tf_masking_inputs["padding_idx"], eval_on_last_item_seq_only=True
)
lm.compute_masked_targets(tf_masking_inputs["labels"], training=False)

# get non padded last items: the number of non-padded items per row, minus one,
# is the column index of the last real item (padding is trailing in this fixture)
non_padded_mask = tf_masking_inputs["labels"] != tf_masking_inputs["padding_idx"]
rows_ids = tf.range(tf_masking_inputs["labels"].shape[0], dtype=tf.int64)
last_item_sessions = tf.reduce_sum(tf.cast(non_padded_mask, tf.int64), axis=1) - 1
indices = tf.concat(
    [tf.expand_dims(rows_ids, 1), tf.expand_dims(last_item_sessions, 1)], axis=1
)
last_labels = tf.gather_nd(tf_masking_inputs["labels"], indices).numpy()

# get the last labels from output
trgt_pad = lm.masked_targets != tf_masking_inputs["padding_idx"]
out_last = tf.boolean_mask(lm.masked_targets, trgt_pad).numpy()

# check that only one item is masked for each session
assert (
    tf.reduce_sum(tf.cast(lm.mask_schema, tf.int32)).numpy()
    == tf_masking_inputs["input_tensor"].shape[0]
)

# check only the last non-padded item is masked.
# assert_array_equal reports shape mismatches and the differing entries,
# which `assert all(a == b)` does not.
np.testing.assert_array_equal(out_last, last_labels)

In [19]:
### test mask all next item for eval
# With eval_on_last_item_seq_only=False, every next item of a session is a target.
lm = tr.masking.masking_registry['causal'](
    padding_idx=tf_masking_inputs["padding_idx"],
    eval_on_last_item_seq_only=False,
)
masking_info = lm.compute_masked_targets(tf_masking_inputs["labels"], training=False)

# get the labels from output (tf.boolean_mask, as in the other checks of this notebook)
trgt_pad = masking_info.targets != tf_masking_inputs["padding_idx"]
labels = tf.boolean_mask(masking_info.targets, trgt_pad).numpy()

# get non padded items when shifting input sequence
shift_inputs = tf_masking_inputs["labels"][:, 1:]
non_padded_mask = shift_inputs != tf_masking_inputs["padding_idx"]
n_labels_sessions = non_padded_mask.numpy().sum(1)
all_labels = tf.boolean_mask(shift_inputs, non_padded_mask).numpy()

# check that number of labels per session matches.
# assert_array_equal reports shape mismatches and the differing entries,
# which `assert all(a == b)` does not.
np.testing.assert_array_equal(masking_info.schema.numpy().sum(1), n_labels_sessions)
# check all next items are masked
np.testing.assert_array_equal(labels, all_labels)

In [20]:
# Display the object returned by compute_masked_targets (its `schema` and `targets` fields).
masking_info

MaskingInfo(schema=<tf.Tensor: shape=(1000, 10), dtype=bool, numpy=
array([[ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       ...,
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False]])>, targets=<tf.Variable 'Variable:0' shape=(None, None) dtype=int32, numpy=
array([[37, 72, 34, ...,  0,  0,  0],
       [81, 34, 53, ...,  0,  0,  0],
       [98, 52, 29, ...,  0,  0,  0],
       ...,
       [24, 78, 71, ...,  0,  0,  0],
       [ 8, 11, 61, ...,  0,  0,  0],
       [ 1, 80, 53, ...,  0,  0,  0]], dtype=int32)>)

In [27]:
# Training mode: every session must end up with at least one target.
lm = tr.masking.masking_registry["causal"](padding_idx=tf_masking_inputs["padding_idx"])
masking_info = lm.compute_masked_targets(tf_masking_inputs["labels"], training=True)

# Count, per row, the target positions that differ from the padding id.
is_target = masking_info.targets != tf_masking_inputs["padding_idx"]
trgt_mask = tf.cast(is_target, tf.int32)
targets_per_session = tf.reduce_sum(trgt_mask, axis=1).numpy()
assert all(targets_per_session > 0)

In [28]:
# Training mode: the number of targets of a session must differ from its
# number of non-padded items, i.e. not every item is masked.
lm = tr.masking.masking_registry["causal"](padding_idx=tf_masking_inputs["padding_idx"])
lm.compute_masked_targets(tf_masking_inputs["labels"], training=True)

trgt_mask = lm.masked_targets != tf_masking_inputs["padding_idx"]
non_padded_mask = tf_masking_inputs["labels"] != tf_masking_inputs["padding_idx"]
targets_per_session = trgt_mask.numpy().sum(axis=1)
items_per_session = non_padded_mask.numpy().sum(axis=1)
assert all(targets_per_session != items_per_session)

In [29]:
# Training mode: the mask schema must flag exactly as many positions as there
# are non-padding targets.
lm = tr.masking.masking_registry["causal"](padding_idx=tf_masking_inputs["padding_idx"])
lm.compute_masked_targets(tf_masking_inputs["labels"], training=True)

trgt_pad = lm.masked_targets != tf_masking_inputs["padding_idx"]
n_masked_positions = lm.mask_schema.numpy().sum()
n_targets = trgt_pad.numpy().sum()
assert n_masked_positions == n_targets

In [30]:
### Test only last item is masked when training clm on last item
# With train_on_last_item_seq_only=True, every session must get exactly one
# target, and that target must be its last non-padded label.
lm = tr.masking.masking_registry["causal"](
    padding_idx=tf_masking_inputs["padding_idx"],
    train_on_last_item_seq_only=True,
)
lm.compute_masked_targets(tf_masking_inputs["labels"], training=True)

# get non padded last items: the number of non-padded items per row, minus one,
# is the column index of the last real item (padding is trailing in this fixture)
non_padded_mask = tf_masking_inputs["labels"] != tf_masking_inputs["padding_idx"]
rows_ids = tf.range(tf_masking_inputs["labels"].shape[0], dtype=tf.int64)
last_item_sessions = tf.reduce_sum(tf.cast(non_padded_mask, tf.int64), axis=1) - 1
indices = tf.concat(
    [tf.expand_dims(rows_ids, 1), tf.expand_dims(last_item_sessions, 1)], axis=1
)
last_labels = tf.gather_nd(tf_masking_inputs["labels"], indices).numpy()

# get the last labels from output
trgt_pad = lm.masked_targets != tf_masking_inputs["padding_idx"]
out_last = tf.boolean_mask(lm.masked_targets, trgt_pad).numpy()

# check that only one item is masked for each session
assert (
    tf.reduce_sum(tf.cast(lm.mask_schema, tf.int32)).numpy()
    == tf_masking_inputs["input_tensor"].shape[0]
)

# check only the last non-padded item is masked.
# assert_array_equal reports shape mismatches and the differing entries,
# which `assert all(a == b)` does not.
np.testing.assert_array_equal(out_last, last_labels)