In [1]:
import os
import glob
import pandas as pd
import numpy as np

from transformers4rec import tf as tr
import tensorflow as tf
from transformers4rec.tf.ranking_metric import NDCGAt, RecallAt

In [2]:
# Input and output directories. Each one is read from the environment
# variable of the same name and falls back to a relative default when that
# variable is not set.
INPUT_DATA_DIR = os.getenv("INPUT_DATA_DIR", '../data/')
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "../data/sessions_by_day")

In [3]:
from merlin_standard_lib import Schema
# Build the schema object that is later passed to the TabularSequenceFeatures
# class, selecting by name only the two columns used in this notebook.
SCHEMA_PATH = os.path.join(INPUT_DATA_DIR, 'schema.pb')
schema = (
    Schema()
    .from_proto_text(SCHEMA_PATH)
    .select_by_name(['user_session', 'product_id-list_seq'])
)

In [4]:
# Read the training parquet file found under OUTPUT_DIR/1 and keep only its
# first 100 rows, renumbering the index from zero.
train_paths = os.path.join(OUTPUT_DIR, "1/train.parquet")
train_df = (
    pd.read_parquet(train_paths)
    .iloc[:100]
    .reset_index(drop=True)
)

In [5]:
# Alias the trimmed training frame. Later cells rebind `df` to tf.data
# datasets, so `train_df` remains the name that always refers to the
# DataFrame.
df = train_df

In [6]:
# def iterate_over_df(
#     df: pd.DataFrame
# ):  
#         def caller():
#             for i in range(len(df)):
#                 df_dictionary = {}
#                 for column in df.columns:
#                     df_dictionary[column] = df[column][i]
#                 yield df_dictionary
#         return caller

def iterate_over_df(
    df: pd.DataFrame,
    columns=('user_session', 'product_id-list_seq'),
):
    """Return a zero-argument generator function over the rows of ``df``.

    The returned callable can be invoked repeatedly; each call starts a fresh
    pass over ``df`` and yields one tuple per row holding the values of
    ``columns`` in the given order.

    Args:
        df: Frame to iterate over. It must contain every name in ``columns``.
        columns: Column names to emit, in output order. Defaults to the
            session id followed by the product-id sequence.

    Returns:
        A callable taking no arguments that returns a generator of tuples.

    Raises:
        KeyError: Raised when the generator is first advanced if a requested
            column is missing from ``df``.
    """
    column_names = list(columns)

    def caller():
        # Iterate the column Series in lock-step rather than using
        # DataFrame.iterrows(): iterrows builds a Series for every row and
        # does not preserve dtypes across the row, so integer ids could be
        # upcast when the selected columns have mixed numeric dtypes.
        yield from zip(*(df[name] for name in column_names))
    return caller

In [7]:
# def ds_from_df(
#     df: pd.DataFrame
# ):
#     output_shape_x = ({
#         'user_session':(1,)
#     })
#     df = tf.data.Dataset.from_generator(
#         iterate_over_df(df),
#         # output_types=((tf.int32), tf.int32),
#         # output_shapes = (output_shape_x, tf.TensorShape([]))
#         output_types=(tf.int32),
#         output_shapes = (output_shape_x)
#     )
#     return df
def ds_from_df(
    df: pd.DataFrame
):
    """Wrap ``df`` in a ``tf.data.Dataset`` backed by a Python generator.

    Each dataset element is a 2-tuple produced by ``iterate_over_df(df)``:
    a scalar int32 tensor followed by a variable-length 1-D int32 tensor.

    Args:
        df: Frame handed to ``iterate_over_df``; it must provide the columns
            that function reads.

    Returns:
        A ``tf.data.Dataset`` yielding ``(scalar, sequence)`` int32 pairs.
    """
    # `output_signature` replaces the deprecated `output_types` /
    # `output_shapes` arguments and describes the same element structure.
    element_spec = (
        tf.TensorSpec(shape=(), dtype=tf.int32),
        tf.TensorSpec(shape=(None,), dtype=tf.int32),
    )
    return tf.data.Dataset.from_generator(
        iterate_over_df(df),
        output_signature=element_spec,
    )

In [8]:
# Build the dataset from `train_df` rather than from `df` itself, so that
# re-running this cell never feeds an already-converted Dataset back into
# ds_from_df (on a fresh run `df` is `train_df`, so the result is the same).
df = ds_from_df(train_df)

2021-12-06 15:20:41.908675: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
def batch_dataset(
        df,
        batch_size: int,
        max_sequence_length: int = 20,
        shuffle_buffer_size: int = 5,
):
    """Shuffle, pad and batch a dataset of ``(scalar, sequence)`` pairs.

    Args:
        df: Despite the name, a ``tf.data.Dataset`` whose elements are
            2-tuples of a scalar and a 1-D sequence (as built by
            ``ds_from_df``).
        batch_size: Number of elements per batch. An incomplete final batch
            is dropped.
        max_sequence_length: Length every sequence is zero-padded to.
            Sequences are expected to be no longer than this; see the
            ``padded_batch`` documentation for what happens otherwise.
        shuffle_buffer_size: Size of the shuffle buffer. The default of 5
            only reorders elements locally.

    Returns:
        The shuffled, padded, batched and prefetched dataset.
    """
    df = df.shuffle(shuffle_buffer_size)
    df = df.padded_batch(
        batch_size,
        # The scalar component needs no padding; the sequence component is
        # padded with zeros up to `max_sequence_length`.
        padded_shapes=([], [max_sequence_length]),
        padding_values=(0, 0),
        drop_remainder=True,
    )
    df = df.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return df

In [10]:
# Rebuild the unbatched dataset from `train_df` here so this cell is safe to
# re-run: batching `df` in place would batch an already-batched dataset on a
# second execution.
df = batch_dataset(ds_from_df(train_df), batch_size=100)

In [14]:
# Materialise every batch of the dataset into a Python list.
df_list = [batch for batch in df]

In [16]:
# Label the components of the first batch with their column names, pairing
# them by position.
cols = ['user_session','product_id-list_seq']
batch_dictionary = {
    cols[position]: component
    for position, component in enumerate(df_list[0])
}
print(batch_dictionary)

{'user_session': <tf.Tensor: shape=(100,), dtype=int32, numpy=
array([ 163,   58,   39,   24,  169,  104,  204,  180,  212,  208,  250,
        278,  190,  299,  328,  281,  327,  455,  406,  203,  405,  464,
        472,  476,  348,  579,  485,  460,  716,  731,  625,  697,  752,
        734,  805,  753,  824,  681,  845,  732,  847,  791,  853,  851,
        878,  921,  960,  963,  985,  919,  986, 1025,  939, 1047,  936,
       1159, 1032,  882, 1170, 1168, 1212,  995, 1216, 1244, 1205, 1105,
       1304, 1256, 1230, 1308, 1354, 1177, 1322, 1264, 1400, 1357, 1406,
       1366, 1388, 1438, 1490, 1380, 1499, 1470, 1565, 1599, 1495, 1402,
       1582, 1419, 1687, 1618, 1610, 1693, 1611, 1766, 1689, 1716, 1742,
       1670], dtype=int32)>, 'product_id-list_seq': <tf.Tensor: shape=(100, 20), dtype=int32, numpy=
array([[ 54343,  91623, 113630, ...,      0,      0,      0],
       [   131,    895,  29351, ...,      0,      0,      0],
       [  9741,   9519,      0, ...,      0,      0,   

In [19]:
# `dataset` starts out as the feature dict of a single batch; a later cell
# rebinds the same name to a tf.data.Dataset built from these features.
dataset = batch_dictionary

In [20]:
# One random label per row for 100 rows: integers drawn below maxval=2, then
# cast to float32. No seed is set, so the labels differ between runs.
targets = {
    "target": tf.cast(
        tf.random.uniform(shape=(100,), maxval=2, dtype=tf.int32),
        tf.float32,
    )
}


In [21]:
# Slice the feature dict and the labels row by row and regroup them into
# batches of 50. The features are read from `batch_dictionary` (which
# `dataset` aliases on a fresh run) so that this cell stays re-runnable after
# `dataset` has been rebound to the resulting Dataset.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((batch_dictionary, targets))
    .batch(50, drop_remainder=True)
)

In [24]:
# Input block derived from the schema, configured with 'causal' masking.
# NOTE(review): `sequence_length` should stay equal to the padded sequence
# width (20) used when batching — confirm if either value changes.
sequence_length = 20
inputs = tr.TabularSequenceFeatures.from_schema(
    schema, masking='causal', max_sequence_length=sequence_length
)

In [25]:
# Model body: the sequence input block followed by MLPBlock([64]).
body = tr.SequentialBlock([inputs, tr.MLPBlock([64])])
# The labels are already part of `dataset`, so `targets` is deliberately not
# re-sampled here: drawing fresh random labels would leave the variable out
# of sync with the labels the model is actually trained on.
model = tr.BinaryClassificationTask("target").to_model(body, inputs)
model.compile(optimizer="adam", run_eagerly=False)

In [29]:
# Train for three epochs on `dataset`.
# NOTE(review): if this is the standard Keras `fit`, the return value is a
# History object rather than a plain list of losses — confirm.
losses = model.fit(dataset,epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3
