In [1]:
from datalib import Datastore, Fields
import pandas as pd
import random
import numpy as np

In [2]:
data = Datastore().load_from_dir("data/sample42/")

Loaded data/sample42/


In [3]:
import tensorflow_recommenders as tfrs
import tensorflow as tf

In [4]:
unique_user_ids = np.asarray(data.customers.customer_id.values)
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32)
])

2022-03-03 12:14:44.826456: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-03 12:14:44.830476: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-03 12:14:44.830831: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-03 12:14:44.831653: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [5]:
user_model.compile()

ValueError: This model has not yet been built. Build the model first by calling `build()` or by calling the model on a batch of data.

In [7]:
unique_movie_titles = np.unique(list(data.transactions.article_id.values))
movies = tf.data.Dataset.from_tensor_slices(unique_movie_titles)
article_model = tf.keras.Sequential([
    tf.keras.layers.IntegerLookup(
      vocabulary=unique_movie_titles, mask_token=None),
  tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32)
]) 

In [8]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies.batch(128).map(article_model)
)

In [9]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [10]:
class MovielensModel(tfrs.Model):

  def __init__(self, user_model, movie_model, task):
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features, training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features['customer_id'])
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_movie_embeddings = self.movie_model(features['article_id'])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_movie_embeddings)

In [11]:
model = MovielensModel(user_model, article_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [12]:
train_customer_id = data.transactions.customer_id.values
train_article = np.asarray(data.transactions.article_id.values, dtype=np.int32)

In [13]:
train_ds = tf.data.Dataset.from_tensor_slices(dict(data.transactions[['customer_id', 'article_id']]))

In [14]:
for elem in train_ds.take(3):
    print(elem)

{'customer_id': <tf.Tensor: shape=(), dtype=string, numpy=b'001fd23db1109a94bba1319bb73df0b479059027c182da490e1161b34cd3af61'>, 'article_id': <tf.Tensor: shape=(), dtype=int64, numpy=631744002>}
{'customer_id': <tf.Tensor: shape=(), dtype=string, numpy=b'001fd23db1109a94bba1319bb73df0b479059027c182da490e1161b34cd3af61'>, 'article_id': <tf.Tensor: shape=(), dtype=int64, numpy=562252035>}
{'customer_id': <tf.Tensor: shape=(), dtype=string, numpy=b'00708c3da4d07706d4cad77c6aecc1b1ce33d21d73022ca12737c91d85bff070'>, 'article_id': <tf.Tensor: shape=(), dtype=int64, numpy=255396006>}


In [15]:
cached_train = train_ds.batch(128).cache()

In [16]:
for i in cached_train.take(1):
    print(i)

{'customer_id': <tf.Tensor: shape=(128,), dtype=string, numpy=
array([b'001fd23db1109a94bba1319bb73df0b479059027c182da490e1161b34cd3af61',
       b'001fd23db1109a94bba1319bb73df0b479059027c182da490e1161b34cd3af61',
       b'00708c3da4d07706d4cad77c6aecc1b1ce33d21d73022ca12737c91d85bff070',
       b'00708c3da4d07706d4cad77c6aecc1b1ce33d21d73022ca12737c91d85bff070',
       b'00708c3da4d07706d4cad77c6aecc1b1ce33d21d73022ca12737c91d85bff070',
       b'007ea762fbbebfad288a49f1d8f7c2c0c62b189a8816fa6968e60f2c197ce938',
       b'00b6ec8613e51d8eadc5157f5a12ae1366ca29da7d44cc8d0ae1e667dea6f268',
       b'00b6ec8613e51d8eadc5157f5a12ae1366ca29da7d44cc8d0ae1e667dea6f268',
       b'00b6ec8613e51d8eadc5157f5a12ae1366ca29da7d44cc8d0ae1e667dea6f268',
       b'00d316a9b10b3d21094e4931f41a42ec7d093bbbd53c7b656ad830be8e6d1565',
       b'00d316a9b10b3d21094e4931f41a42ec7d093bbbd53c7b656ad830be8e6d1565',
       b'00d781e94d9a533dceb85a8d50a898d5987ad9921ea8c6945c247be7220bdd81',
       b'00d781e94d9a533d

2022-03-03 12:15:18.744763: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [17]:
model.fit(cached_train)

 1743/24708 [=>............................] - ETA: 2:50:17 - factorized_top_k/top_1_categorical_accuracy: 4.4822e-06 - factorized_top_k/top_5_categorical_accuracy: 0.0086 - factorized_top_k/top_10_categorical_accuracy: 0.0129 - factorized_top_k/top_50_categorical_accuracy: 0.0296 - factorized_top_k/top_100_categorical_accuracy: 0.0414 - loss: 620.8128 - regularization_loss: 0.0000e+00 - total_loss: 620.8128

KeyboardInterrupt: 

In [26]:
np.unique(data.articles.product_type_name.values).shape

(131,)