# Efficient serving

In [1]:
from typing import Dict, Text

import os
import pprint
import tempfile

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

2022-11-09 05:32:16.400312: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-09 05:32:16.519851: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-09 05:32:16.525124: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl/lib:/usr/local/nvidia/lib64:/usr/local/lib/R/lib/
2022-11-

In [2]:
import tensorflow_recommenders as tfrs

In [3]:
# Directory holding the on-disk tf.data caches created below.
# A dedicated directory from mkdtemp() replaces the previous
# `tempfile.NamedTemporaryFile().name` idiom: that idiom creates a temporary
# file whose object is never referenced, so the file is deleted straight away
# and only its (now unreserved) name is reused as the cache path.
cache_dir = tempfile.mkdtemp()

# Load the MovieLens 100K data.
ratings = tfds.load(
    "movielens/100k-ratings",
    split="train"
)

# Get the ratings data.
ratings = (ratings
           # Retain only the fields we need.
           .map(lambda x: {"user_id": x["user_id"], "movie_title": x["movie_title"]})
           # Cache for efficiency.
           .cache(os.path.join(cache_dir, "ratings"))
)

# Get the movies data.
movies = tfds.load("movielens/100k-movies", split="train")
movies = (movies
          # Retain only the fields we need.
          .map(lambda x: x["movie_title"])
          # Cache for efficiency.
          .cache(os.path.join(cache_dir, "movies")))

2022-11-09 05:32:18.757023: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl/lib:/usr/local/nvidia/lib64:/usr/local/lib/R/lib/
2022-11-09 05:32:18.757111: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl/lib:/usr/local/nvidia/lib64:/usr/local/lib/R/lib/
2022-11-09 05:32:18.757160: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl/lib:/usr/local/nvidia/lib64:/usr/local/lib/R/l

In [4]:
# Project the ratings down to the user id feature only.
user_ids = ratings.map(lambda rating: rating["user_id"])

# Read both features in batches of 1000, then flatten and de-duplicate them
# to obtain the vocabularies.
movie_title_batches = list(movies.batch(1000))
unique_movie_titles = np.unique(np.concatenate(movie_title_batches))

user_id_batches = list(user_ids.batch(1000))
unique_user_ids = np.unique(np.concatenate(user_id_batches))

2022-11-09 05:32:19.041837: W tensorflow/core/kernels/data/cache_dataset_ops.cc:296] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2022-11-09 05:32:21.229109: W tensorflow/core/kernels/data/cache_dataset_ops.cc:296] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [5]:
tf.random.set_seed(42)

# reshuffle_each_iteration=False keeps one fixed ordering across iterations,
# so the take/skip windows below always select the same examples.
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# The first 80,000 shuffled ratings are used for training, the next 20,000
# for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

### Model definition

In [6]:
class MovielensModel(tfrs.Model):
    """Retrieval model with one embedding tower for movies and one for users."""

    def __init__(self):
        """Build the movie tower, the user tower and the retrieval task."""
        super().__init__()
        embedding_dim = 32

        # Movie tower: title string -> vocabulary index -> embedding.
        movie_lookup = tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None)
        # One extra embedding row accounts for unknown tokens.
        movie_embedding = tf.keras.layers.Embedding(
            len(unique_movie_titles) + 1, embedding_dim)
        self.movie_model = tf.keras.Sequential([movie_lookup, movie_embedding])

        # User tower: user id string -> vocabulary index -> embedding.
        user_lookup = tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None)
        # One extra embedding row accounts for unknown tokens.
        user_embedding = tf.keras.layers.Embedding(
            len(unique_user_ids) + 1, embedding_dim)
        self.user_model = tf.keras.Sequential([user_lookup, user_embedding])

        # Candidates for the top-k metrics: (title, embedding) pairs for every
        # movie, produced by the movie tower in batches of 128.
        candidate_dataset = (
            movies
            .batch(128)
            .cache()
            .map(lambda title: (title, self.movie_model(title)))
        )
        top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_dataset)

        # The task computes the loss and the metrics.
        self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Embed a batch of (user_id, movie_title) pairs and return the task loss.

        Args:
            features: Mapping that provides the "user_id" and "movie_title"
                tensors for the batch.
            training: When True, metric computation is skipped
                (`compute_metrics` is passed as `not training`).

        Returns:
            The value returned by the retrieval task for this batch.
        """
        # Users go through the user tower first, then movies through the
        # movie tower.
        query_embeddings = self.user_model(features["user_id"])
        movie_titles = features["movie_title"]
        positive_embeddings = self.movie_model(movie_titles)

        return self.task(
            query_embeddings,
            positive_embeddings,
            candidate_ids=movie_titles,
            compute_metrics=not training,
        )

In [7]:
# Instantiate the model and attach an Adagrad optimizer.
model = MovielensModel()
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

In [8]:
# Train for three epochs on the training split, 8192 ratings per batch.
train_batches = train.batch(8192)
model.fit(train_batches, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff940222990>

In [9]:
# Evaluate on the held-out split and show the metrics as a dictionary.
test_batches = test.batch(8192)
model.evaluate(test_batches, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.001500000013038516,
 'factorized_top_k/top_5_categorical_accuracy': 0.009600000455975533,
 'factorized_top_k/top_10_categorical_accuracy': 0.02160000056028366,
 'factorized_top_k/top_50_categorical_accuracy': 0.11885000020265579,
 'factorized_top_k/top_100_categorical_accuracy': 0.23385000228881836,
 'loss': 28262.845703125,
 'regularization_loss': 0,
 'total_loss': 28262.845703125}

### Approximate prediction

In [10]:
# Exact top-k layer that uses the user model as its query model.
brute_force = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Index (title, embedding) pairs for every movie, embedded in batches of 128.
movie_index_data = movies.batch(128).map(
    lambda title: (title, model.movie_model(title))
)
brute_force.index_from_dataset(movie_index_data)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7ff9102becd0>

In [11]:
# Ask the brute-force index for the three best matches for user 42.
user_42 = np.array(["42"])
_, titles = brute_force(user_42, k=3)

print(f"Top recommendations: {titles[0]}")

Top recommendations: [b'Rudy (1993)' b"Kid in King Arthur's Court, A (1995)"
 b'Homeward Bound: The Incredible Journey (1993)']


In [12]:
# Time one top-3 lookup for a single user against the brute-force index.
%timeit _, titles = brute_force(np.array(["42"]), k=3)

1.31 ms ± 12.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
# Build a title dataset roughly 1,000 times larger than the real one: the
# real titles come first, followed by 1,000 repetitions of the movie batches
# whose titles are replaced by dummy values (tf.zeros_like).
real_title_batches = movies.batch(4096)
dummy_title_batches = (
    movies.batch(4096)
    .repeat(1_000)
    .map(lambda x: tf.zeros_like(x))
)
lots_of_movies = real_title_batches.concatenate(dummy_title_batches)

# Matching embeddings: the estimated embeddings of the real movies, followed
# by dummy embeddings obtained by scaling those embeddings element-wise with
# uniform random factors.
real_embedding_batches = movies.batch(4096).map(model.movie_model)
dummy_embedding_batches = (
    movies.batch(4096)
    .repeat(1_000)
    .map(lambda x: model.movie_model(x))
    .map(lambda x: x * tf.random.uniform(tf.shape(x)))
)
lots_of_movies_embeddings = real_embedding_batches.concatenate(dummy_embedding_batches)

In [14]:
# Brute-force layer without a query model, indexed on the enlarged
# (title, embedding) candidate set.
brute_force_lots = tfrs.layers.factorized_top_k.BruteForce()
brute_force_candidates = tf.data.Dataset.zip(
    (lots_of_movies, lots_of_movies_embeddings)
)
brute_force_lots.index_from_dataset(brute_force_candidates)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7ff9303a08d0>

In [15]:
# This index has no query model, so user 42 is embedded explicitly first.
user_42_embedding = model.user_model(np.array(["42"]))
_, titles = brute_force_lots(user_42_embedding, k=3)

print(f"Top recommendations: {titles[0]}")

Top recommendations: [b'Rudy (1993)' b"Kid in King Arthur's Court, A (1995)"
 b'Homeward Bound: The Incredible Journey (1993)']


In [16]:
# Time user embedding plus a top-3 lookup against the enlarged brute-force index.
%timeit _, titles = brute_force_lots(model.user_model(np.array(["42"])), k=3)

7.82 ms ± 157 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
# Approximate top-k layer (ScaNN) without a query model, indexed on the same
# enlarged candidate set as the brute-force layer.
scann = tfrs.layers.factorized_top_k.ScaNN(
    num_leaves_to_search=30,
    num_reordering_candidates=500,
)
scann_candidates = tf.data.Dataset.zip(
    (lots_of_movies, lots_of_movies_embeddings)
)
scann.index_from_dataset(scann_candidates)

2022-11-09 05:32:58.607233: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 100347
2022-11-09 05:32:58.687784: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 80.462599ms.


<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7ff9302fb610>

In [18]:
# Embed user 42 explicitly, then ask ScaNN for the three best matches.
user_42_embedding = model.user_model(np.array(["42"]))
_, titles = scann(user_42_embedding, k=3)

print(f"Top recommendations: {titles[0]}")

Top recommendations: [b'Rudy (1993)' b"Kid in King Arthur's Court, A (1995)"
 b'Homeward Bound: The Incredible Journey (1993)']


In [19]:
# Time user embedding plus a top-3 lookup against the ScaNN index.
%timeit _, titles = scann(model.user_model(np.array(["42"])), k=3)

2.2 ms ± 13.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Evaluating the approximation

In [20]:
# Override the existing streaming candidate source with the enlarged
# (titles, embeddings) candidate set.
model.task.factorized_metrics = tfrs.metrics.FactorizedTopK(
    candidates=tf.data.Dataset.zip((lots_of_movies, lots_of_movies_embeddings))
)
# Need to recompile the model for the changes to take effect.
model.compile()

# Time one evaluation pass over the test split; the metrics are kept as the
# brute-force baseline.
%time baseline_result = model.evaluate(test.batch(8192), return_dict=True, verbose=False)

CPU times: user 26min 4s, sys: 29.3 s, total: 26min 34s
Wall time: 1min 44s


In [21]:
# Use the ScaNN layer as the candidate source for the top-k metrics.
model.task.factorized_metrics = tfrs.metrics.FactorizedTopK(
    candidates=scann
)
# Recompile so that the replaced metrics take effect.
model.compile()

# ScaNN evaluation is described as more memory efficient, which would allow a
# much bigger batch size.
# NOTE(review): the batch size below is 8192, the same value used for the
# brute-force evaluation; no bigger batch is actually used here.
%time scann_result = model.evaluate(test.batch(8192), return_dict=True, verbose=False)

CPU times: user 20.8 s, sys: 1.08 s, total: 21.9 s
Wall time: 1.67 s


In [22]:
# Both result dictionaries are read with the same top-100 accuracy key.
top_100_key = "factorized_top_k/top_100_categorical_accuracy"
print(f"Brute force top-100 accuracy: {baseline_result[top_100_key]:.2f}")
print(f"ScaNN top-100 accuracy:       {scann_result[top_100_key]:.2f}")

Brute force top-100 accuracy: 0.15
ScaNN top-100 accuracy:       0.13


### Deploying the approximate model

In [23]:
# Display the dataset object to inspect the element spec of the embeddings.
lots_of_movies_embeddings

<ConcatenateDataset element_spec=TensorSpec(shape=(None, 32), dtype=tf.float32, name=None)>

In [24]:
# Rebuild the ScaNN layer with the user model as its query model, so that the
# saved model can be given raw features and return valid predictions.
scann = tfrs.layers.factorized_top_k.ScaNN(
    model.user_model,
    num_reordering_candidates=1000,
)
export_candidates = tf.data.Dataset.zip(
    (lots_of_movies, lots_of_movies_embeddings)
)
scann.index_from_dataset(export_candidates)

# The layer must be called once to set the shapes before it is saved.
_ = scann(np.array(["42"]))

# The "Scann" op namespace is whitelisted explicitly for saving.
save_options = tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])

# Save into a temporary directory and load the model back while the
# directory still exists.
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "model")
    tf.saved_model.save(scann, path, options=save_options)
    loaded = tf.saved_model.load(path)

2022-11-09 05:34:54.901860: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 100347
2022-11-09 05:34:54.977946: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 76.0116ms.


INFO:tensorflow:Assets written to: /tmp/tmpk26wunf1/model/assets


INFO:tensorflow:Assets written to: /tmp/tmpk26wunf1/model/assets


In [25]:
# Query the reloaded model with a raw user id and show the first three titles.
_, titles = loaded(tf.constant(["42"]))
top_three = titles[0][:3]

print(f"Top recommendations: {top_three}")

Top recommendations: [b'Rudy (1993)' b"Kid in King Arthur's Court, A (1995)"
 b'Homeward Bound: The Incredible Journey (1993)']


### Tuning ScaNN

In [26]:
# Brute force needs O(nq) space for q queries against n candidates, so the
# test queries are embedded and searched in groups of 1000 rather than all at
# once, which may run out of memory.
test_query_batches = test.batch(1000).map(lambda x: model.user_model(x["user_id"]))

# Keep only the titles (second element) of each exact top-10 result and stack
# the groups back together along the query axis.
titles_ground_truth = tf.concat(
    [brute_force_lots(queries, k=10)[1] for queries in test_query_batches],
    axis=0,
)

In [27]:
# Collect every test user id into a single 1-D array of strings.
test_user_id_batches = test.map(lambda x: x["user_id"]).batch(1000)
test_flat = np.concatenate(
    list(test_user_id_batches.as_numpy_iterator()),
    axis=0,
)

# ScaNN is much more memory efficient than brute force, so all 20000 test
# queries are processed in one call.
_, titles = scann(test_flat, k=10)

In [28]:
def compute_recall(ground_truth, approx_results):
    """Return the mean per-query recall of approximate top-k results.

    For every query, recall is the number of distinct items present in both
    the exact row and the approximate row, divided by the length of the exact
    row. The per-query values are then averaged.

    Args:
        ground_truth: Iterable of rows (one per query) with the exact results.
        approx_results: Iterable of rows with the approximate results, aligned
            with `ground_truth` row by row.

    Returns:
        The mean of the per-query recall values, as computed by `np.mean`.

    Raises:
        ValueError: If the two inputs do not contain the same number of rows.

    Note:
        `np.intersect1d` returns unique values, so an item that is repeated
        within a row counts once in the numerator but every time in the
        denominator.
    """
    from itertools import zip_longest

    # A plain zip() stops at the shorter input and would silently score only
    # part of the queries; the sentinel makes a length mismatch detectable.
    missing = object()
    per_query_recall = []
    for truth, approx in zip_longest(ground_truth, approx_results, fillvalue=missing):
        if truth is missing or approx is missing:
            raise ValueError(
                "ground_truth and approx_results must have the same number of rows"
            )
        per_query_recall.append(len(np.intersect1d(truth, approx)) / len(truth))
    return np.mean(per_query_recall)

In [29]:
# Recall of the current ScaNN results measured against the exact top-10.
recall = compute_recall(titles_ground_truth, titles)
print(f"Recall: {recall:.3f}")

Recall: 0.936


In [30]:
# Time a single-user top-10 lookup with the current ScaNN layer (1000 loops per run).
%timeit -n 1000 scann(np.array(["42"]), k=10)

2.09 ms ± 38.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [31]:
# Second configuration: 1000 leaves, 100 of them searched per query, and
# 1000 reordering candidates.
scann2 = tfrs.layers.factorized_top_k.ScaNN(
    model.user_model,
    num_leaves=1000,
    num_leaves_to_search=100,
    num_reordering_candidates=1000,
)
scann2_candidates = tf.data.Dataset.zip(
    (lots_of_movies, lots_of_movies_embeddings)
)
scann2.index_from_dataset(scann2_candidates)

# Query all test users at once and score against the exact top-10.
_, titles2 = scann2(test_flat, k=10)
recall2 = compute_recall(titles_ground_truth, titles2)

print(f"Recall: {recall2:.3f}")

2022-11-09 05:36:12.172058: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 100347
2022-11-09 05:36:12.374992: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 202.851223ms.


Recall: 0.970


In [32]:
# Time a single-user top-10 lookup with the scann2 configuration (1000 loops per run).
%timeit -n 1000 scann2(np.array(["42"]), k=10)

2.19 ms ± 37 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [33]:
# Third configuration: 1000 leaves, 70 of them searched per query, and
# 400 reordering candidates.
scann3 = tfrs.layers.factorized_top_k.ScaNN(
    model.user_model,
    num_leaves=1000,
    num_leaves_to_search=70,
    num_reordering_candidates=400,
)
scann3_candidates = tf.data.Dataset.zip(
    (lots_of_movies, lots_of_movies_embeddings)
)
scann3.index_from_dataset(scann3_candidates)

# Query all test users at once and score against the exact top-10.
_, titles3 = scann3(test_flat, k=10)
recall3 = compute_recall(titles_ground_truth, titles3)
print(f"Recall: {recall3:.3f}")

2022-11-09 05:36:49.175864: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 100347
2022-11-09 05:36:49.366556: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 190.614808ms.


Recall: 0.963


In [34]:
# Time a single-user top-10 lookup with the scann3 configuration (1000 loops per run).
%timeit -n 1000 scann3(np.array(["42"]), k=10)

1.98 ms ± 39.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
