In [1]:
import json
from typing import Any

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [2]:
# Raw book records, annotated as a list of string-keyed dicts. The values are
# heterogeneous, hence `Any` (the builtin `any` is a function, not a type).
with open("books-cyan.json", "r", encoding="utf-8") as f:
    books: list[dict[str, Any]] = json.load(f)

# Interaction pairs; each entry is later indexed as [query, candidate].
with open("pairs-cyan.json", "r", encoding="utf-8") as f:
    pairs: list[list[int]] = json.load(f)

In [3]:
# Keep only the books that have both an ISBN-13 and a category, and reduce
# each surviving record to its ISBN rendered as a string.
# NOTE(review): this rebinds `books`, so re-running the cell without reloading
# the JSON fails on the now-missing "category" key.
books = [
    {"isbn13": str(record["isbn13"])}
    for record in books
    if record["isbn13"] is not None and record["category"] is not None
]

# One scalar string element per retained book, cached after the first pass.
books_dataset = tf.data.Dataset.from_tensor_slices(
    [record["isbn13"] for record in books]
).cache()

In [6]:
# Stringify the identifier pairs so they can be fed to the string lookup
# layers of the query/candidate models.
pairs_ndarray = np.array(pairs).astype(str)

# Fail early with a readable message instead of an IndexError (or an empty
# dataset that only breaks once training starts).
if pairs_ndarray.ndim != 2 or pairs_ndarray.shape[1] < 2:
    raise ValueError(
        "pairs must be a non-empty list of [query, candidate] entries; "
        f"got an array of shape {pairs_ndarray.shape}"
    )

# Slice the two columns directly rather than creating one eager tensor per
# pair and replaying them through a Python generator. Each element is still a
# dict of two scalar string tensors: {"query": ..., "candidate": ...}.
pairs_dataset = tf.data.Dataset.from_tensor_slices(
    {"query": pairs_ndarray[:, 0], "candidate": pairs_ndarray[:, 1]}
).cache()

In [5]:
# Sorted, duplicate-free array of ISBN strings; used as the vocabulary of the
# string lookup layers.
isbn_strings = np.array([str(entry["isbn13"]) for entry in books])
unique_isbns = np.unique(isbn_strings)

In [None]:
embedding_dimension = 32


def _build_isbn_tower(name: str) -> tf.keras.Sequential:
    """Return a Sequential model that maps ISBN strings to embedding vectors.

    The StringLookup layer converts each ISBN to an integer index using
    ``unique_isbns`` as its vocabulary (no mask token). The Embedding layer
    holds ``len(unique_isbns) + 1`` rows of width ``embedding_dimension``;
    the extra row is presumably for the lookup layer's out-of-vocabulary
    index.
    """
    return tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=unique_isbns, mask_token=None),
        tf.keras.layers.Embedding(len(unique_isbns) + 1, embedding_dimension),
    ], name=name)


# Two towers with the same architecture but independent weights.
query_model = _build_isbn_tower("query_model")
candidate_model = _build_isbn_tower("candidate_model")

In [None]:
# Candidate set for the top-K metric: every book ISBN, in batches of 128,
# passed through the candidate tower.
candidate_embedding_batches = books_dataset.batch(128).map(candidate_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embedding_batches)

# Retrieval task that reports the metric above.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [None]:
class BookModel(tfrs.Model):
    """Two-tower retrieval model over ISBN strings.

    Holds a query tower, a candidate tower and the task layer that turns the
    two sets of embeddings into a training loss.
    """

    def __init__(self, query_model, candidate_model, retrieval_task=None):
        """Store the two towers and the task used to compute the loss.

        Args:
            query_model: Model applied to ``features["query"]``.
            candidate_model: Model applied to ``features["candidate"]``.
            retrieval_task: Optional task layer called with the query and
                candidate embeddings. When omitted, the notebook-level
                ``task`` object is used, which was the only behaviour before
                this parameter existed.
        """
        super().__init__()
        self.query_model: tf.keras.Model = query_model
        self.candidate_model: tf.keras.Model = candidate_model
        # Prefer an explicitly injected task; fall back to the global one so
        # existing two-argument construction keeps working.
        self.task: tf.keras.layers.Layer = task if retrieval_task is None else retrieval_task

    def compute_loss(self, features: dict[str, tf.Tensor], training=False) -> tf.Tensor:
        """Return the task loss for one batch of query/candidate pairs.

        Args:
            features: Mapping with ``"query"`` and ``"candidate"`` entries,
                each fed to the corresponding tower.
            training: Accepted for interface compatibility; it is not
                forwarded to the towers or the task.

        Returns:
            The value returned by calling the task on the two embeddings.
        """
        query_embeddings = self.query_model(features["query"])
        candidate_embeddings = self.candidate_model(features["candidate"])

        return self.task(query_embeddings, candidate_embeddings)

In [None]:
# Adagrad with a learning rate of 0.1 drives the training of both towers.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)

model = BookModel(query_model=query_model, candidate_model=candidate_model)
model.compile(optimizer=optimizer)

In [None]:
# One-off shuffle of the individual pairs (buffer of 10000 elements). With
# reshuffle_each_iteration=False the order is the same on every iteration, and
# the batched result is cached on the next line, so no cache is needed here:
# an extra one would only keep a second copy of the data in memory.
shuffled = pairs_dataset.shuffle(10000, reshuffle_each_iteration=False)

# Batches of up to 1000 pairs, materialised once on the first pass.
train = shuffled.batch(1000).cache()

# Shuffles the order of the cached batches (buffer of 1000 batches); the
# contents of each batch stay fixed.
cached_train = train.shuffle(1000)

In [None]:
# Train for 20 epochs over the shuffled, pre-batched pairs.
model.fit(x=cached_train, epochs=20)

In [None]:
# Brute-force top-K layer that embeds incoming queries with the trained
# query tower.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)

# Pair each batch of 100 ISBN identifiers with the candidate-tower embeddings
# of that same batch, then load the pairs into the index.
isbn_batches = books_dataset.batch(100)
isbn_embedding_batches = isbn_batches.map(model.candidate_model)
index.index_from_dataset(
    tf.data.Dataset.zip((isbn_batches, isbn_embedding_batches))
)

In [None]:
# Query the index with one sample ISBN and print up to ten of the returned
# identifiers. NOTE(review): `titles` presumably holds ISBN strings, since
# the index was built from `books_dataset` — confirm.
sample_query = tf.constant(["9791196918033"])
_, titles = index(sample_query)
print(f"Related: {titles[0, :10]}")

In [None]:
# Export the retrieval index as a SavedModel under ./result.
tf.saved_model.save(obj=index, export_dir="./result")

In [None]:
# Reload the exported index and run it on a single ISBN to check that the
# SavedModel is callable.
query_isbn = "9788966263363"

loaded = tf.saved_model.load("./result")
query_tensor = tf.constant([query_isbn])
x = loaded(query_tensor)