In [None]:
!pip install tensorflow_recommenders
!pip install scann
from typing import Dict, Text
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from google.colab import drive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_recommenders
  Downloading tensorflow_recommenders-0.7.2-py3-none-any.whl (89 kB)
[K     |████████████████████████████████| 89 kB 3.4 MB/s 
Installing collected packages: tensorflow-recommenders
Successfully installed tensorflow-recommenders-0.7.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scann
  Downloading scann-1.2.9-cp38-cp38-manylinux_2_27_x86_64.whl (10.5 MB)
[K     |████████████████████████████████| 10.5 MB 5.0 MB/s 
Collecting tensorflow~=2.11.0
  Downloading tensorflow-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[K     |████████████████████████████████| 588.3 MB 17 kB/s 
Collecting tensorboard<2.12,>=2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 55.0 MB/s 
Collecting tensorflow-estimator<2.1

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Drive folder that the CSV inputs are read from and the exported index is saved to.
main_path="/content/drive/MyDrive/Colab Notebooks/Msc2/DAT-901"

In [None]:
# Show which TensorFlow version the runtime is using after the installs.
tf.__version__

'2.11.0'

In [None]:
# Load the two source tables from Drive; the first CSV column is used as the index.
# carts_df feeds the query (cart) model, items_df the candidate (item) model.
carts_df = pd.read_csv(f'{main_path}/all_carts.csv', index_col=0)
# Keep a single row per distinct LIBELLE so each label appears once among the candidates.
items_df = (
    pd.read_csv(f'{main_path}/all_libelles.csv', index_col=0)
    .drop_duplicates(subset="LIBELLE")
)

In [None]:
# Optionally restrict the data to the first MAX_CARTS distinct ticket ids (in order of first
# appearance), e.g. MAX_CARTS = 10_000 for a quick experiment. None keeps every ticket.
# NOTE(review): the shuffle/split sizes used further down (7,000,000 train / 500,000 test rows)
# look sized for the full dataset — adjust them if a cap is set here.
MAX_CARTS = None
first_carts = pd.unique(carts_df['TICKET_ID'])[:MAX_CARTS]
carts_df = carts_df.loc[carts_df['TICKET_ID'].isin(first_carts)]


In [None]:
def _frame_to_dataset(frame):
    """Turn a DataFrame into a prefetched tf.data.Dataset of per-row dicts keyed by column name."""
    columns = dict(frame)  # column name -> Series
    return tf.data.Dataset.from_tensor_slices(columns).prefetch(tf.data.AUTOTUNE)


# One dataset per table.
carts = _frame_to_dataset(carts_df)
items = _frame_to_dataset(items_df)

In [None]:
# Peek at the first five cart elements as plain NumPy values.
for cart_row in carts.take(5).as_numpy_iterator():
    print(cart_row)

{'TICKET_ID': 35592159, 'LIBELLE': b'GD JDM4 PAMPLEMOUSSE FL 200ML'}
{'TICKET_ID': 35592159, 'LIBELLE': b'GD JDM4 PAMPLEMOUSSE FL 200ML'}
{'TICKET_ID': 35592159, 'LIBELLE': b'CR JR PARF BIO.SPE AC.SENT.50ML'}
{'TICKET_ID': 35592159, 'LIBELLE': b'EAU MICELLAIRE 3 THES FL200ML'}
{'TICKET_ID': 35592159, 'LIBELLE': b'GD JDM4 TIARE FL 200ML'}


In [None]:
# Peek at the first five item elements as plain NumPy values.
for item_row in items.take(5).as_numpy_iterator():
    print(item_row)

{'LIBELLE': b'GD JDM4 PAMPLEMOUSSE FL 200ML'}
{'LIBELLE': b'CR JR PARF BIO.SPE AC.SENT.50ML'}
{'LIBELLE': b'EAU MICELLAIRE 3 THES FL200ML'}
{'LIBELLE': b'GD JDM4 TIARE FL 200ML'}
{'LIBELLE': b'EDT  UN MATIN AU JARDIN  100ML  MUGUET'}


In [None]:
def _select_cart_features(row):
    """Keep only the LIBELLE and TICKET_ID entries of a cart element."""
    return {
        "LIBELLE": row["LIBELLE"],
        "TICKET_ID": row["TICKET_ID"],
    }


def _select_label(row):
    """Reduce an item element to its LIBELLE value."""
    return row["LIBELLE"]


# Named functions are used instead of lambdas: AutoGraph's handling of lambdas is deprecated
# (https://github.com/tensorflow/tensorflow/issues/56089) and logs a warning when it is hit.
carts = carts.map(_select_cart_features, num_parallel_calls=tf.data.AUTOTUNE)
items = items.map(_select_label, num_parallel_calls=tf.data.AUTOTUNE)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [None]:
# Fix the global TensorFlow seed, then shuffle once with a fixed op-level seed.
# reshuffle_each_iteration=False keeps the shuffled order stable between iterations,
# so the take/skip slices below do not overlap.
tf.random.set_seed(42)
shuffle_buffer, n_train, n_test = 7_500_000, 7_000_000, 500_000
shuffled = carts.shuffle(shuffle_buffer, seed=42, reshuffle_each_iteration=False)

# First n_train elements train the model; the next n_test are held out.
train = shuffled.take(n_train)
test = shuffled.skip(n_train).take(n_test)

In [None]:
def _ticket_id_of(batch):
    """Pick the TICKET_ID entry out of a batched cart element."""
    return batch["TICKET_ID"]


# Walk both datasets in batches of 200 to collect every value they contain.
libelle_items = items.batch(200)
cart_ids = carts.batch(200).map(_ticket_id_of)

# The distinct labels / ticket ids become the lookup vocabularies.
unique_items = np.unique(np.concatenate(list(libelle_items)))
unique_carts = np.unique(np.concatenate(list(cart_ids)))

# Use the stable tf.keras.layers lookup classes; the experimental.preprocessing aliases
# are deprecated. mask_token=None means no vocabulary index is reserved for masking.
cart_id_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None, vocabulary=unique_carts)
libelles_vocabulary = tf.keras.layers.StringLookup(mask_token=None, vocabulary=unique_items)

In [None]:
# Output width of the Embedding layers in both the cart model and the label model.
embedding_dimension = 64

In [None]:
# Query tower: ticket id -> vocabulary index -> embedding vector.
cart_model = tf.keras.Sequential([
    cart_id_vocabulary,
    tf.keras.layers.Embedding(cart_id_vocabulary.vocabulary_size(), embedding_dimension),
])

# Candidate tower: product label -> vocabulary index -> embedding vector.
libelle_model = tf.keras.Sequential([
    libelles_vocabulary,
    tf.keras.layers.Embedding(libelles_vocabulary.vocabulary_size(), embedding_dimension),
])

In [None]:
# Run every label (in batches of 200) through the candidate tower; these embeddings are
# the candidate set handed to the FactorizedTopK metric.
candidate_embeddings = items.batch(200).map(libelle_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

In [None]:
# Retrieval task configured with the FactorizedTopK metrics object.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [None]:
class LibelleRecModel(tfrs.Model):
  """Retrieval model pairing a cart (query) tower with a label (candidate) tower.

  Args:
    cart_model: Keras model applied to the ``TICKET_ID`` feature.
    libelle_model: Keras model applied to the ``LIBELLE`` feature.
    retrieval_task: Optional task layer that turns the two embedding batches into a
      loss. When omitted, the module-level ``task`` object is used, so two-argument
      construction keeps working.
  """

  def __init__(self, cart_model, libelle_model, retrieval_task=None):
    super().__init__()
    self.libelle_model: tf.keras.Model = libelle_model
    self.cart_model: tf.keras.Model = cart_model
    # Prefer an explicitly injected task; otherwise fall back to the notebook-level
    # `task` so the class does not *require* hidden global state.
    self.task: tf.keras.layers.Layer = task if retrieval_task is None else retrieval_task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed both sides of a batch and pass the embeddings to the task.

    Args:
      features: Mapping that provides the ``"TICKET_ID"`` and ``"LIBELLE"`` entries.
      training: Accepted for interface compatibility; not used by this implementation.

    Returns:
      Whatever ``self.task`` returns for the (cart, label) embedding pair.
    """
    # NOTE(review): because `training` is ignored, the task is called the same way
    # during fit and evaluate — confirm that computing metrics while training is intended.
    cart_embeddings = self.cart_model(features["TICKET_ID"])
    libelle_embeddings = self.libelle_model(features["LIBELLE"])
    return self.task(cart_embeddings, libelle_embeddings)

In [None]:
# Assemble the model from the two towers.
model = LibelleRecModel(cart_model, libelle_model)

# Adagrad optimizer with its hyper-parameters spelled out explicitly.
optimizer = tf.keras.optimizers.Adagrad(
    learning_rate=0.5,
    initial_accumulator_value=0.1,
    epsilon=1e-07,
)
model.compile(optimizer=optimizer)

In [None]:
# Training stream: shuffle, group into batches of 200, then cache the result.
# NOTE(review): cache() comes after shuffle(), so the cached batches presumably replay in the
# same order on every epoch — confirm this is intended.
train_batches = train.shuffle(7_000_000).batch(200)
cached_train = train_batches.cache()

# Held-out stream: batched and cached, no shuffling.
cached_test = test.batch(200).cache()

In [None]:
# Early-stopping callback that watches the training 'loss' with a patience of 2.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)

In [None]:
# Train on the cached batches for at most 10 epochs; early_stop may end the run sooner.
model.fit(cached_train, epochs=10, callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f73f817ac70>

In [None]:
# Evaluate on the held-out batches; return_dict=True yields a metric-name -> value dict.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.018544163554906845,
 'factorized_top_k/top_5_categorical_accuracy': 0.1261231154203415,
 'factorized_top_k/top_10_categorical_accuracy': 0.1928544044494629,
 'factorized_top_k/top_50_categorical_accuracy': 0.34666138887405396,
 'factorized_top_k/top_100_categorical_accuracy': 0.42365652322769165,
 'loss': 899.8477172851562,
 'regularization_loss': 0,
 'total_loss': 899.8477172851562}

In [None]:
# ScaNN top-K layer that embeds incoming queries with the trained cart model.
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.cart_model)

# Index (label, label embedding) pairs, 100 labels per batch.
label_batches = items.batch(100)
label_embeddings = label_batches.map(model.libelle_model)
scann_index.index_from_dataset(tf.data.Dataset.zip((label_batches, label_embeddings)))

<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7f73f8571fd0>

In [None]:
# Query the index with a single ticket id and print the first five returned labels.
query_ticket = np.array([int("35592159")])
_, rec = scann_index(query_ticket)
print(f"Recommendations for user 35592159: {rec[0, :5]}")

Recommendations for user 35592159: [b'GD JDM4 PAMPLEMOUSSE FL 200ML' b'GLM JDM YLANG YLANG FP200ML'
 b'SERUM INT JEUNESSE ELX7.9 30ml' b'LAIT DEMAQ 3 THES FL200ML'
 b'GM FRUITS NOIRS 200 ML']


In [None]:
# Export the index as a SavedModel on Drive; ops in the "Scann" namespace are whitelisted for saving.
export_dir = f'{main_path}/retrieval__model__ticket_id'
save_options = tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])
tf.saved_model.save(scann_index, export_dir, options=save_options)

