In [None]:
!pip install tensorflow_recommenders
!pip install scann
from typing import Dict, Text
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from google.colab import drive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive so the project files are reachable.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive folder used below for the input CSV files and the exported index.
main_path = "/content/drive/MyDrive/Colab Notebooks/Msc2/DAT-901"

In [None]:
# Load the raw inputs: purchases feed the query (client) side, the item list
# feeds the candidate (libelle) side. The first CSV column becomes the index.
purchases_df = pd.read_csv(
    f"{main_path}/all_purchases.csv",
    index_col=0,
)
items_df = pd.read_csv(
    f"{main_path}/all_unique_libelles.csv",
    index_col=0,
)

In [None]:
# Distinct client ids and item labels; these seed the lookup vocabularies.
unique_clients = purchases_df["CLI_ID"].unique()
unique_libelles = items_df["LIBELLE"].unique()

In [None]:
# Turn each DataFrame into a tf.data pipeline that yields one dict per row,
# keyed by column name, with prefetching tuned automatically.
purchases = (
    tf.data.Dataset
    .from_tensor_slices(dict(purchases_df))
    .prefetch(tf.data.AUTOTUNE)
)
items = (
    tf.data.Dataset
    .from_tensor_slices(dict(items_df))
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Peek at the first five purchase records as plain numpy values.
for x in purchases.take(5).as_numpy_iterator():
    print(x)

{'LIBELLE': b'GD JDM4 PAMPLEMOUSSE FL 200ML', 'CLI_ID': 1490281}
{'LIBELLE': b'GD JDM4 PAMPLEMOUSSE FL 200ML', 'CLI_ID': 1490281}
{'LIBELLE': b'CR JR PARF BIO.SPE AC.SENT.50ML', 'CLI_ID': 1490281}
{'LIBELLE': b'EAU MICELLAIRE 3 THES FL200ML', 'CLI_ID': 1490281}
{'LIBELLE': b'GD JDM4 TIARE FL 200ML', 'CLI_ID': 1490281}


In [None]:
# Peek at the first five item records as plain numpy values.
for x in items.take(5).as_numpy_iterator():
    print(x)

{'LIBELLE': b'GD JDM4 PAMPLEMOUSSE FL 200ML'}
{'LIBELLE': b'CR JR PARF BIO.SPE AC.SENT.50ML'}
{'LIBELLE': b'EAU MICELLAIRE 3 THES FL200ML'}
{'LIBELLE': b'GD JDM4 TIARE FL 200ML'}
{'LIBELLE': b'EDT  UN MATIN AU JARDIN  100ML  MUGUET'}


In [None]:
# Keep only the two features the towers consume from each purchase row.
purchases = purchases.map(
    lambda row: {"LIBELLE": row["LIBELLE"], "CLI_ID": row["CLI_ID"]},
    num_parallel_calls=tf.data.AUTOTUNE,
)
# Candidates are reduced to the bare label tensor.
items = items.map(
    lambda row: row["LIBELLE"],
    num_parallel_calls=tf.data.AUTOTUNE,
)

In [None]:
# Seeded, one-off shuffle (not redone per iteration) so the split below is stable.
tf.random.set_seed(42)
shuffled = purchases.shuffle(
    7_500_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 7,000,000 records train the model; the following 500,000 are held out.
# NOTE(review): these sizes are hard-coded — confirm they match the dataset's row count.
train = shuffled.take(7_000_000)
test = shuffled.skip(7_000_000).take(500_000)

In [None]:
# Lookup layers mapping raw client ids / item labels to contiguous integer indices.
# The stable `tf.keras.layers` entry points are used instead of the deprecated
# `experimental.preprocessing` aliases, which newer Keras releases no longer ship.
clients_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None, vocabulary=unique_clients)
libelles_vocabulary = tf.keras.layers.StringLookup(mask_token=None, vocabulary=unique_libelles)

In [None]:
# Width of the embedding vectors produced by both towers.
embedding_dimension = 64

In [None]:
# Query tower: client id -> vocabulary index -> embedding vector.
client_model = tf.keras.Sequential([
    clients_vocabulary,
    tf.keras.layers.Embedding(
        clients_vocabulary.vocabulary_size(),
        embedding_dimension,
    ),
])
# Candidate tower: item label -> vocabulary index -> embedding vector.
libelle_model = tf.keras.Sequential([
    libelles_vocabulary,
    tf.keras.layers.Embedding(
        libelles_vocabulary.vocabulary_size(),
        embedding_dimension,
    ),
])

In [None]:
# Top-K retrieval metrics, scored against every item embedded by the candidate tower
# (items are embedded in batches of 200).
metrics = tfrs.metrics.FactorizedTopK(candidates=items.batch(200).map(libelle_model))

In [None]:
# Retrieval task bundling the loss with the top-K metrics defined above.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [None]:
class TowerModel(tfrs.Model):
  """Two-tower retrieval model: a client (query) tower and a libelle (candidate) tower."""

  def __init__(self, client_model, libelle_model, retrieval_task=None):
    """Store the two towers and the task used to compute the loss.

    Args:
      client_model: Keras model applied to the "CLI_ID" feature.
      libelle_model: Keras model applied to the "LIBELLE" feature.
      retrieval_task: Optional task layer called with both embeddings. When
        omitted, the notebook-level `task` variable is used, which keeps
        existing two-argument constructions working.
    """
    super().__init__()
    self.libelle_model: tf.keras.Model = libelle_model
    self.client_model: tf.keras.Model = client_model
    # Prefer the injected task; only fall back to the global when none is given,
    # so the model no longer depends on hidden notebook state by default.
    self.task: tf.keras.layers.Layer = task if retrieval_task is None else retrieval_task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embed both sides of a batch and return the task's result for them.

    Args:
      features: Mapping that must contain the "CLI_ID" and "LIBELLE" entries.
      training: Accepted for interface compatibility; not used here.
        NOTE(review): the task is always called with its default arguments, so
        any metrics it computes are evaluated during training too — confirm
        this is intended.

    Returns:
      Whatever `self.task` returns for the (client, libelle) embedding pair.
    """
    client_embeddings = self.client_model(features["CLI_ID"])
    libelle_embeddings = self.libelle_model(features["LIBELLE"])
    return self.task(client_embeddings, libelle_embeddings)

In [None]:
# Assemble the two-tower model and attach an Adagrad optimizer.
model = TowerModel(client_model, libelle_model)
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(
        learning_rate=0.2,
        initial_accumulator_value=0.1,
        epsilon=1e-07,
    )
)

In [None]:
# Batch both splits by 200 and cache the batched result.
# NOTE(review): cache() sits after shuffle(), so later epochs presumably replay
# the batch order produced on the first pass — confirm this is intended.
cached_train = (
    train
    .shuffle(7_000_000)
    .batch(200)
    .cache()
)
cached_test = (
    test
    .batch(200)
    .cache()
)

In [None]:
# Stop training once the monitored 'loss' has not improved for 2 epochs.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="loss",
    patience=2,
)

In [None]:
# Train for at most 3 epochs on the cached training batches.
model.fit(
    cached_train,
    epochs=3,
    callbacks=[early_stop],
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f006fa60e20>

In [None]:
# Score the held-out batches; results come back as a metric-name -> value dict.
model.evaluate(
    cached_test,
    return_dict=True,
)



{'factorized_top_k/top_1_categorical_accuracy': 0.026531226933002472,
 'factorized_top_k/top_5_categorical_accuracy': 0.13694903254508972,
 'factorized_top_k/top_10_categorical_accuracy': 0.212457537651062,
 'factorized_top_k/top_50_categorical_accuracy': 0.4146471619606018,
 'factorized_top_k/top_100_categorical_accuracy': 0.5111150741577148,
 'loss': 463.2264099121094,
 'regularization_loss': 0,
 'total_loss': 463.2264099121094}

In [None]:
# Build a ScaNN index that takes raw client ids as queries (via the client tower)
# and is populated with (label, embedding) pairs for the items, 100 per batch.
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.client_model)
scann_index.index_from_dataset(
    tf.data.Dataset.zip((
        items.batch(100),
        items.batch(100).map(model.libelle_model),
    ))
)

<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7f006fa60bb0>

In [None]:
# Query the index for one known client and show the five best-ranked labels.
_, rec = scann_index(np.array([1490281]))
print(f"Recommendations for user 1490281: {rec[0, :5]}")

Recommendations for user 1490281: [b'EAU MICELLAIRE 3 THES FL200ML' b'GD FL200ML JDM PAMPLEMOUSSE'
 b'GD JDM4 CIT VERT FL 200ML' b'GD JDM4 ORANGE FL 200ML'
 b'CR ANT/ROUG B/SP AC.SENS P50']


In [None]:
# Export the index as a SavedModel; the "Scann" op namespace has to be
# whitelisted for the custom ops to be serialized.
tf.saved_model.save(
    scann_index,
    f'{main_path}/retrieval__model__cli_id',
    options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"]),
)

