This notebook builds and trains the initial model from the original data (first-time run). Real-time retraining on incoming data is implemented in `main.py`.

In [1]:
# All imports
import json
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs




In [2]:
# Load the raw activity catalogue and the user feedback log.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open("data_activities.json", "r", encoding="utf-8") as f:
    data_activities = json.load(f)

with open("data_feedback.json", "r", encoding="utf-8") as f:
    data_feedback = json.load(f)

# Vocabularies for the lookup layers, converted to bytes (tensorflow requires).
# The ids are sorted so the vocabulary order -- and therefore the embedding
# row assigned to each id -- is the same on every run; iterating a plain set
# of strings gives an order that can change between Python processes.
unique_activity_ids = np.array(
    [str.encode(act_id) for act_id in sorted({act["act_id"] for act in data_activities})]
)
unique_user_ids = np.array(
    [str.encode(user_id) for user_id in sorted({fb["user_id"] for fb in data_feedback})]
)

# cast activity data into TF tensor
activities_list = [act["act_id"] for act in data_activities]
activities = tf.data.Dataset.from_tensor_slices(activities_list)

# Map each act_id to its activity class once, instead of scanning the whole
# catalogue for every feedback row. `setdefault` keeps the FIRST catalogue
# entry when an act_id appears more than once.
act_class_by_id = {}
for act in data_activities:
    act_class_by_id.setdefault(act["act_id"], act.get("act_class"))

# cast feedback data into TF tensor
user_id_list = [entry["user_id"] for entry in data_feedback]
act_id_list = [entry["act_id"] for entry in data_feedback]
# from 'act_id' extract also the activity class (None when it cannot be resolved)
act_class_list = [act_class_by_id.get(entry["act_id"]) for entry in data_feedback]

# A None class cannot be converted to a tensor; fail here with a message that
# names the offending ids rather than inside `from_tensor_slices`.
unresolved_act_ids = sorted(
    {act_id for act_id, act_class in zip(act_id_list, act_class_list) if act_class is None}
)
if unresolved_act_ids:
    raise ValueError(
        "No 'act_class' found in data_activities.json for act_id(s): {}".format(
            unresolved_act_ids
        )
    )

feedback = tf.data.Dataset.from_tensor_slices(
    {
        "user_id": user_id_list,
        "act_id": act_id_list,
        "act_class": act_class_list,
    }
)

In [3]:
# Size of each user/activity embedding vector.
# Higher values will correspond to models that may be more accurate, but will
# also be slower to fit and more prone to overfitting.
EMBEDDING_DIMENSION = 32
# Number of rows per batch when batching the datasets.
BATCH_SIZE = 128
# Intended number of training epochs.
NUM_EPOCH = 3
# Total number of feedback rows; used as the shuffle buffer size and as the
# basis for the train/test split sizes.
NUM_SAMPLES = len(data_feedback)

In [4]:
# ---------- Train/test split ----------
# Fixed seed so the same rows land in train and test on every run.
SPLIT_SEED = 42

train_percent = 0.8  # 0.2 test

train_size = int(train_percent * NUM_SAMPLES)
test_size = NUM_SAMPLES - train_size

# The buffer covers the whole dataset (full shuffle). reshuffle_each_iteration
# is disabled so `take` and `skip` below operate on the same fixed order and
# the two splits stay disjoint.
shuffled = feedback.shuffle(
    buffer_size=NUM_SAMPLES,
    seed=SPLIT_SEED,
    reshuffle_each_iteration=False,
)

train = shuffled.take(train_size)
test = shuffled.skip(train_size).take(test_size)

In [5]:
# Sanity check: vocabulary sizes fed to the lookup/embedding layers.
print(f"Unique activities: {len(unique_activity_ids)}")
print(f"Unique users: {len(unique_user_ids)}")

Unique activities: 11
Unique users: 3


In [6]:
class ActivityModel(tfrs.models.Model):
    """Two-tower retrieval model matching users to activities.

    A user tower and an activity tower each map a raw string id to an
    ``EMBEDDING_DIMENSION``-wide vector; a ``tfrs.tasks.Retrieval`` task is
    applied to the pair of embeddings to produce the loss.

    Relies on the notebook-level names ``unique_activity_ids``,
    ``unique_user_ids``, ``activities``, ``EMBEDDING_DIMENSION`` and
    ``BATCH_SIZE``.
    """

    def __init__(self, retrieval_weight: float) -> None:
        """Build both towers and the retrieval task.

        Args:
            retrieval_weight: Factor the retrieval loss is multiplied by in
                ``compute_loss``.
        """
        super().__init__()

        # Activity tower: string id -> integer index -> embedding.
        # The embedding table has one row more than the vocabulary; that extra
        # row is for ids the lookup layer does not know (out-of-vocabulary).
        self.activity_model: tf.keras.layers.Layer = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_activity_ids, mask_token=None
                ),
                tf.keras.layers.Embedding(
                    len(unique_activity_ids) + 1, EMBEDDING_DIMENSION
                ),
            ]
        )
        # User tower: same structure, built on the user id vocabulary.
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_user_ids, mask_token=None
                ),
                tf.keras.layers.Embedding(
                    len(unique_user_ids) + 1, EMBEDDING_DIMENSION
                ),
            ]
        )

        # Top-K metrics are computed against every activity in the catalogue,
        # embedded with the activity tower. The candidates are batched with
        # the shared BATCH_SIZE constant rather than a separate literal.
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=activities.batch(BATCH_SIZE).map(self.activity_model)
            )
        )

        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        """Embed the users and activities of a batch.

        Args:
            features: Mapping that must contain the keys ``"user_id"`` and
                ``"act_id"``.

        Returns:
            A tuple ``(user_embeddings, activity_embeddings)``.
        """
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_id"])
        # And pick out the activity features and pass them into the activity model.
        activity_embeddings = self.activity_model(features["act_id"])

        return (user_embeddings, activity_embeddings)

    def compute_loss(
        self, features: Dict[Text, tf.Tensor], training=False
    ) -> tf.Tensor:
        """Return the weighted retrieval loss for one batch.

        Args:
            features: Batch passed straight to ``call``.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The retrieval task's loss multiplied by ``self.retrieval_weight``.
        """
        user_embeddings, activity_embeddings = self(features)

        retrieval_loss = self.retrieval_task(user_embeddings, activity_embeddings)

        return self.retrieval_weight * retrieval_loss

In [7]:
# Build the model (retrieval loss at full weight) and train it with Adagrad.
model = ActivityModel(retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Batch once and cache so later passes over the data reuse the prepared batches.
cached_train = train.shuffle(NUM_SAMPLES).batch(BATCH_SIZE).cache()
cached_test = test.batch(BATCH_SIZE).cache()

# Use the configured epoch count instead of repeating the literal here.
model.fit(cached_train, epochs=NUM_EPOCH)



Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x221c02e08d0>

In [8]:
# Score the trained model on the held-out batches; the result is a dict
# keyed by metric name, displayed as the cell output.
metrics = model.evaluate(x=cached_test, return_dict=True)
metrics



{'factorized_top_k/top_1_categorical_accuracy': 0.0,
 'factorized_top_k/top_5_categorical_accuracy': 1.0,
 'factorized_top_k/top_10_categorical_accuracy': 1.0,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 1.349341630935669,
 'regularization_loss': 0,
 'total_loss': 1.349341630935669}

In [9]:
# Print the layer-by-layer overview of the trained model (sub-models and parameter counts).
model.summary()

Model: "activity_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 32)                384       
                                                                 
 sequential_1 (Sequential)   (None, 32)                128       
                                                                 
 retrieval (Retrieval)       multiple                  1         
                                                                 
Total params: 513 (2.00 KB)
Trainable params: 512 (2.00 KB)
Non-trainable params: 1 (4.00 Byte)
_________________________________________________________________


In [10]:
def predicting(user, top_n=3):
    """Print the ``top_n`` best-scoring activities for ``user``.

    Builds a brute-force retrieval index over the whole activity catalogue
    from the notebook-level ``model`` and ``activities``, queries it with the
    user id and prints the ranked activity ids.

    Args:
        user: User identifier; converted with ``str()`` before the lookup.
        top_n: Number of recommendations to print. Values larger than the
            catalogue are capped at the catalogue size.

    Returns:
        None. The recommendations are written to stdout.
    """
    # Create a model that takes in raw query features (embedded with the
    # user tower) and recommends activities out of the entire activity dataset.
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
    index.index_from_dataset(
        tf.data.Dataset.zip(
            (
                activities.batch(BATCH_SIZE),
                activities.batch(BATCH_SIZE).map(model.activity_model),
            )
        )
    )

    # Ask the index for as many results as requested instead of relying on
    # its default k, which silently capped the output. Always request at
    # least one result; the slice below still honours the original top_n.
    k = max(top_n, 1)
    num_candidates = int(activities.cardinality().numpy())
    if num_candidates > 0:  # negative values signal unknown/infinite cardinality
        k = min(k, num_candidates)

    # Get recommendations.
    _, titles = index(tf.constant([str(user)]), k=k)

    print("Top {} recommendations for user {}:\n".format(top_n, user))
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print("{}. {}".format(i + 1, title.decode("utf-8")))

In [11]:
# Example query: five recommendations for user "U3".
predicting(user="U3", top_n=5)

Top 5 recommendations for user U3:

1. A4
2. A2
3. A5
4. A3
5. A7


In [12]:
# Export the trained model to ./trained_model.
# Workaround taken from:
# https://stackoverflow.com/questions/70358350/how-to-deal-with-tf-saved-model-savemodel-filepath-error
# The retrieval task is replaced by one without metrics before saving; the
# order below matters (swap the task, re-compile, then save).
# NOTE(review): after this cell `model` no longer reports top-K metrics --
# re-create the model before evaluating again.
model.retrieval_task = tfrs.tasks.Retrieval()  # Removes the metrics.
model.compile()
model.save("./trained_model")



INFO:tensorflow:Assets written to: ./trained_model\assets


INFO:tensorflow:Assets written to: ./trained_model\assets


In [13]:
# Round-trip check: reload the model saved to ./trained_model and display the
# resulting object as the cell output.
loaded_model = tf.keras.models.load_model("./trained_model")
loaded_model







<keras.src.saving.legacy.saved_model.load.ActivityModel at 0x221c28b6ed0>