In [1]:
#install tensorflow recommenders
!pip install -q tensorflow_recommenders
!pip install -q ScaNN

[K     |████████████████████████████████| 85 kB 2.4 MB/s 
[K     |████████████████████████████████| 10.9 MB 4.5 MB/s 
[?25h

In [2]:
#upload dataset
from google.colab import files
files.upload()

Saving rec_data.csv to rec_data.csv


{'rec_data.csv': b'User_Id,Class_Id,Order_Rating\r\n228,1,4\r\n263,28,4\r\n310,4,4\r\n72,27,1\r\n238,25,5\r\n183,28,5\r\n269,39,4\r\n271,28,2\r\n62,41,4\r\n346,45,1\r\n352,8,5\r\n250,38,3\r\n100,42,2\r\n234,36,3\r\n185,29,2\r\n21,2,1\r\n284,21,1\r\n71,47,3\r\n239,32,1\r\n6,48,2\r\n86,2,1\r\n117,12,2\r\n50,4,2\r\n198,14,5\r\n215,46,1\r\n186,32,2\r\n47,8,2\r\n113,42,1\r\n25,16,4\r\n317,9,1\r\n70,2,3\r\n323,10,1\r\n78,39,4\r\n198,23,4\r\n142,5,4\r\n148,19,3\r\n351,8,1\r\n243,29,3\r\n150,2,5\r\n27,17,3\r\n229,13,2\r\n81,19,5\r\n257,40,5\r\n127,30,5\r\n284,49,4\r\n342,45,5\r\n220,28,2\r\n330,38,3\r\n136,37,2\r\n112,26,4\r\n163,28,2\r\n222,40,5\r\n238,37,2\r\n88,44,5\r\n239,10,1\r\n233,7,4\r\n177,10,5\r\n165,11,4\r\n38,21,4\r\n312,40,4\r\n74,43,5\r\n142,13,3\r\n231,38,4\r\n263,34,3\r\n46,7,1\r\n6,35,5\r\n90,45,4\r\n78,16,3\r\n62,15,2\r\n99,36,5\r\n324,5,1\r\n326,38,5\r\n106,35,3\r\n301,32,5\r\n268,2,2\r\n25,38,1\r\n209,25,4\r\n81,46,5\r\n344,19,4\r\n286,21,3\r\n5,13,5\r\n124,13,5\r\n241,31,2

In [3]:
#preparing utilized library
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_recommenders as tfrs

import pandas as pd

In [4]:
#load the dataset
#make the User_Id and Class_Id column as string data type
#change Order_Rating column to float data type
dataframe = pd.read_csv("/content/rec_data.csv")
dataframe[["User_Id", "Class_Id"]] = dataframe[["User_Id", "Class_Id"]].astype(str)
dataframe[["Order_Rating"]] = dataframe[["Order_Rating"]].astype(float)
dataframe

Unnamed: 0,User_Id,Class_Id,Order_Rating
0,228,1,4.0
1,263,28,4.0
2,310,4,4.0
3,72,27,1.0
4,238,25,5.0
...,...,...,...
995,34,24,4.0
996,143,22,3.0
997,122,3,4.0
998,160,16,3.0


In [17]:
#group User_Id and Class_Id so there is no duplicate and calculate the mean for the Order_Rating
#Make dictionary with column name as key and the data as value
#Make the tensorslicedataset from the dictionary
#map the slicedataset with only data that we're gonna use(User_Id, Class_Id, Order_Rating)
inter_dict = dataframe.groupby(["User_Id", "Class_Id"])["Order_Rating"].agg(np.mean).reset_index()
inter_dict = {name: np.array(value) for name, value in inter_dict.items()}
interaction = tf.data.Dataset.from_tensor_slices(inter_dict)

#drop duplicate in Class_Id column
#make dictionary
#make tensorslicedataset from dictionary
#map the slicedataset
prod_dict = dataframe[["Class_Id"]].drop_duplicates()
prod_dict = {name: np.array(value) for name, value in prod_dict.items()}
product = tf.data.Dataset.from_tensor_slices(prod_dict)

interaction = interaction.map(lambda x: {
    "User_Id": x["User_Id"],
    "Class_Id": x["Class_Id"],
    "Order_Rating": float(x["Order_Rating"])
})

product = product.map(lambda x: x["Class_Id"])

In [18]:
#take User_Id and Class_Id from mapdataset
user_ids = interaction.batch(100).map(lambda x: x["User_Id"])
product_ids = interaction.batch(100).map(lambda x: x["Class_Id"])
#make a list out of unique user_ids and product_ids
unique_user = np.unique(np.concatenate(list(user_ids)))
unique_product = np.unique(np.concatenate(list(product_ids)))

In [27]:
#set the training and testing dataset and shuffle it
tf.random.set_seed(42)
shuffled = interaction.shuffle(100, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(800)
test = shuffled.skip(800).take(164)

cached_train = train.shuffle(100).batch(100)
cached_test = test.batch(10).cache()


In [32]:
class Model(tfrs.models.Model):
  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    super().__init__()

    embed_dim = 32
    #define product model, map string to indices and turn them into dense vectors with 32 dimension
    self.product_model: tf.keras.layers.Layer = tf.keras.Sequential([
          tf.keras.layers.StringLookup(vocabulary = unique_product, mask_token = None),
          tf.keras.layers.Embedding(len(unique_product) + 1, embed_dim)
    ])
    #define user model
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
          tf.keras.layers.StringLookup(vocabulary = unique_user, mask_token = None),
          tf.keras.layers.Embedding(len(unique_user) + 1, embed_dim)
    ])
    #define rating model with dense layer with one output for rating prediction
    self.rating_model = tf.keras.Sequential([
          tf.keras.layers.Dense(256, activation="relu"),
          tf.keras.layers.Dense(128, activation="relu"),
          tf.keras.layers.Dense(64, activation="relu"),
          tf.keras.layers.Dense(1),
        ])
    #define metrics for retrieval task with factorizedtopk
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
          metrics = tfrs.metrics.FactorizedTopK(
            candidates = product.batch(128).map(self.product_model), k = 50
        )
    )
    #define metrics and loss for rating task with RMSE
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
          loss = tf.keras.losses.MeanSquaredError(),
          metrics = [tf.keras.metrics.RootMeanSquaredError()]
    )

    self.retrieval_weight = retrieval_weight
    self.rating_weight = rating_weight
  ###function for predicting rating
  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    user_embeddings = self.user_model(features["User_Id"])
    product_embeddings = self.product_model(features["Class_Id"])
    return(
        user_embeddings, product_embeddings, self.rating_model(tf.concat([user_embeddings, product_embeddings], axis = 1),)
    )
  ###function for computing loss and metrics
  def compute_loss(self, features: Dict[Text, tf.Tensor], training = False) -> tf.Tensor:
    ratings = features.pop("Order_Rating")
    user_embeddings, product_embeddings, rating_predictions = self(features)

    retrieval_loss = self.retrieval_task(user_embeddings, product_embeddings)
    rating_loss = self.rating_task(
        labels = ratings,
        predictions = rating_predictions
    )
    return (self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss)

In [33]:
#compile the model with taking account rating and retrieval using gradient descent optmizer
model = Model(rating_weight=1, retrieval_weight=1)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [34]:
model.fit(cached_train, epochs=50)
metrics = model.evaluate(cached_test, return_dict=True)

print(f"top 5 accuracy: {metrics['factorized_top_k/top_5_categorical_accuracy']:.3f}.")
print(f"RMSE Ranking: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
top 5 accuracy: 0.085.
RMSE Ranking: 1.682.


In [35]:
#define the retrieval method and then save the model
retrieval = tfrs.layers.factorized_top_k.ScaNN(model.user_model, k= 3, num_leaves = 10)
retrieval.index_from_dataset(
    tf.data.Dataset.zip((product.batch(5), product.batch(5).map(model.product_model))))
_ = retrieval(np.array(["29"]))
tf.saved_model.save(retrieval, "/content/sample_data/Model", options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"]))



INFO:tensorflow:Assets written to: /content/sample_data/Model/assets


INFO:tensorflow:Assets written to: /content/sample_data/Model/assets


In [36]:
load = tf.saved_model.load("/content/sample_data/Model")

In [37]:
rating, suggestion = load(np.array(["110"]))
print(rating)
print(suggestion)

tf.Tensor([[8.153923  3.3021047 2.9373307]], shape=(1, 3), dtype=float32)
tf.Tensor([[b'27' b'32' b'50']], shape=(1, 3), dtype=string)
