In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

2022-04-04 14:09:19.676165: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-04 14:09:19.676292: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## 1. Prepare the dataset.

In [2]:
ratings = tfds.load('movielens/100k-ratings', split = 'train')

ratings = ratings.map(lambda x: {
    'movie_title' : x['movie_title'],
    'user_id' : x['user_id'],
    'user_rating' : x['user_rating']
})

2022-04-04 14:11:29.966623: W tensorflow/core/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata".


[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /home/debonair/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /home/debonair/tensorflow_datasets/movielens/100k-ratings/0.1.0.incomplete8ZGD0D/movielens-train.tfr…

[1mDataset movielens downloaded and prepared to /home/debonair/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m


2022-04-04 14:13:56.924387: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-04 14:13:56.924512: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-04 14:13:56.924582: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (debonair): /proc/driver/nvidia/version does not exist
2022-04-04 14:13:56.941349: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
## Split the train data by putting 80% of the ratings in the train set and 20% in the test set.
tf.random.set_seed(42)

shuffled = ratings.shuffle(100_000, seed = 42, reshuffle_each_iteration = False)
train = shuffled.take(80_000)
test = shuffled.take(20_000)

In [5]:
#Identify unique user ids and movie titles present in the data.
movie_titles = ratings.batch(1000000).map(lambda x: x['movie_title'])
user_ids = ratings.batch(1000000).map(lambda x : x['user_id'])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

2022-04-04 14:17:43.383037: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 24000000 exceeds 10% of free system memory.
2022-04-04 14:17:43.418807: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 24000000 exceeds 10% of free system memory.
2022-04-04 14:17:56.683720: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 24000000 exceeds 10% of free system memory.
2022-04-04 14:17:56.718714: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 24000000 exceeds 10% of free system memory.


In [9]:
unique_movie_titles

array([b"'Til There Was You (1997)", b'1-900 (1994)',
       b'101 Dalmatians (1996)', ..., b'Zeus and Roxanne (1997)',
       b'unknown', b'\xc3\x81 k\xc3\xb6ldum klaka (Cold Fever) (1994)'],
      dtype=object)

## 2. Model implementation.


### A model composed of multiple stacked dense layers is a relatively common architecture for ranking tasks.


In [24]:
class RankingModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        embedding_dimension = 32
        
        #Compute embeddings for users.
        self.user_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary = unique_user_ids, mask_token = None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])
        
        #Compute embeddings for movies.
        self.movie_embeddings =  tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary = unique_movie_titles, mask_token = None),
            tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
        ])
        
        #Compute predictions.
        self.ratings = tf.keras.Sequential([
            #Learn multiple dense layers.
            tf.keras.layers.Dense(256, activation = 'relu'),
            tf.keras.layers.Dense(64, activation = 'relu'),
            tf.keras.layers.Dense(1)
        ])
        
    def call(self, inputs):
        user_id, movie_title = inputs
        #The function takes in the user id and movie title as inputs.
        user_embedding = self.user_embeddings(user_id)
        movie_embedding = self.movie_embeddings(movie_title)
        
        return self.ratings(tf.concat([user_embedding, movie_embedding], axis = 1))

- The model takes in *user ids* and *movie titles* and outputs a predicted rating.
RankingModel()((['42', ['One flew Over the Cuckoo's Nest (1975)']))

## 3. Define the loss and metrics.

In [25]:
task = tfrs.tasks.Ranking(
    loss = tf.keras.losses.MeanSquaredError(),
    metrics = [tf.keras.metrics.RootMeanSquaredError()]
)

## 4. ***The full model.***

In [26]:
class MovieLensModel(tfrs.models.Model):
    def __init__(self):
        super().__init__()
        self.ranking_model : tf.keras.Model = RankingModel()
        self.task : tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss = tf.keras.losses.MeanSquaredError(),
            metrics = [tf.keras.metrics.RootMeanSquaredError()]
        )
        
    def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
        return self.ranking_model(
            (features['user_id'], features['movie_title']))
    
    def compute_loss(self, features: Dict[Text, tf.Tensor], training = False) -> tf.Tensor:
        labels = features.pop('user_rating')
        
        rating_predictions = self(features)
        
        #The task computes the loss and metrics.
        return self.task(labels = labels, predictions = rating_predictions)

In [27]:
model = MovieLensModel()
model.compile(optimizer = tf.keras.optimizers.Adagrad(learning_rate = 0.1))

In [29]:
#Shuffle, batch and cache the train and test data.
cached_train = train.shuffle(100000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [30]:
model.fit(cached_train, epochs = 5)

Epoch 1/5


2022-04-04 14:53:07.354101: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 59231 of 100000
2022-04-04 14:53:08.659730: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.
2022-04-04 14:53:08.659830: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:390] Filling up shuffle buffer (this may take a while): 1 of 100000
2022-04-04 14:53:08.908441: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:415] Shuffle buffer filled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f52a0a23850>

In [31]:
model.evaluate(cached_test, return_dict = True)



{'root_mean_squared_error': 1.0882740020751953,
 'loss': 1.1842567920684814,
 'regularization_loss': 0,
 'total_loss': 1.1842567920684814}

In [32]:
model.summary()

Model: "movie_lens_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 ranking_model_2 (RankingMod  multiple                 116641    
 el)                                                             
                                                                 
 ranking_4 (Ranking)         multiple                  0         
                                                                 
Total params: 116,641
Trainable params: 116,641
Non-trainable params: 0
_________________________________________________________________


In [33]:
from keras.utils.vis_utils import plot_model

In [35]:
plot_model(model, to_file = 'tfrs.svg', dpi = 40)

ValueError: Cannot embed the 'svg' image format

## 4. Test the ranking model.

In [None]:
test_ratings = {}
test_movie_titles = ['M*A*S*H (1970)', 'Dances with Wolves'