In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import tensorflow_recommenders as tfrs
from typing import Dict, Text

In [2]:
# Read the ratings table from disk and print its leading rows as a sanity check.
df = pd.read_csv('movie_data.csv')
print(df.head(n=5))

   movie_id  user_id  movie_rating        date  release_year      movie_title
0         1  1488844             3  2005-09-06        2003.0  Dinosaur Planet
1         1   822109             5  2005-05-13        2003.0  Dinosaur Planet
2         1   885013             4  2005-10-19        2003.0  Dinosaur Planet
3         1    30878             4  2005-12-26        2003.0  Dinosaur Planet
4         1   823519             3  2004-05-03        2003.0  Dinosaur Planet


In [3]:
# Project the ratings table down to its two movie columns. The result still has
# one row per rating, so the same movie appears many times (see the preview).
movies_df = df.loc[:, ['movie_id', 'movie_title']]
movies_df.head()

Unnamed: 0,movie_id,movie_title
0,1,Dinosaur Planet
1,1,Dinosaur Planet
2,1,Dinosaur Planet
3,1,Dinosaur Planet
4,1,Dinosaur Planet


In [4]:
# User ids are fed to a StringLookup vocabulary later on, so store them as strings.
df['user_id'] = df['user_id'].astype(str)

# One dataset element per rating row.
ratings = tf.data.Dataset.from_tensor_slices(
    dict(df[['user_id', 'movie_title', 'movie_rating']])
)

# movies_df holds one row per *rating*, so every title is repeated once per
# rating it received. De-duplicate before building the candidate dataset;
# otherwise the retrieval metric has to score ~100M repeated candidates
# instead of the handful of thousands of distinct titles.
movies = tf.data.Dataset.from_tensor_slices(
    dict(movies_df[['movie_title']].drop_duplicates())
)

# Keep only the features the model consumes and make the rating a float32
# label. tf.cast is used instead of float(): the lambda runs on symbolic
# tensors inside the tf.data graph, where an explicit cast is the supported way
# to convert dtypes.
ratings = ratings.map(lambda row: {
    "movie_title": row["movie_title"],
    "user_id": row["user_id"],
    "movie_rating": tf.cast(row["movie_rating"], tf.float32),
})
movies = movies.map(lambda row: row["movie_title"])


In [5]:
# Report how many rating rows the dataset contains.
total_ratings = len(ratings)
print('Total Data: {}'.format(total_ratings))

Total Data: 100480507


In [6]:
tf.random.set_seed(21)

# reshuffle_each_iteration=False is requested so that the shuffled order stays
# the same every time the dataset is iterated; that is what makes the take/skip
# split below produce disjoint train and test sets.
shuffled = ratings.shuffle(100_000_000, seed=21, reshuffle_each_iteration=False)

# Split the *shuffled* dataset. Splitting `ratings` directly (as was done
# before) ignores the shuffle entirely, and because the source rows are ordered
# by movie, the test set would consist only of the movies at the end of the
# file, which the model never sees in training.
train = shuffled.take(80_000_000)
test = shuffled.skip(80_000_000).take(20_480_507)

In [7]:
def _unique_values(batched_dataset):
  """Return the sorted unique values found across all batches of a dataset."""
  return np.unique(np.concatenate(list(batched_dataset)))


# Large batches keep the number of Python-level iterations small while the
# datasets are pulled into NumPy.
movie_titles = movies.batch(1_000_000)
user_ids = ratings.batch(1_000_000).map(lambda batch: batch["user_id"])
print('created batch')

# Vocabularies consumed by the lookup layers of the model.
unique_movie_titles = _unique_values(movie_titles)
unique_user_ids = _unique_values(user_ids)
print('created unique')

print('Unique Movies: {}'.format(len(unique_movie_titles)))
print('Unique users: {}'.format(len(unique_user_ids)))

created batch
created unique
Unique Movies: 17297
Unique users: 480189


In [8]:
class MovieModel(tfrs.models.Model):
  """Multi-task recommender trained jointly on rating prediction and retrieval.

  A user tower and a movie tower each map a string id to a 64-dimensional
  embedding. The two embeddings feed (a) a small dense network that predicts
  the rating and (b) a retrieval task that scores users against movies. The
  final loss is a weighted sum of both task losses.

  Relies on the notebook-level `unique_movie_titles` and `unique_user_ids`
  arrays being defined before instantiation.
  """

  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    """Build the towers, the rating head and the two tasks.

    Args:
      rating_weight: Multiplier applied to the rating (ranking) loss.
      retrieval_weight: Multiplier applied to the retrieval loss.
    """
    # We take the loss weights in the constructor: this allows us to instantiate
    # several model objects with different loss weights.

    super().__init__()

    embedding_dimension = 64

    # User and movie models. The embedding tables get one extra row on top of
    # the vocabulary size for the lookup layer's out-of-vocabulary index.
    self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # A small model to take in user and movie embeddings and predict ratings.
    # We can make this as complicated as we want as long as we output a scalar
    # as our prediction.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(1),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    # Candidates are built from the de-duplicated title vocabulary, i.e. the
    # same array the movie tower's lookup uses. This guarantees each movie is
    # scored exactly once by the top-K metric, regardless of whether the raw
    # per-rating movie dataset contains repeated titles.
    candidate_titles = tf.data.Dataset.from_tensor_slices(unique_movie_titles)
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=candidate_titles.batch(128).map(self.movie_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    """Run both towers and the rating head.

    Args:
      features: Mapping that contains at least the "user_id" and "movie_title"
        entries.

    Returns:
      A tuple `(user_embeddings, movie_embeddings, rating_predictions)`.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model.
    movie_embeddings = self.movie_model(features["movie_title"])

    return (
        user_embeddings,
        movie_embeddings,
        # We apply the multi-layered rating model to a concatenation of
        # user and movie embeddings.
        self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the weighted sum of the rating and retrieval losses.

    Args:
      features: Mapping with "user_id", "movie_title" and "movie_rating".
        The mapping is not modified.
      training: Whether the loss is being computed for a training step. When
        True the retrieval top-K metrics are skipped, since they require
        scoring every candidate and are only needed for evaluation.

    Returns:
      `rating_weight * rating_loss + retrieval_weight * retrieval_loss`.
    """
    # Read the label and build the model inputs without mutating the caller's
    # dict (popping would silently remove the label for any later reuse).
    ratings = features["movie_rating"]
    model_inputs = {
        name: value for name, value in features.items()
        if name != "movie_rating"
    }

    user_embeddings, movie_embeddings, rating_predictions = self(model_inputs)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    retrieval_loss = self.retrieval_task(
        user_embeddings,
        movie_embeddings,
        compute_metrics=not training,
    )

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

In [None]:
# Equal weighting of the rating and retrieval objectives.
model = MovieModel(rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Shuffle and batch once, then cache so later epochs reuse the prepared batches.
cached_train = (
    train
    .shuffle(100_000_000)
    .batch(1_000_000)
    .cache()
)
cached_test = (
    test
    .batch(1_000_000)
    .cache()
)

model.fit(x=cached_train, epochs=3)

Epoch 1/3
