In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from typing import Dict, Text

In [2]:
# Locations of the input CSV files, relative to the notebook's working directory.
RATINGS_DATA_PATH = './data/ratings.csv'  # read by the "Load data" cell
USER_DATA_PATH = './data/users.csv'  # NOTE(review): not referenced in the cells visible here
ITEM_DATA_PATH = './data/movies.csv'  # read by the final ranking-evaluation cell

In [3]:
# Load the raw ratings and normalise the dtypes of the columns the model uses.
ratings_df = pd.read_csv(RATINGS_DATA_PATH)
ratings_df["user_id"] = ratings_df["user_id"].astype(str)  # StringLookup needs strings
ratings_df["title"] = ratings_df["title"].astype(str)  # StringLookup needs strings
ratings_df["rating"] = ratings_df["rating"].astype(float)

# Build the tf.data pipeline from ONLY the three columns consumed downstream,
# already under their feature names. Converting the whole frame would turn
# every CSV column into a tensor, which wastes memory and fails as soon as an
# unrelated column holds values TensorFlow cannot convert.
ratings = tf.data.Dataset.from_tensor_slices({
    "movie_title": ratings_df["title"].to_numpy(),
    "user_id": ratings_df["user_id"].to_numpy(),
    "user_rating": ratings_df["rating"].to_numpy(),
})

In [4]:
# train/test split
# Sizes are derived from the dataset instead of being hard-coded, so the
# shuffle buffer always covers every example (a smaller buffer only shuffles
# approximately) and the split stays 80/20 for any input size. For a
# 100,000-row dataset this yields exactly the previous 80,000 / 20,000 split.
TRAIN_FRACTION = 0.8

tf.random.set_seed(42)

num_ratings = int(ratings.cardinality().numpy())
if num_ratings < 0:
    # Negative values are tf.data's sentinels for infinite / unknown cardinality.
    raise ValueError("`ratings` must have a known, finite cardinality to be split")

shuffled = ratings.shuffle(num_ratings, seed=42, reshuffle_each_iteration=False)

train_size = int(num_ratings * TRAIN_FRACTION)
train = shuffled.take(train_size)
test = shuffled.skip(train_size)  # everything that is not in `train`

In [5]:
# vocabulary
# Pull each string feature out of `ratings` in large batches, then reduce the
# concatenated batches to their sorted unique values.
def _feature_batches(feature_name):
    """Return `ratings` batched by 1,000,000 and narrowed to one feature."""
    return ratings.batch(1_000_000).map(lambda row: row[feature_name])


movie_titles = _feature_batches("movie_title")
user_ids = _feature_batches("user_id")

unique_movie_titles, unique_user_ids = (
    np.unique(np.concatenate(list(batches)))
    for batches in (movie_titles, user_ids)
)

### Implement Ranking Model

In [6]:
# ranking model
class RankingModel(tf.keras.Model):
  """Predicts a rating from a (user id, movie title) pair.

  Each id is mapped to an integer index by a `StringLookup` layer built from
  the module-level vocabularies `unique_user_ids` / `unique_movie_titles`,
  embedded, and the two embeddings are concatenated and passed through a small
  dense network that ends in a single linear unit.

  Args:
    embedding_dimension: Width of the user and movie embedding vectors.
  """

  def __init__(self, embedding_dimension: int = 64):
    super().__init__()

    # Compute embeddings for users.
    # The embedding table has one row more than the vocabulary: with
    # mask_token=None and the layer's default single out-of-vocabulary bucket,
    # StringLookup emits len(vocabulary) + 1 distinct indices.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # Compute embeddings for movies (same layout as the user tower).
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

  def call(self, inputs):
    """Score a batch of (user, movie) pairs.

    Args:
      inputs: A 2-tuple `(user_id, movie_title)` of string tensors.
        Presumably both are rank-1 with the same length, so that the
        embeddings can be concatenated along axis 1 — verify against callers.

    Returns:
      The output of the final `Dense(1)` layer: one predicted rating per pair.
    """
    user_id, movie_title = inputs

    user_embedding = self.user_embeddings(user_id)
    movie_embedding = self.movie_embeddings(movie_title)

    return self.ratings(tf.concat([user_embedding, movie_embedding], axis=1))
  
# full model
class RecommendationModel(tfrs.models.Model):
  """TFRS wrapper pairing a `RankingModel` with a ranking task.

  The task is configured with a mean-squared-error loss and reports the root
  mean squared error as its metric.
  """

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss=tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    """Return predicted ratings for a feature dict.

    Only the "user_id" and "movie_title" entries are read; they are handed to
    the ranking model as a `(user_id, movie_title)` tuple.
    """
    user_ids = features["user_id"]
    titles = features["movie_title"]
    return self.ranking_model((user_ids, titles))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the task loss for one batch.

    "user_rating" is used as the label; every other entry of `features` is
    forwarded to `call`. The `training` argument is accepted but not used.
    """
    model_inputs = {
      name: tensor for name, tensor in features.items() if name != "user_rating"
    }
    predicted = self(model_inputs)

    # The task computes the loss and updates the metrics.
    return self.task(labels=features["user_rating"], predictions=predicted)

### Train

In [7]:
# Hyperparameters for this training run.
LR, EPOCHS = 0.1, 5

# Build the model and attach an Adagrad optimizer.
model = RecommendationModel()
adagrad = tf.keras.optimizers.Adagrad(learning_rate=LR)
model.compile(optimizer=adagrad)

# Input pipelines: train is shuffle -> batch -> cache, test is batch -> cache.
cached_train = (
    train
    .shuffle(100_000)
    .batch(8192)
    .cache()
)
cached_test = (
    test
    .batch(4096)
    .cache()
)

model.fit(cached_train, epochs=EPOCHS)

Epoch 1/5




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - loss: 4.3333 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 2.6210 - total_loss: 4.3333
Epoch 2/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 1.2523 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 1.1259 - total_loss: 1.2523
Epoch 3/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 1.2184 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 1.1093 - total_loss: 1.2184
Epoch 4/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 1.1801 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 1.0929 - total_loss: 1.1801
Epoch 5/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 1.1311 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 1.0709 - total_loss: 1.1311


<keras.src.callbacks.history.History at 0x26066361550>

### Evaluate

In [8]:
# Loss and metrics on the held-out split, returned as a name -> value dict.
model.evaluate(x=cached_test, return_dict=True)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 1.1304 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 1.0617 - total_loss: 1.1304


{'loss': <tf.Tensor: shape=(), dtype=float32, numpy=1.1337839365005493>,
 'root_mean_squared_error': <tf.Tensor: shape=(), dtype=float32, numpy=1.062831997871399>,
 'regularization_loss': <tf.Tensor: shape=(), dtype=int32, numpy=0>,
 'total_loss': <tf.Tensor: shape=(), dtype=float32, numpy=1.1337839365005493>}

### Inference

In [9]:
# Score a few hand-picked titles for user "42", one forward pass per title,
# then list them from the highest to the lowest predicted rating.
test_movie_titles = ["M*A*S*H (1970)", "Dances with Wolves (1990)", "Speed (1994)"]
test_ratings = {}
for movie_title in test_movie_titles:
  single_example = {
      "user_id": np.array(["42"]),
      "movie_title": np.array([movie_title]),
  }
  test_ratings[movie_title] = model(single_example)

print("Ratings:")
by_score_desc = sorted(test_ratings.items(), key=lambda pair: pair[1], reverse=True)
for title, score in by_score_desc:
  print(f"{title}: {score}")

Ratings:
M*A*S*H (1970): [[3.8083076]]
Dances with Wolves (1990): [[3.7757146]]
Speed (1994): [[3.7118106]]


In [10]:
def rank_movies_for_user(model, user_id: str, candidate_movies: list[str]) -> list[tuple[str, float]]:
    """Score candidate movies for one user and return them best-first.

    Args:
        model: Object exposing a callable `ranking_model` attribute that
            accepts a `(user_ids, movie_titles)` tuple of string tensors.
        user_id: The user to score for; repeated once per candidate.
        candidate_movies: Movie titles to score.

    Returns:
        `(title, predicted_rating)` pairs sorted by rating, highest first.
        Ratings are plain Python floats. The sort is stable, so equally scored
        titles keep their order from `candidate_movies`. An empty candidate
        list yields an empty result.
    """
    candidate_movies = list(candidate_movies)
    if not candidate_movies:
        # tf.constant([]) would be a float32 tensor, which the string lookup
        # inside the ranking model cannot consume — nothing to rank anyway.
        return []

    # Repeat user_id so both inputs have one entry per candidate.
    user_ids = tf.constant([user_id] * len(candidate_movies))
    movie_titles = tf.constant(candidate_movies)

    # Run inference; the trailing axis of size 1 is squeezed away.
    predictions = model.ranking_model((user_ids, movie_titles))
    predicted_ratings = tf.squeeze(predictions, axis=1).numpy().tolist()

    # Pair titles with their scores and sort best-first.
    return sorted(
        zip(candidate_movies, predicted_ratings),
        key=lambda pair: pair[1],
        reverse=True,
    )

In [11]:
# Rank a fixed shortlist of titles for one user and print them best-first.
user_id = "82"
candidate_titles = [
    "Star Wars (1977)",
    "Toy Story (1995)",
    "Fargo (1996)",
    "L.A. Confidential (1997)",
    "Titanic (1997)",
]

ranked_results = rank_movies_for_user(model, user_id, candidate_titles)

for ranked_title, ranked_score in ranked_results:
    print(f"{ranked_title}: {ranked_score:.3f}")

Star Wars (1977): 4.075
Titanic (1997): 3.857
Fargo (1996): 3.839
Toy Story (1995): 3.745
L.A. Confidential (1997): 3.723


### Export model

In [12]:
# Write the trained model to the "export" directory in SavedModel format.
tf.saved_model.save(obj=model, export_dir="export")

INFO:tensorflow:Assets written to: export\assets


INFO:tensorflow:Assets written to: export\assets


In [15]:
# Load the SavedModel written to "export" and run a small ranking check.
tfrs_model = tf.saved_model.load("export")

user_id = "82"
candidate_movie_list = ["Speed (1994)", "Titanic (1997)"]

# NOTE(review): this call ranks with the in-memory `model`, not with the
# `tfrs_model` loaded above, so the exported artifact is never exercised here.
# rank_movies_for_user invokes `model.ranking_model(...)`; confirm that the
# object returned by tf.saved_model.load exposes a callable `ranking_model`
# before switching the argument — TODO verify.
ranked_recs = rank_movies_for_user(model, user_id, candidate_movie_list)

# Print the candidates best-first with the predicted rating to 3 decimals.
for title, score in ranked_recs:
    print(f"{title}: {score:.3f}")

Titanic (1997): 3.857
Speed (1994): 3.594


### Top-k ranking evaluation

In [16]:
from collections import defaultdict

def build_user_seen_dict(dataset):
    """Group the movie titles in `dataset` by user.

    Each element of `dataset` must be a mapping whose "user_id" and
    "movie_title" entries expose `.numpy()` returning UTF-8 encoded bytes.

    Returns:
        A `defaultdict(set)` mapping the decoded user id to the set of decoded
        movie titles that appear with it.
    """
    user_seen = defaultdict(set)
    for example in dataset:
        user_key = example["user_id"].numpy().decode("utf-8")
        seen_title = example["movie_title"].numpy().decode("utf-8")
        user_seen[user_key].add(seen_title)
    return user_seen

# Per-user title sets for each split (train first, then test).
train_user_seen, test_user_seen = (
    build_user_seen_dict(split) for split in (train, test)
)

In [18]:
def evaluate_ranking_model(model, test_user_seen, train_user_seen, all_movies, k=10):
    """Compute top-k ranking metrics averaged over users.

    For every user in `test_user_seen`, all movies the user did not interact
    with in training are ranked with `rank_movies_for_user`, and the top `k`
    are compared against the user's held-out movies.

    Args:
        model: Passed through unchanged to `rank_movies_for_user`.
        test_user_seen: Mapping user id -> set of held-out (relevant) titles.
        train_user_seen: Mapping user id -> set of titles seen in training;
            these are removed from the candidate pool.
        all_movies: Iterable of all candidate titles.
        k: Cut-off rank; must be positive.

    Returns:
        Dict with 'HitRate@k', 'Precision@k', 'Recall@k' and 'NDCG@k', each
        averaged over the evaluated users. Users with no held-out titles are
        skipped; if no user can be evaluated every metric is 0.0.

    Raises:
        ValueError: If `k` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    hits, precision_sum, recall_sum, ndcg_sum = 0, 0.0, 0.0, 0.0
    total_users = 0

    # Discount for each rank position; identical for every user, so compute once.
    discounts = [1 / np.log2(rank + 2) for rank in range(k)]

    for user_id, true_movies in test_user_seen.items():
        if not true_movies:
            # Recall and NDCG are undefined without any relevant item.
            continue

        # Remove movies already seen in training set
        seen_train_movies = train_user_seen.get(user_id, set())
        candidate_movies = [title for title in all_movies if title not in seen_train_movies]

        # Get top-k predictions (nothing to rank if no candidate is left)
        ranked = rank_movies_for_user(model, user_id, candidate_movies) if candidate_movies else []
        top_k_preds = [title for title, _ in ranked[:k]]

        num_hits = len(true_movies & set(top_k_preds))
        hits += int(num_hits > 0)
        precision_sum += num_hits / k
        recall_sum += num_hits / len(true_movies)

        # NDCG@k: binary relevance, normalised by the best achievable DCG.
        dcg = 0.0
        for position, movie in enumerate(top_k_preds):
            if movie in true_movies:
                dcg += discounts[position]
        idcg = sum(discounts[:min(len(true_movies), k)])
        ndcg_sum += dcg / idcg if idcg > 0 else 0

        total_users += 1

    if total_users == 0:
        return {'HitRate@k': 0.0, 'Precision@k': 0.0, 'Recall@k': 0.0, 'NDCG@k': 0.0}

    return {
        'HitRate@k': hits / total_users,
        'Precision@k': precision_sum / total_users,
        'Recall@k': recall_sum / total_users,
        'NDCG@k': ndcg_sum / total_users
    }


In [None]:
# Get all unique movie titles from the dataset.
# The titles are sorted so the candidate order is the same in every kernel:
# rank_movies_for_user uses a stable sort, so equally scored titles keep their
# candidate order, and the iteration order of a plain set of strings can change
# between Python processes. Missing titles are dropped and the rest cast to
# str, matching how the ratings titles were prepared.
movies_df = pd.read_csv(ITEM_DATA_PATH)
all_movie_titles = sorted(set(movies_df["title"].dropna().astype(str)))

# Evaluate
# NOTE(review): `tfrs_model` is the object returned by tf.saved_model.load;
# the ranking helper calls `model.ranking_model(...)` on it — confirm the
# loaded object supports that call, otherwise evaluate the in-memory `model`.
K = 10
metrics = evaluate_ranking_model(
    model=tfrs_model,
    test_user_seen=test_user_seen,
    train_user_seen=train_user_seen,
    all_movies=all_movie_titles,
    k=K
)

# Print results
print(f"Ranking Model Evaluation (k={K}):")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")
