# Collaborative Filtering

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import recsys_utils

2025-12-01 12:23:55.796824: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-01 12:23:55.883811: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764591835.921823    7064 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764591835.933338    7064 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-01 12:23:56.021825: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Parameters and dataset dimensions as returned by the helper module
X, W, b, num_movies, num_features, num_users = recsys_utils.load_precalc_params_small()
# Y = ratings (movies x users), R = rated / not-rated indicator
Y, R = recsys_utils.load_ratings_small()

print("Y", Y.shape, "R", R.shape)

# Shapes of the parameter arrays, one per line
for label, array in (("X", X), ("W", W), ("b", b)):
    print(label, array.shape)

# Dataset dimensions reported by the loader
for label, count in (
    ("num_features", num_features),
    ("num_movies", num_movies),
    ("num_users", num_users),
):
    print(label, count)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


In [3]:

# Mean rating of movie 1 (row 0 of Y), counting only the users who rated it
users_rated_movie1 = R[0].astype(bool)
# Indexing with the boolean mask keeps only the columns of Y where the mask is True
movie1_rates = Y[0][users_rated_movie1]
m1_mean = movie1_rates.mean()
m1_mean

np.float64(3.4)

In [4]:
def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Returns the regularized cost for collaborative filtering, computed with
    explicit loops over every (user, movie) pair.
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users))           : vector of user parameters
      Y (ndarray (num_movies,num_users))   : matrix of user ratings of movies
      R (ndarray (num_movies,num_users))   : matrix, where R(i, j) = 1 if the i-th movie was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    movie_count, user_count = Y.shape
    squared_error = 0

    # Users in the outer loop, movies in the inner loop
    for user in range(user_count):
        for movie in range(movie_count):
            # Entries without a rating do not contribute to the cost
            if R[movie, user] != 1:
                continue
            residual = np.dot(W[user, :], X[movie, :]) + b[0, user] - Y[movie, user]
            squared_error += residual**2

    # L2 penalties on the user parameters and on the movie features
    w_penalty = (lambda_ / 2) * np.sum(W**2)
    x_penalty = (lambda_ / 2) * np.sum(X**2)
    return squared_error / 2 + w_penalty + x_penalty

In [5]:
def cofi_cost_func_vectorized(X, W, b, Y, R, lambda_):
  """
  Vectorized TensorFlow version of the collaborative filtering cost.

  Takes the same arguments as cofi_cost_func. In this notebook it is called
  both with NumPy arrays and with tf.Variable parameters.
  Args:
    X : item features, one row per movie
    W : user parameters, one row per user
    b : user biases, broadcast across the movie rows
    Y : ratings, movies x users
    R : rated / not-rated indicator with the same layout as Y
    lambda_ (float): regularization parameter
  Returns:
    Scalar tensor holding the cost
  """
  # Prediction error for every (movie, user) pair
  prediction_error = tf.linalg.matmul(X, tf.transpose(W)) + b - Y
  # Multiplying by R zeroes the entries without a rating (replaces `if R[i, j] == 1`)
  rated_error = R * prediction_error

  data_term = tf.reduce_sum(rated_error**2) * 0.5
  w_penalty = (lambda_ / 2) * tf.reduce_sum(W**2)
  x_penalty = (lambda_ / 2) * tf.reduce_sum(X**2)
  return data_term + w_penalty + x_penalty

In [6]:
# Import only the test that is used here instead of a wildcard import,
# so no unexpected names are pulled into the notebook namespace
from public_tests import test_cofi_cost_func

# Evaluate on a small slice: first 5 movies, first 4 users, first 3 features
num_users_r = 4
num_movies_r = 5
num_features_r = 3

X_r = X[:num_movies_r, :num_features_r]
W_r = W[:num_users_r,  :num_features_r]
b_r = b[0, :num_users_r].reshape(1,-1)  # keep b as a (1, num_users_r) row
Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]

# Evaluate cost function
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 0)
print(f"Cost: {J:0.2f}")

J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 1.5)
print(f"Cost (with regularization): {J:0.2f}")

# Run the provided checks against the loop-based implementation
test_cofi_cost_func(cofi_cost_func)

Cost: 13.67
Cost (with regularization): 28.09
[92mAll tests passed!


In [7]:
# Same reduced arrays for both evaluations of the vectorized cost
reduced_args = (X_r, W_r, b_r, Y_r, R_r)

# lambda_ = 0: no regularization
J = cofi_cost_func_vectorized(*reduced_args, 0)
print(f"Cost: {J:0.2f}")

# lambda_ = 1.5: with regularization
J = cofi_cost_func_vectorized(*reduced_args, 1.5)
print(f"Cost (with regularization): {J:0.2f}")

I0000 00:00:1764591839.027110    7064 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13750 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


Cost: 13.67
Cost (with regularization): 28.09


# Learning Movie Recommendations

In [8]:
movie_list, movie_list_df = recsys_utils.load_Movie_List_pd()

# Movie index -> the rating I gave; every other movie stays at 0
my_ratings_by_index = {
    2700: 5,
    2609: 2,
    929:  5,   # Lord of the Rings: The Return of the King, The
    246:  5,   # Shrek (2001)
    2716: 3,   # Inception
    1150: 5,   # Incredibles, The (2004)
    382:  2,   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
    366:  5,   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
    622:  5,   # Harry Potter and the Chamber of Secrets (2002)
    988:  3,   # Eternal Sunshine of the Spotless Mind (2004)
    2925: 1,   # Louis Theroux: Law & Disorder (2008)
    2937: 1,   # Nothing to Declare (Rien à déclarer)
    793:  5,   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
}

my_ratings = np.zeros(num_movies)  # stores my ratings, one slot per movie
for movie_index, rating in my_ratings_by_index.items():
    my_ratings[movie_index] = rating

# Indexes of the movies I rated, in ascending order
my_rated = [movie_index for movie_index, rating in enumerate(my_ratings) if rating > 0]

my_rated

[246, 366, 382, 622, 793, 929, 988, 1150, 2609, 2700, 2716, 2925, 2937]

In [9]:
# Reload the ratings so the new column is added to the stored data each time this cell runs
Y, R = recsys_utils.load_ratings_small()

# 1 where I gave a rating, 0 elsewhere
my_rated_flags = (my_ratings != 0).astype(int)

# Put my ratings and my indicator flags in as the first column of Y and R
Y = np.column_stack((my_ratings, Y))
R = np.column_stack((my_rated_flags, R))

# Normalize the dataset using the helper from recsys_utils
Ynorm, Ymean = recsys_utils.normalizeRatings(Y, R)

In [10]:
# Training settings
n_iter = 200
lambda_ = 1
num_features = 100
num_movies, num_users = Y.shape

tf.random.set_seed(1234)

# Shapes of the trainable parameters, listed in the order they are created (W, X, b)
param_shapes = {
    'W': (num_users, num_features),
    'X': (num_movies, num_features),
    'b': (1, num_users),
}

# Wrap the random initial values in tf.Variable so they're tracked
W, X, b = (
    tf.Variable(tf.random.normal(shape, dtype=tf.float64), name=name)
    for name, shape in param_shapes.items()
)

optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [11]:
# Parameters updated on every step, paired with their gradients below
trainable_params = [X, W, b]

for i in range(n_iter):
    # The tape records the cost computation so it can be differentiated
    with tf.GradientTape() as tape:
        cost = cofi_cost_func_vectorized(X, W, b, Ynorm, R, lambda_)

    # Gradient of the cost with respect to each parameter, then one optimizer step
    gradients = tape.gradient(cost, trainable_params)
    optimizer.apply_gradients(zip(gradients, trainable_params))

    # Report progress every 20 iterations
    if not i % 20:
        print(f'Training iteration {i}. Cost value: {cost:0.1f}')




Training iteration 0. Cost value: 2321191.3
Training iteration 20. Cost value: 136169.3
Training iteration 40. Cost value: 51863.7
Training iteration 60. Cost value: 24599.0
Training iteration 80. Cost value: 13630.6
Training iteration 100. Cost value: 8487.7
Training iteration 120. Cost value: 5807.8
Training iteration 140. Cost value: 4311.6
Training iteration 160. Cost value: 3435.3
Training iteration 180. Cost value: 2902.1


In [12]:
# Use trained W, X and b

# preds contains predictions for movies that had ALREADY been rated AND for movies that HADN'T yet been rated
preds = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

# Add back Ymean, which was returned when the ratings were normalized
preds = preds + Ymean

print(f'preds shape: {preds.shape}')

my_preds = preds[:, 0] # user 0 is me: my ratings were added as the first column of Y and R
print(f'My preds shape: {my_preds.shape}')
print(f'First 5 predictions for me as a user: {my_preds[:5]}')
# Indices that order my predictions from highest to lowest
my_preds_sorted_indexes = tf.argsort(my_preds, direction='DESCENDING')

# top 20: iterate over NumPy integers so the membership test and the indexing
# below do not operate on scalar tensors; the slice also copes with fewer than 20 movies
for pred_index in my_preds_sorted_indexes.numpy()[:20]:
  # Only show movies for which I didn't provide a rating. Should NOT show "Shrek" or "Inception"
  if pred_index not in my_rated:
    print(f'Predicted a rating of {my_preds[pred_index]:.2f} for movie {movie_list[pred_index]}')

print()

# Compare the prediction with the rating I actually gave, for each movie I rated
for i in my_rated:
  name = movie_list[i]
  pred = my_preds[i]
  actual = my_ratings[i]
  error = actual - pred

  print(f'name: {name}')
  print(f'prediction: {pred}')
  print(f'actual: {actual}')
  print(f'error: {error}')
  print()

preds shape: (4778, 444)
My preds shape: (4778,)
First 5 predictions for me as a user: [2.76811077 2.74113014 1.32460636 1.60469699 2.29810711]
Predicted a rating of 4.49 for movie My Sassy Girl (Yeopgijeogin geunyeo) (2001)
Predicted a rating of 4.48 for movie Martin Lawrence Live: Runteldat (2002)
Predicted a rating of 4.48 for movie Memento (2000)
Predicted a rating of 4.47 for movie Delirium (2014)
Predicted a rating of 4.47 for movie Laggies (2014)
Predicted a rating of 4.47 for movie One I Love, The (2014)
Predicted a rating of 4.46 for movie Particle Fever (2013)
Predicted a rating of 4.45 for movie Eichmann (2007)
Predicted a rating of 4.45 for movie Battle Royale 2: Requiem (Batoru rowaiaru II: Chinkonka) (2003)
Predicted a rating of 4.45 for movie Into the Abyss (2011)
Predicted a rating of 4.45 for movie Son of the Bride (Hijo de la novia, El) (2001)
Predicted a rating of 4.44 for movie Rivers and Tides (2001)
Predicted a rating of 4.44 for movie George Carlin: It's Bad for 