# Collaborative Filtering Recommender System

In [1]:
import numpy as np
import tensorflow as tf
from recsys_utils import *
from tensorflow import keras




## Load the dataset  

In [44]:
# Load the pre-computed parameters and the ratings matrices
X, W, b, num_movies, num_features, num_users = load_precalc_params_small()
Y, R = load_ratings_small()

# Report the shape of every array
print("Y", Y.shape, "R", R.shape)
for label, array in (("X", X), ("W", W), ("b", b)):
    print(label, array.shape)

# Report the problem dimensions
for label, count in (
    ("num_features", num_features),
    ("num_movies", num_movies),
    ("num_users", num_users),
):
    print(label, count)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


Here `load_precalc_params_small` loads, from a file, parameters that were already calculated for the movies dataset.  
Here  
X = The dataset containing the features of the movies, **shape = (num_movies, num_features)**  
W = Parameter w for all the users, **shape = (num_users, num_features)**  
b = Parameter b for all the users, **shape = (1, num_users)**  
Y = Contains the ratings of all the users for all the movies, **shape = (num_movies, num_users)**  
R = Contains the binary values 1 and 0, indicating whether or not the user has rated the corresponding movie, **shape = same as Y, (num_movies, num_users)**  

In [3]:
# Average only over the users who actually rated the first movie (R == 1)
rated_mask = R[0, :].astype(bool)
movie1_mean = np.mean(Y[0, rated_mask])
print(f"The average rating for the movie 1 is : {movie1_mean} / 5")

The average rating for the movie 1 is : 3.4 / 5


The dataset contains ratings from 0.5 to 5; the value 0 means that the user hasn't rated the movie yet.

### 4.1 Collaborative filtering cost function

The collaborative filtering cost function is given by
$$J({\mathbf{x}^{(0)},...,\mathbf{x}^{(n_m-1)},\mathbf{w}^{(0)},b^{(0)},...,\mathbf{w}^{(n_u-1)},b^{(n_u-1)}})= \frac{1}{2}\sum_{(i,j):r(i,j)=1}(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2
+\underbrace{
\frac{\lambda}{2}
\sum_{j=0}^{n_u-1}\sum_{k=0}^{n-1}(\mathbf{w}^{(j)}_k)^2
+ \frac{\lambda}{2}\sum_{i=0}^{n_m-1}\sum_{k=0}^{n-1}(\mathbf{x}_k^{(i)})^2
}_{regularization}
\tag{1}$$

The first summation in (1) is "for all $i$, $j$ where $r(i,j)$ equals $1$" and could be written:

$$
= \frac{1}{2}\sum_{j=0}^{n_u-1} \sum_{i=0}^{n_m-1}r(i,j)*(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2
+\text{regularization}
$$


In [23]:
def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Collaborative-filtering cost, computed with explicit loops (reference version).

    Args:
      X (ndarray (num_movies, num_features)): feature vector of each movie, one row per movie
      W (ndarray (num_users, num_features)) : weight vector of each user, one row per user
      b (ndarray (1, num_users))            : bias of each user
      Y (ndarray (num_movies, num_users))   : ratings
      R (ndarray (num_movies, num_users))   : multiplier applied to each squared error
                                              (1 where a rating exists, 0 otherwise)
      lambda_ (float)                       : regularization strength

    Returns:
      The scalar cost: half of (masked squared error + lambda_ * (sum(W**2) + sum(X**2))).
    """
    num_movies, num_users = Y.shape
    total = 0
    for user in range(num_users):
        user_weights, user_bias = W[user, :], b[0, user]
        for movie in range(num_movies):
            # Prediction error for this (movie, user) pair
            residual = np.dot(user_weights, X[movie, :]) + user_bias - Y[movie, user]
            # R zeroes out pairs without a rating
            total += R[movie, user] * np.square(residual)

    # Regularization is added before the final halving, so it ends up as lambda_/2 * (...)
    total += lambda_ * (np.sum(np.square(W)) + np.sum(np.square(X)))
    return total / 2

In [39]:
# Work on a small slice of the data so the loop-based cost runs quickly
num_users_r, num_movies_r, num_features_r = 4, 5, 3

# Keep only the first few movies / users / features of each array
X_r = X[:num_movies_r, :num_features_r]
W_r = W[:num_users_r, :num_features_r]
b_r = b[0, :num_users_r].reshape(1, -1)  # keep b two-dimensional: (1, num_users_r)
Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]

# Unregularized cost (lambda_ = 0) on the reduced problem
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 0)
print(f"Cost: {J:0.2f}")

Cost: 13.67


In [27]:
# Regularized cost (lambda_ = 1.5) on the reduced problem.
# Uses the X_r / W_r / b_r / Y_r / R_r slices built in the previous cell, so the
# cell runs on a fresh kernel (xr, wr, br, yr, rr are not defined in the visible cells).
reg_Jr = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 1.5)
print(f"The cost after adding the regularization term is: {reg_Jr:.3f}")

The cost after adding the regularization term is: 28.094


### Vectorized implementation of the function `cofi_cost_func`

In [34]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Vectorized collaborative-filtering cost built from TensorFlow ops.

    Args:
      X : movie features; multiplied by the transpose of W, so X and W share their last dimension
      W : user weights
      b : user biases, broadcast-added to X @ W^T
      Y : ratings, same shape as X @ W^T
      R : multiplier applied element-wise to the prediction error
      lambda_ : regularization strength

    Returns:
      Scalar tensor: 0.5 * sum(masked_error**2) + (lambda_/2) * (sum(X**2) + sum(W**2)).
    """
    # Prediction error for every (movie, user) pair, zeroed where R is 0
    masked_error = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y) * R
    data_term = 0.5 * tf.reduce_sum(masked_error**2)
    regularization_term = (lambda_ / 2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return data_term + regularization_term

In [41]:
# Vectorized cost without regularization; should match the loop-based result above
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, lambda_=0)
print(f"Cost: {J:0.2f}")

# Vectorized cost with regularization (lambda_ = 1.5)
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, lambda_=1.5)
print(f"Cost (with regularization): {J:0.2f}")

Cost: 13.67
Cost (with regularization): 28.09


## Learning movie Recommendation

In [45]:
movieList, movieList_df = load_Movie_List_pd()

# Ratings given by the new user, keyed by movie ID.
# Check the file small_movie_list.csv for the ID of each movie in our dataset.
my_rating_by_id = {
    2700: 5,  # Toy Story 3 (2010)
    2609: 2,  # Persuasion (2007)
    929: 5,   # Lord of the Rings: The Return of the King, The
    246: 5,   # Shrek (2001)
    2716: 3,  # Inception
    1150: 5,  # Incredibles, The (2004)
    382: 2,   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
    366: 5,   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
    622: 5,   # Harry Potter and the Chamber of Secrets (2002)
    988: 3,   # Eternal Sunshine of the Spotless Mind (2004)
    2925: 1,  # Louis Theroux: Law & Disorder (2008)
    2937: 1,  # Nothing to Declare (Rien à déclarer)
    793: 5,   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
}

# One entry per movie; 0 means "not rated"
my_ratings = np.zeros(num_movies)
for movie_id, rating in my_rating_by_id.items():
    my_ratings[movie_id] = rating

# Indices of the movies the new user has rated, in ascending order
my_rated = [idx for idx, value in enumerate(my_ratings) if value > 0]

print('\nNew user ratings:\n')
for i in my_rated:
    print(f'Rated {my_ratings[i]} for  {movieList_df.loc[i,"title"]}')


New user ratings:

Rated 5.0 for  Shrek (2001)
Rated 5.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for  Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for  Eternal Sunshine of the Spotless Mind (2004)
Rated 5.0 for  Incredibles, The (2004)
Rated 2.0 for  Persuasion (2007)
Rated 5.0 for  Toy Story 3 (2010)
Rated 3.0 for  Inception (2010)
Rated 1.0 for  Louis Theroux: Law & Disorder (2008)
Rated 1.0 for  Nothing to Declare (Rien à déclarer) (2010)


In [49]:
# Reload the ratings from the dataset, so re-running this cell starts from the
# original matrices instead of prepending the new user a second time
Y, R = load_ratings_small()

# Prepend the new user as column 0 of both the ratings and the "has rated" indicator
Y = np.column_stack((my_ratings, Y))
R = np.column_stack(((my_ratings != 0).astype(int), R))

# Normalize the ratings; presumably Ymean holds each movie's mean rating (verify in recsys_utils)
Ynorm, Ymean = normalizeRatings(Y, R)

In [52]:
# Inspect the shape of Ymean returned by normalizeRatings (one row per movie expected)
Ymean.shape

(4778, 1)

### Let's train the model

In [54]:
# Useful features 
num_movies, num_users = Y.shape
num_features = 100

# Initiliaze the parameters
tf.random.set_seed(1234)
W = tf.Variable(tf.random.normal((num_users, num_features), dtype=tf.float64), name="W")
b = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float64), name="b")
X = tf.Variable(tf.random.normal((num_movies, num_features), dtype=tf.float64), name="X")

# Initiliaze the optimizer 
optimizer = keras.optimizers.Adam(learning_rate = 1e-1)

The operations involved in learning $w$, $b$, and $x$ simultaneously do not fall into the typical 'layers' offered in the TensorFlow neural network package. 

In [59]:
iterations = 200
lambda_ = 1

# NOTE(review): the cost below is evaluated on the raw ratings Y; the Ynorm / Ymean
# computed earlier are not used by this loop.
# NOTE: X, W and b are not re-initialized here, so re-running this cell continues
# training from their current values.
# The loop variable is `step` (not `iter`) so the built-in iter() is not shadowed
# in the notebook's global namespace.
for step in range(iterations):
    # Record the operations that compute the cost so TensorFlow can differentiate them
    with tf.GradientTape() as tape:

        # Regularized collaborative-filtering cost for the current X, W, b
        cost_value = cofi_cost_func_v(X, W, b, Y, R, lambda_)

    # Gradients of the cost with respect to each trainable variable
    grad = tape.gradient(cost_value, [X, W, b])

    # One optimizer step: update X, W, b using their gradients
    optimizer.apply_gradients(zip(grad, [X, W, b]))

    # Log the cost every 20 iterations
    if step % 20 == 0:
        print(f"iter: {step}, cost: {cost_value:.2f}")

iter: 0, cost: 2435.30
iter: 20, cost: 2380.98
iter: 40, cost: 2333.12
iter: 60, cost: 2290.94
iter: 80, cost: 2253.84
iter: 100, cost: 2221.30
iter: 120, cost: 2192.91
iter: 140, cost: 2168.29
iter: 160, cost: 2147.10
iter: 180, cost: 2129.01


## Recommendations

In [63]:
# Predicted rating for every (movie, user) pair from the learned parameters
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

# No mean is added back here: pm is simply p. The training loop above evaluates the
# cost on the raw ratings Y, not on Ynorm.
# NOTE(review): if training is switched to Ynorm, this must become p + Ymean.
pm = p

# Column 0 is the new user, whose ratings were prepended to Y
my_pred = pm[:, 0]

# Movie indices ordered from highest to lowest predicted rating
ix = tf.argsort(my_pred, direction='DESCENDING')

# Among the 17 highest predictions, show those the new user has not already rated
for i in range(17):
    if ix[i] not in my_rated:
        print(f"The predicted rating is: {my_pred[ix[i]]:.2f}, for movie: {movieList[ix[i]]}")

# Compare the predictions with the ratings the new user actually gave
print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_pred[i]:0.2f} for {movieList[i]}')

The predicted rating is: 4.35, for movie: Finding Nemo (2003)
The predicted rating is: 4.24, for movie: Lord of the Rings: The Two Towers, The (2002)
The predicted rating is: 4.24, for movie: Lord of the Rings: The Fellowship of the Ring, The (2001)
The predicted rating is: 4.20, for movie: Spider-Man 2 (2004)
The predicted rating is: 4.11, for movie: Star Trek (2009)
The predicted rating is: 4.06, for movie: Harry Potter and the Prisoner of Azkaban (2004)
The predicted rating is: 4.00, for movie: Harry Potter and the Order of the Phoenix (2007)
The predicted rating is: 3.88, for movie: Harry Potter and the Deathly Hallows: Part 1 (2010)
The predicted rating is: 3.87, for movie: Harry Potter and the Goblet of Fire (2005)
The predicted rating is: 3.85, for movie: Harry Potter and the Deathly Hallows: Part 2 (2011)


Original vs Predicted ratings:

Original 5.0, Predicted 4.91 for Shrek (2001)
Original 5.0, Predicted 4.81 for Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and

In [65]:
# Boolean mask of movies with more than 20 ratings.
# Named `popular_mask` (not `filter`) so the built-in filter() is not shadowed.
popular_mask = (movieList_df["number of ratings"] > 20)

# Attach the new user's predicted ratings and put the columns in display order
movieList_df["pred"] = my_pred
movieList_df = movieList_df.reindex(columns=["pred", "mean rating", "number of ratings", "title"])

# Of the 300 highest predictions, keep the movies passing the mask and sort by mean rating
movieList_df.loc[ix[:300]].loc[popular_mask].sort_values("mean rating", ascending=False)

Unnamed: 0,pred,mean rating,number of ratings,title
1743,3.439561,4.252336,107,"Departed, The (2006)"
2112,3.788749,4.238255,149,"Dark Knight, The (2008)"
5,3.250716,4.220930,43,"Boondock Saints, The (2000)"
155,3.756505,4.155914,93,Snatch (2000)
580,3.294574,4.155172,87,Spirited Away (Sen to Chihiro no kamikakushi) ...
...,...,...,...,...
71,3.107034,3.086957,46,"Me, Myself & Irene (2000)"
296,3.068028,3.066667,45,American Pie 2 (2001)
98,3.078551,3.044444,45,"Cell, The (2000)"
2175,3.036619,3.034483,29,Hancock (2008)
