# Recommender Systems

# Case Study | Deep Collaborative Filtering with MovieLens Dataset

## 1. Initial Imports

In [1]:
import tensorflow as tf
from zipfile import ZipFile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import get_file

## 2. Loading the Data

In [3]:
URL = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
movielens_path = get_file("movielens.zip", URL, extract=True)

Downloading data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip


In [5]:
with ZipFile(movielens_path) as z:
  with z.open("ml-latest-small/ratings.csv") as f:
    df = pd.read_csv(f)
    
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## 3. Processing the Data

### Processing User IDs

In [6]:
user_ids = df["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
user_encoded2user = {i: x for i, x in enumerate(user_ids)}
df["user"] = df["userId"].map(user2user_encoded)
num_users = len(user_encoded2user)

### Processing Movie IDs

In [7]:
movie_ids = df["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
df["movie"] = df["movieId"].map(movie2movie_encoded)
num_movies = len(movie_encoded2movie)

In [8]:
print("Number of users: ", num_users,"\nNumber of Movies: ", num_movies)

Number of users:  610 
Number of Movies:  9724


### Processing the Ratings

In [10]:
min, max = df["rating"].min(), df["rating"].max()

df["rating"] = df["rating"].apply(lambda x:(x-min)/(max-min))

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
0,1,1,0.777778,964982703,0,0
1,1,3,0.777778,964981247,0,1
2,1,6,0.777778,964982224,0,2
3,1,47,1.0,964983815,0,3
4,1,50,1.0,964982931,0,4


## 4. Splitting the Dataset

In [11]:
X = df[["user", "movie"]].values
y = df["rating"].values

In [12]:
(x_train, x_val, y_train, y_val) = train_test_split(X, y,test_size=0.1,random_state=42)

print("Shape of the x_train: ", x_train.shape)
print("Shape of the y_train: ", y_train.shape)
print("Shape of the x_val: ", x_val.shape)
print("Shape of the x_val: ", y_val.shape)

Shape of the x_train:  (90752, 2)
Shape of the y_train:  (90752,)
Shape of the x_val:  (10084, 2)
Shape of the x_val:  (10084,)


## 5. Building the Model

In [13]:
# In TensorFlow, apart from Sequential API and Functional API, 
# there is a third option to build models: Model Subclassing. 


class RecommenderNet(tf.keras.Model):
  # __init function is to initialize the values of instance members for the new object
  def __init__(self, num_users, num_movies, embedding_size,**kwargs):
    super(RecommenderNet, self).__init__(**kwargs)
    # Variable for embedding size
    self.embedding_size = embedding_size
    # Variables for user count, and related weights and biases
    self.num_users = num_users
    self.user_embedding = Embedding(num_users,embedding_size,embeddings_initializer="he_normal",
                                    embeddings_regularizer=tf.keras.regularizers.l2(1e-6),)
    self.user_bias = Embedding(num_users, 1)
    # Variables for movie count, and related weights and biases
    self.num_movies = num_movies
    self.movie_embedding = Embedding(num_movies,embedding_size,embeddings_initializer="he_normal",
                                     embeddings_regularizer=tf.keras.regularizers.l2(1e-6),)
    self.movie_bias = Embedding(num_movies, 1)
    
  def call(self, inputs):
    # call function is for the dot products of user and movie vectors
    # It also accepts the inputs, feeds them into the layers,and feed into the final sigmoid layer
    # User vector and bias values with input values
    user_vector = self.user_embedding(inputs[:, 0])
    user_bias = self.user_bias(inputs[:, 0])
    # Movie vector and bias values with input values
    movie_vector = self.movie_embedding(inputs[:, 1])
    movie_bias = self.movie_bias(inputs[:, 1])
    # tf.tensordot calculcates the dot product
    dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
    # Add all the components (including bias)
    x = dot_user_movie + user_bias + movie_bias
    # The sigmoid activation forces the rating to between 0 and 1
    return tf.nn.sigmoid(x)
  
# create an instance of this custom class to build our custom RecommenderNet model  
model = RecommenderNet(num_users, num_movies, embedding_size=50)

## 6. Compile and Train the Model

In [14]:
model.compile(loss='mse',optimizer=tf.keras.optimizers.Adam(lr=0.001))

history = model.fit(x=x_train,y=y_train,
                    batch_size=64,
                    epochs=5,
                    verbose=1,
                    validation_data=(x_val, y_val),)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## 7. Make Recommendations

In [15]:
# selecting user_id
user_id = df.userId.sample(1).iloc[0]
print("The selected user ID is: ", user_id)

# checking and listing user not watching
movies_watched = df[df.userId == user_id]
not_watched = df[~df['movieId'].isin(movies_watched.movieId.values)]['movieId'].unique()
not_watched = [[movie2movie_encoded.get(x)] for x in not_watched]

print('The number of movies the user has not seen before: ',len(not_watched))

The selected user ID is:  387
The number of movies the user has not seen before:  8697


In [16]:
# generate the predicted movie ratings
user_encoder = user2user_encoded.get(user_id)
user_movie_array = np.hstack(([[user_encoder]] * len(not_watched), not_watched ))
ratings = model.predict(user_movie_array).flatten()

In [17]:
# selecting top10 with order of ratings
top10_indices = ratings.argsort()[-10:][::-1]

In [18]:
# get the movie ids for the top10
recommended_movie_ids = [movie_encoded2movie.get(not_watched[x][0]) for x in top10_indices]

In [19]:
# Create a DataFrame from Movies.csv file
with ZipFile(movielens_path) as z:
  with z.open("ml-latest-small/movies.csv") as f:
    movie_df = pd.read_csv(f)
    
movie_df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [20]:
top_movies_user = (movies_watched.sort_values(by="rating", ascending=False).head(10).movieId.values)

In [21]:
movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)]

In [22]:
print("Movies with high ratings from user")
movie_df_rows[['title','genres']]

Movies with high ratings from user


Unnamed: 0,title,genres
602,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
685,Vertigo (1958),Drama|Mystery|Romance|Thriller
694,Casablanca (1942),Drama|Romance
907,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller
909,Apocalypse Now (1979),Action|Drama|War
913,"Third Man, The (1949)",Film-Noir|Mystery|Thriller
951,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller
3544,Mulholland Drive (2001),Crime|Drama|Film-Noir|Mystery|Thriller
4769,Nausicaä of the Valley of the Wind (Kaze no ta...,Adventure|Animation|Drama|Fantasy|Sci-Fi
5695,Old Boy (2003),Mystery|Thriller


In [23]:
recommended_movies = movie_df[movie_df["movieId"].
isin(recommended_movie_ids)]
print("Top 10 movie recommendations")
recommended_movies[['title','genres']]

Top 10 movie recommendations


Unnamed: 0,title,genres
4137,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
4159,Catch Me If You Can (2002),Crime|Drama
4800,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
4909,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi
5166,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
6710,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
7355,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
8274,Captain Phillips (2013),Adventure|Drama|Thriller|IMAX
8475,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi
8879,The Martian (2015),Adventure|Drama|Sci-Fi
