# Recommendation System
### Using Matrix Factorization Embeddings and Deep Feed Forward Regressor

The Embeddings and Regressor will be trained using MovieLens 20M Dataset.

References:
F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages. DOI=<http://dx.doi.org/10.1145/2827872>

Exploratory Data Analysis, Model Definition, Training and Fine-Tunning described step by step on this Notebook

In [0]:
from IPython.display import clear_output
!pip install --upgrade tensorflow-gpu
!pip install wandb
clear_output()

In [0]:
#Downloads and extract Dataset to local, wait for download, i dont want to put a progress bar here sorry
#You can run this on google colab for get faster downloads speeds
import os
import zipfile
import requests

if(not os.path.exists("./Datasets/MoviLens20M.zip")):

  resp = requests.get("http://files.grouplens.org/datasets/movielens/ml-20m.zip")

  os.mkdir("./Datasets")

  with open("./Datasets/MoviLens20M.zip", "wb") as f:
    f.write(resp.content)

  with zipfile.ZipFile("./Datasets/MoviLens20M.zip", "r") as zip_ref:
    zip_ref.extractall("./Datasets")


In [0]:
#Imports
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import wandb
import os

wandb.login()
from google.colab import drive
drive.mount('/content/drive')
clear_output()

## Exploratory Data Analysis

Simple exploration without too much fancy graphs

In [4]:
#Loads Dataset, we only need ratings.csv and movies.csv files, we can drop timestamp and genres for now
ratings_df = pd.read_csv("./Datasets/ml-20m/ratings.csv").drop(["timestamp"], axis=1)
movies_df = pd.read_csv("./Datasets/ml-20m/movies.csv").drop(["genres"], axis=1)

ml_df = ratings_df.merge(movies_df, on="movieId")

ml_df = ml_df.reindex(columns=["userId", "movieId", "title", "rating"])
ml_df.head()

Unnamed: 0,userId,movieId,title,rating
0,1,2,Jumanji (1995),3.5
1,5,2,Jumanji (1995),3.0
2,13,2,Jumanji (1995),3.0
3,29,2,Jumanji (1995),3.0
4,34,2,Jumanji (1995),3.0


In [5]:
#Check info about the Dataset
ml_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
userId     int64
movieId    int64
title      object
rating     float64
dtypes: float64(1), int64(2), object(1)
memory usage: 762.9+ MB


In [6]:
#Check for NaNs
ml_df.isna().sum()

userId     0
movieId    0
title      0
rating     0
dtype: int64

In [7]:
#List unique values of each column
n_users = ml_df["userId"].max()
n_movies = ml_df["movieId"].nunique()

print("Unique Users: " + str(n_users))
print("Unique Movies: " + str(n_movies))

Unique Users: 138493
Unique Movies: 26744


In [8]:
#Top movies with more rating count (dont confuse with more views or more rating score, but are correlated)
count = ml_df["title"].value_counts()
count[:15]

Pulp Fiction (1994)                          67310
Forrest Gump (1994)                          66172
Shawshank Redemption, The (1994)             63366
Silence of the Lambs, The (1991)             63299
Jurassic Park (1993)                         59715
Star Wars: Episode IV - A New Hope (1977)    54502
Braveheart (1995)                            53769
Terminator 2: Judgment Day (1991)            52244
Matrix, The (1999)                           51334
Schindler's List (1993)                      50054
Toy Story (1995)                             49695
Fugitive, The (1993)                         49581
Apollo 13 (1995)                             47777
Independence Day (a.k.a. ID4) (1996)         47048
Usual Suspects, The (1995)                   47006
Name: title, dtype: int64

## Preprocessing

In [9]:
#Normalize ratings
ml_df["rating_norm"] = ml_df["rating"] / 5.0
ml_df["userId"] = ml_df["userId"].astype("category").cat.codes.values
ml_df["movieId"] = ml_df["movieId"].astype("category").cat.codes.values
ml_df.head()

Unnamed: 0,userId,movieId,title,rating,rating_norm
0,0,1,Jumanji (1995),3.5,0.7
1,4,1,Jumanji (1995),3.0,0.6
2,12,1,Jumanji (1995),3.0,0.6
3,28,1,Jumanji (1995),3.0,0.6
4,33,1,Jumanji (1995),3.0,0.6


In [0]:
users = ml_df["userId"].values
movies = ml_df["movieId"].values
ratings = ml_df["rating_norm"].values.reshape([-1, 1])

In [0]:
#Create Datasets for train, evaluation and testing, and a full version of the dataset
ml_ds = tf.data.Dataset.from_tensor_slices(({"userId":users, "movieId":movies}, ratings)).shuffle(2048)
full_ds = ml_ds
eval_ds = ml_ds.take(10000).batch(10000)
ml_ds = ml_ds.skip(10000)
test_ds = ml_ds.take(500000).batch(50000)
train_ds = ml_ds.skip(500000)

## Matrix Factorization

Optimize embeddings for users and movies. This embeddings will be used later in the Regressor. For train the Factorizer, full dataset will be used, for optimal embeddings aver all the users / movies

In [0]:
#Model Definition
class MatrixFactorizer(tf.keras.Model):
  """This model will be used for optimize the embeddings, later will be discarded, just keeping the embedding layers weights"""

  def __init__(self, users, movies, emb_dim):
    super(MatrixFactorizer, self).__init__()

    self.user_emb = tf.keras.layers.Embedding(users, emb_dim)
    self.user_flat = tf.keras.layers.Flatten()

    self.movie_emb = tf.keras.layers.Embedding(movies, emb_dim)
    self.movie_flat = tf.keras.layers.Flatten()

    self.dot = tf.keras.layers.Dot(axes=1)

  def call(self, inputs):
    X_user = self.user_emb(inputs["userId"])
    X_user = self.user_flat(X_user)

    X_movie = self.movie_emb(inputs["movieId"])
    X_movie = self.movie_flat(X_movie)

    X = self.dot([X_user, X_movie])
    return X

In [15]:
#Start Recording to WandB servers
wandb.init(project="recommendation-system", group="MatrixFactorizer")

#Matrix Factorizer Hyperparams
f_emb_dim = 4
f_lr = 0.002
f_epochs = 10
f_batch_size = 40960

wandb.config.emb_dim = f_emb_dim
wandb.config.learning_rate = f_lr
wandb.config.epochs = f_epochs
wandb.config.batch_size = f_batch_size

In [16]:
#Model instantiation
factorizer = MatrixFactorizer(n_users, n_movies, f_emb_dim)
factorizer.compile(tf.keras.optimizers.Adam(f_lr), tf.keras.losses.MeanSquaredError(), metrics=[tf.keras.metrics.RootMeanSquaredError()])
factorizer.train_on_batch(train_ds.batch(1))
factorizer.summary()

Model: "matrix_factorizer_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      multiple                  553972    
_________________________________________________________________
flatten_2 (Flatten)          multiple                  0         
_________________________________________________________________
embedding_3 (Embedding)      multiple                  106976    
_________________________________________________________________
flatten_3 (Flatten)          multiple                  0         
_________________________________________________________________
dot_1 (Dot)                  multiple                  0         
Total params: 660,948
Trainable params: 660,948
Non-trainable params: 0
_________________________________________________________________


In [17]:
#Model fitting
factorizer.fit(full_ds.batch(f_batch_size), epochs=f_epochs, callbacks=[wandb.keras.WandbCallback(monitor="root_mean_squared_error", save_weights_only=True)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd21c8fef60>

In [0]:
#Save Embedding Matrix to disk
path = "/content/drive/My Drive/RecommendationSystem/Checkpoints/MatrixFactorizer/" + wandb.run.name
if(not os.path.exists(path)):
  os.mkdir(path)
  np.save(path + "/user_emb.npy", np.asarray(factorizer.user_emb.get_weights())[0])
  np.save(path + "/movie_emb.npy", np.asarray(factorizer.movie_emb.get_weights())[0])

!cp -r "/content/drive/My Drive/RecommendationSystem/Checkpoints" .

## Deep Feedforward Network

Will use pretrained embeddings for return an score that represent the weight of the match between an user and a movie

In [0]:
class DenseBlock(tf.keras.layers.Layer):
  "Dense + Dropout + BatchNorm"

  def __init__(self, units, dropout=0.1, l2=0.1):
    super(DenseBlock, self).__init__()

    self.bn = tf.keras.layers.BatchNormalization()
    self.drop = tf.keras.layers.Dropout(dropout)
    self.dense = tf.keras.layers.Dense(units, 
                                       "relu",
                                       kernel_regularizer=tf.keras.regularizers.L1L2(l2=l2),
                                       kernel_constraint=tf.keras.constraints.UnitNorm())
    
  def call(self, inputs):
    X = self.bn(inputs)
    X = self.drop(X)
    X = self.dense(X)
    return X

#Model Definition
class Recommender(tf.keras.Model):
  """Scores the match between an user and a movie, higher scores mean more affinity o the user for the movie"""

  def __init__(self, user_emb, movie_emb, dense_struct, dropout=0.1, l2=0.001):
    super(Recommender, self).__init__()

    self.user_emb = tf.keras.layers.Embedding(np.shape(user_emb)[0], np.shape(user_emb)[1], weights=[user_emb], trainable=False)
    self.user_flat = tf.keras.layers.Flatten()

    self.movie_emb = tf.keras.layers.Embedding(np.shape(movie_emb)[0], np.shape(movie_emb)[1], weights=[movie_emb], trainable=False)
    self.movie_flat = tf.keras.layers.Flatten()

    self.concat = tf.keras.layers.Concatenate()
    
    self.dense_list = list()
    for layer in dense_struct:
      self.dense_list.append(DenseBlock(layer, dropout, l2))

    self.dense_out = tf.keras.layers.Dense(1, 
                                           "sigmoid",
                                           kernel_regularizer=tf.keras.regularizers.L1L2(l2=l2),
                                           kernel_constraint=tf.keras.constraints.UnitNorm())
    
  def call(self, inputs):

    X_user = self.user_emb(inputs["userId"])
    X_user = self.user_flat(X_user)

    X_movie = self.movie_emb(inputs["movieId"])
    X_movie = self.movie_flat(X_movie)

    X = self.concat([X_user, X_movie])

    for layer in self.dense_list:
      X = layer(X)

    X = self.dense_out(X)
    return X



In [37]:
#Start Recording to WandB servers
wandb.init(project="recommendation-system", group="Recommender")

#Recommender Hyperparams
r_emb_dim = 4
r_lr = 0.0001
r_epochs = 20
r_l2 = 0.0000
r_dropout = 0.0
r_batch_size = 40960
r_emb_name = "charmed-haze-26"
r_dense_struct = [16, 4]

wandb.config.emb_dim = r_emb_dim
wandb.config.learning_rate = r_lr
wandb.config.epochs = r_epochs
wandb.config.l2 = r_l2
wandb.config.dropout = r_dropout
wandb.config.batch_size = r_batch_size
wandb.config.emb_name = r_emb_name
wandb.config.dense_struct = r_dense_struct

In [0]:
#Load Embedding Matrix Generated by Matrix Factorizer
user_emb, movie_emb = None, None
path = "./Checkpoints/MatrixFactorizer/" + r_emb_name
if(os.path.exists(path)):
  user_emb = np.load(path + "/user_emb.npy")
  movie_emb = np.load(path + "/movie_emb.npy")

In [39]:
#Model instantiation
recommender = Recommender(user_emb, movie_emb, r_dense_struct, r_dropout, r_l2)
recommender.compile(tf.keras.optimizers.Adam(r_lr), tf.keras.losses.MeanSquaredError(), metrics=[tf.keras.metrics.R])
recommender.train_on_batch(train_ds.batch(1).take(1))
recommender.summary()

Model: "recommender_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     multiple                  553972    
_________________________________________________________________
flatten_12 (Flatten)         multiple                  0         
_________________________________________________________________
embedding_13 (Embedding)     multiple                  106976    
_________________________________________________________________
flatten_13 (Flatten)         multiple                  0         
_________________________________________________________________
concatenate_4 (Concatenate)  multiple                  0         
_________________________________________________________________
dense_block_7 (DenseBlock)   multiple                  176       
_________________________________________________________________
dense_block_8 (DenseBlock)   multiple                

In [40]:
#Model fit
recommender.fit(train_ds.batch(r_batch_size), epochs=r_epochs, callbacks=[wandb.keras.WandbCallback(monitor="root_mean_squared_error", save_weights_only=True)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fd21a6c7c50>

In [41]:
recommender.evaluate(test_ds)



[0.04490119442343712, 0.21189901]