# Deep Learning and Content-Based Filtering

In [2]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras, optimizers
from keras.layers import Dense, Input
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
import recsysNN_utils
import public_tests
pd.set_option("display.precision", 1)

In [3]:
top10_df = pd.read_csv("./data/content_top10_df.csv")
bygenre_df = pd.read_csv("./data/content_bygenre_df.csv")

In [4]:
top10_df

Unnamed: 0,movie id,num ratings,ave rating,title,genres
0,4993,198,4.1,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
1,5952,188,4.0,"Lord of the Rings: The Two Towers, The",Adventure|Fantasy
2,7153,185,4.1,"Lord of the Rings: The Return of the King, The",Action|Adventure|Drama|Fantasy
3,4306,170,3.9,Shrek,Adventure|Animation|Children|Comedy|Fantasy|Ro...
4,58559,149,4.2,"Dark Knight, The",Action|Crime|Drama
5,6539,149,3.8,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
6,79132,143,4.1,Inception,Action|Crime|Drama|Mystery|Sci-Fi|Thriller
7,6377,141,4.0,Finding Nemo,Adventure|Animation|Children|Comedy
8,4886,132,3.9,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy
9,7361,131,4.2,Eternal Sunshine of the Spotless Mind,Drama|Romance|Sci-Fi


In [5]:
bygenre_df

Unnamed: 0,genre,num movies,ave rating/genre,ratings per genre
0,Action,321,3.4,10377
1,Adventure,234,3.4,8785
2,Animation,76,3.6,2588
3,Children,69,3.4,2472
4,Comedy,326,3.4,8911
5,Crime,139,3.5,4671
6,Documentary,13,3.8,280
7,Drama,342,3.6,10201
8,Fantasy,124,3.4,4468
9,Horror,56,3.2,1345


In [6]:
item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict, user_to_genre = recsysNN_utils.load_data()
num_user_features = user_train.shape[1] - 3  # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 50884


In [7]:
recsysNN_utils.pprint_train(user_train, user_features, uvs,  u_s, maxcount=5)

[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9


In [8]:
recsysNN_utils.pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=False)

[movie id],year,ave rating,Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
6874,2003,4.0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
8798,2004,3.8,1,0,0,0,0,1,0,1,0,0,0,0,0,1
46970,2006,3.2,1,0,0,0,1,0,0,0,0,0,0,0,0,0
48516,2006,4.3,0,0,0,0,0,1,0,1,0,0,0,0,0,1
58559,2008,4.2,1,0,0,0,0,1,0,1,0,0,0,0,0,0


In [9]:
# Feature Scaling
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = scalerUser.transform(user_train)

print(f'y_train shape: {y_train.shape}')

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))

y_train.shape


y_train shape: (50884,)


(50884, 1)

In [10]:
item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (40707, 17)
movie/item test data shape: (10177, 17)


## Neural Network

In [21]:
class Normalizer(keras.Layer):
  axis: int

  def __init__(self, axis=-1, **kwargs):
    super().__init__(**kwargs)
    self.axis = axis

  def call(self, x):
    return tf.linalg.l2_normalize(x, axis=self.axis)

  def get_config(self):
    cfg = super().get_config()
    cfg.update({"axis": self.axis})
    return cfg


user_vector_starting_index = 3
item_vector_starting_index = 1

In [17]:
n_output = 32
tf.random.set_seed(1)

user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(n_output, activation='linear'),
    Normalizer(axis=1, name='user_normalizer')
], name='user_NN')

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(n_output, activation='linear'), 
    Normalizer(axis=1, name='item_normalizer')
], name='item_NN')

input_user = Input(shape=(num_user_features,), name='input_user')
Vu = user_NN(input_user)

input_item = tf.keras.layers.Input(shape=(num_item_features,), name='input_item')
Vm = item_NN(input_item)

output = keras.layers.Dot(axes=1)([Vu, Vm])

whole_model = keras.Model(
  inputs= {
    'user_NN': input_user,
    'item_NN': input_item
  },
  outputs=output,
  name='whole_model')

whole_model.summary()



In [18]:
## Tests won't pass due to the Normalizer layer
# public_tests.test_tower(user_NN)
# public_tests.test_tower(item_NN)

In [19]:
tf.random.set_seed(1)

whole_model.build(input_shape=())

whole_model.compile(
  optimizer=optimizers.Adam(learning_rate=0.01),
  loss=keras.losses.MeanSquaredError()
)

In [22]:
whole_model.fit(
  {
    'user_NN': user_train[:, user_vector_starting_index:],
    'item_NN': item_train[:, item_vector_starting_index:]
  },  # User All columns starting from 4 (1,2,3, id, rating_count, rating_ave, are user features that are unrelated with the output); Movies: same, feature 0 is movie ID
  y_train,
  epochs=1
);

[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 891us/step - loss: 0.1138


In [23]:
whole_model.evaluate(
  {
    'user_NN': user_test[:, user_vector_starting_index:],
    'item_NN': item_test[:, item_vector_starting_index:]
  },
  y_test
)

[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1117 


0.10825776308774948

In [None]:
# If training loss (0.10825776308774948) and test loss (0.1117) are comparable the model is not suffering from overfitting

## Predictions

In [25]:
# A user that enjoys fantasy and adventure
new_user_id = 5000
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_rating_count = 3

user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
  new_action, new_adventure, new_animation, new_childrens,
  new_comedy, new_crime, new_documentary,
  new_drama, new_fantasy, new_horror, new_mystery,
  new_romance, new_scifi, new_thriller]])



In [26]:
user_vecs = recsysNN_utils.gen_user_vecs(user_vec, len(item_vecs))
scaled_user_vecs = scalerUser.transform(user_vecs)
scaled_item_vects = scalerItem.transform(item_vecs)

In [27]:
# Actual predictions

y_preds = whole_model.predict({
  'user_NN': scaled_user_vecs[:, user_vector_starting_index:],
  'item_NN': scaled_item_vects[:, item_vector_starting_index:]
})

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step


In [35]:
y_preds = scalerTarget.inverse_transform(y_preds) # revert the scaling

sorted_indexes = np.argsort(-y_preds, axis=0).reshape(-1).tolist()   # negate for descending order
sorted_predictions = y_preds[sorted_indexes] # advanced indexing
sorted_items = item_vecs[sorted_indexes]

recsysNN_utils.print_pred_movies(sorted_predictions, sorted_items, movie_dict, maxcount=10)

y_p,movie id,rating ave,title,genres
147.1,5618,4.2,Spirited Away (Sen to Chihiro no kamikakushi) (2001),Adventure|Animation|Fantasy
145.6,31658,4.1,Howl's Moving Castle (Hauru no ugoku shiro) (2004),Adventure|Animation|Fantasy|Romance
145.2,36708,3.7,Family Guy Presents Stewie Griffin: The Untold Story (2005),Adventure|Animation|Comedy
145.0,4993,4.1,"Lord of the Rings: The Fellowship of the Ring, The (2001)",Adventure|Fantasy
144.8,5952,4.0,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
144.2,8368,3.9,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy
143.7,4886,3.9,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
143.0,103141,3.9,Monsters University (2013),Adventure|Animation|Comedy
143.0,59387,4.0,"Fall, The (2006)",Adventure|Drama|Fantasy
142.8,4973,4.2,"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",Comedy|Romance


## Predictions for an existing user

In [38]:
uid = 2

user_vecs, y_vecs = recsysNN_utils.get_user_vecs(uid, user_train_unscaled, item_vecs, user_to_genre)

# scale our user and item vectors
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# make a prediction
y_preds = whole_model.predict({
  'user_NN': suser_vecs[:, user_vector_starting_index:],
  'item_NN': sitem_vecs[:, item_vector_starting_index:]
})

# unscale y prediction 
y_pu = scalerTarget.inverse_transform(y_preds)

# sort the results, highest prediction first
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display
sorted_user  = user_vecs[sorted_index]
sorted_y     = y_vecs[sorted_index]

#print sorted predictions for movies rated by the user
recsysNN_utils.print_existing_user(sorted_ypu, sorted_y.reshape(-1,1), sorted_user, sorted_items, ivs, uvs, movie_dict, maxcount = 50)

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


y_p,y,user,user genre ave,movie rating ave,movie id,title,genres
4.3,4.0,2,"[4.1,4.0,3.9]",4.3,48516,"Departed, The (2006)",Crime|Drama|Thriller
4.2,4.0,2,"[4.0,4.1,4.0,4.0,3.9,3.9]",4.1,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller
4.2,4.5,2,"[4.1,4.0,3.9]",4.0,80489,"Town, The (2010)",Crime|Drama|Thriller
4.1,4.0,2,"[4.0,4.0,3.9]",4.0,74458,Shutter Island (2010),Drama|Mystery|Thriller
4.1,5.0,2,[4.0],4.3,80906,Inside Job (2010),Documentary
4.1,4.5,2,"[4.0,4.1,4.0]",4.2,58559,"Dark Knight, The (2008)",Action|Crime|Drama
4.1,3.0,2,"[4.0,4.0,3.0]",3.9,71535,Zombieland (2009),Action|Comedy|Horror
4.1,3.5,2,"[4.0,3.9,3.9]",3.9,115713,Ex Machina (2015),Drama|Sci-Fi|Thriller
4.0,4.0,2,"[4.0,4.1,3.9]",4.0,6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller
4.0,3.5,2,"[4.0,4.1,4.0,3.9]",3.8,8798,Collateral (2004),Action|Crime|Drama|Thriller


## Finding similar items

In [39]:
def squared_distance(a, b):
  return np.sum((a - b)**2)

In [42]:
a1 = np.array([1.0, 2.0, 3.0]); b1 = np.array([1.0, 2.0, 3.0])
a2 = np.array([1.1, 2.1, 3.1]); b2 = np.array([1.0, 2.0, 3.0])
a3 = np.array([0, 1, 0]);       b3 = np.array([1, 0, 0])
print(f"squared distance between a1 and b1: {squared_distance(a1, b1):0.3f}")
print(f"squared distance between a2 and b2: {squared_distance(a2, b2):0.3f}")
print(f"squared distance between a3 and b3: {squared_distance(a3, b3):0.3f}")

public_tests.test_sq_dist(squared_distance)

squared distance between a1 and b1: 0.000
squared distance between a2 and b2: 0.030
squared distance between a3 and b3: 2.000
[92mAll tests passed!


## Offline computation of a distance matrix for items

In [43]:
input_item_m = Input(shape=(num_item_features,))
Vm_m = item_NN(input_item_m)
model_for_item = keras.Model(input_item_m, Vm_m)
model_for_item.summary()

In [44]:
scaled_item_vecs = scalerItem.transform(item_vecs)
Vms = model_for_item.predict(scaled_item_vecs[:, item_vector_starting_index:])
Vms.shape

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step


(847, 32)

In [47]:
# 1. compute the matrix
# 2. find the lowest values (most similar) in the row

max_items = 50

l = len(Vms)  # the matrix will be of shape len(VMs) x len(VMs)
distance_matrix = np.zeros((l, l))

for i in range(l):
  for j in range(l):
    distance_matrix[i, j] = squared_distance(Vms[i, :], Vms[j, :])

masked_distance_matrix = ma.masked_array(distance_matrix, mask=np.identity(distance_matrix.shape[0]))

In [49]:
display = [['movie1', 'genres', 'movie2', 'genres']]

for i in range(max_items):
  min_idx = np.argmin(masked_distance_matrix[i])
  movie1_id = int(item_vecs[i, 0])
  movie2_id = int(item_vecs[min_idx, 0])
  display.append( [movie_dict[movie1_id]['title'], movie_dict[movie1_id]['genres'],
    movie_dict[movie2_id]['title'], movie_dict[movie1_id]['genres']]
  )

table = tabulate.tabulate(display, tablefmt='html', headers="firstrow")
table

movie1,genres,movie2,genres.1
Save the Last Dance (2001),Drama|Romance,Mona Lisa Smile (2003),Drama|Romance
"Wedding Planner, The (2001)",Comedy|Romance,Mr. Deeds (2002),Comedy|Romance
Hannibal (2001),Horror|Thriller,Final Destination 2 (2003),Horror|Thriller
Saving Silverman (Evil Woman) (2001),Comedy|Romance,Stuck on You (2003),Comedy|Romance
Down to Earth (2001),Comedy|Fantasy|Romance,"Legally Blonde 2: Red, White & Blonde (2003)",Comedy|Fantasy|Romance
"Mexican, The (2001)",Action|Comedy,Rush Hour 2 (2001),Action|Comedy
15 Minutes (2001),Thriller,Panic Room (2002),Thriller
Enemy at the Gates (2001),Drama,"Aviator, The (2004)",Drama
Heartbreakers (2001),Comedy|Crime|Romance,Fun with Dick and Jane (2005),Comedy|Crime|Romance
Spy Kids (2001),Action|Adventure|Children|Comedy,Maleficent (2014),Action|Adventure|Children|Comedy
