# Deep Learning for Content-Based Filtering


In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.nn import Linear, ReLU, Sequential
from torch.nn.functional import normalize
from torch.utils.data import DataLoader, TensorDataset
from utils import format_predictions, gen_user_vecs, load_data

torch.set_default_dtype(torch.float64)

In [2]:
top10_df = pd.read_csv("./data/content_top10_df.csv")
bygenre_df = pd.read_csv("./data/content_bygenre_df.csv")
top10_df

Unnamed: 0,movie id,num ratings,ave rating,title,genres
0,4993,198,4.106061,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
1,5952,188,4.021277,"Lord of the Rings: The Two Towers, The",Adventure|Fantasy
2,7153,185,4.118919,"Lord of the Rings: The Return of the King, The",Action|Adventure|Drama|Fantasy
3,4306,170,3.867647,Shrek,Adventure|Animation|Children|Comedy|Fantasy|Ro...
4,58559,149,4.238255,"Dark Knight, The",Action|Crime|Drama
5,6539,149,3.778523,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
6,79132,143,4.066434,Inception,Action|Crime|Drama|Mystery|Sci-Fi|Thriller
7,6377,141,3.960993,Finding Nemo,Adventure|Animation|Children|Comedy
8,4886,132,3.871212,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy
9,7361,131,4.160305,Eternal Sunshine of the Spotless Mind,Drama|Romance|Sci-Fi


In [3]:
bygenre_df

Unnamed: 0,genre,num movies,ave rating/genre,ratings per genre
0,Action,321,3.37,10377
1,Adventure,234,3.42,8785
2,Animation,76,3.63,2588
3,Children,69,3.44,2472
4,Comedy,326,3.36,8911
5,Crime,139,3.54,4671
6,Documentary,13,3.81,280
7,Drama,342,3.61,10201
8,Fantasy,124,3.37,4468
9,Horror,56,3.2,1345


## Training Data


In [4]:
# Load Data, set configuration variables
(
    item_train,
    user_train,
    y_train,
    item_features,
    user_features,
    item_vecs,
    movie_dict,
    user_to_genre,
) = load_data()

num_user_features = (
    user_train.shape[1] - 3
)  # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 50884


In [5]:
user_train_df = pd.DataFrame(user_train, columns=user_features)
user_train_df.head()

Unnamed: 0,user id,rating count,rating ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89
1,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89
2,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89
3,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89
4,2.0,22.0,4.0,3.95,4.25,0.0,0.0,4.0,4.12,4.0,4.04,0.0,3.0,4.0,0.0,3.88,3.89


In [6]:
item_train_df = pd.DataFrame(item_train, columns=item_features)
item_train_df.head()

Unnamed: 0,movie id,year,ave rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,6874.0,2003.0,3.961832,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,8798.0,2004.0,3.761364,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,46970.0,2006.0,3.25,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,48516.0,2006.0,4.252336,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,58559.0,2008.0,4.238255,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
print(f"y_train[:5]: {y_train[:5]}")

y_train[:5]: tensor([4.0000, 3.5000, 4.0000, 4.0000, 4.5000])


In [8]:
# scale training data
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled = y_train

scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train = torch.from_numpy(scalerItem.transform(item_train))

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = torch.from_numpy(scalerUser.transform(user_train))

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = torch.from_numpy(scalerTarget.transform(y_train.reshape(-1, 1)))
# ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

print(
    torch.allclose(
        item_train_unscaled, torch.from_numpy(scalerItem.inverse_transform(item_train))
    )
)
print(
    torch.allclose(
        user_train_unscaled, torch.from_numpy(scalerUser.inverse_transform(user_train))
    )
)

True
True


In [9]:
item_train, item_test = train_test_split(
    item_train, train_size=0.80, shuffle=True, random_state=1
)
user_train, user_test = train_test_split(
    user_train, train_size=0.80, shuffle=True, random_state=1
)
y_train, y_test = train_test_split(
    y_train, train_size=0.80, shuffle=True, random_state=1
)

print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: torch.Size([40707, 17])
movie/item test data shape: torch.Size([10177, 17])


## Neural Network


In [10]:
num_ouputs = 32


class L2Normalize(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, input: torch.Tensor):
        return normalize(input, p=2, dim=1)


user_NN = Sequential(
    Linear(num_user_features, 256),
    ReLU(),
    Linear(256, 128),
    ReLU(),
    Linear(128, num_ouputs),
    L2Normalize(),
)

item_NN = Sequential(
    Linear(num_item_features, 256),
    ReLU(),
    Linear(256, 128),
    ReLU(),
    Linear(128, num_ouputs),
    L2Normalize(),
)


class RecoModel(torch.nn.Module):
    def __init__(self, user_NN, item_NN):
        super().__init__()
        self.user_NN = user_NN
        self.item_NN = item_NN

    def forward(self, user_input, item_input):
        vu = self.user_NN(user_input)
        vm = self.item_NN(item_input)

        return (vu * vm).sum(dim=1).reshape(-1, 1)


model = RecoModel(user_NN, item_NN)

model

RecoModel(
  (user_NN): Sequential(
    (0): Linear(in_features=14, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=32, bias=True)
    (5): L2Normalize()
  )
  (item_NN): Sequential(
    (0): Linear(in_features=16, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=32, bias=True)
    (5): L2Normalize()
  )
)

## Training Loop


In [11]:
cost_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

dataset = TensorDataset(user_train[:, u_s:], item_train[:, i_s:], y_train)
dataloader = DataLoader(dataset=dataset, batch_size=32)

epochs = 30

for epoch in range(epochs):
    print(f"epoch {epoch+1}/{epochs}")

    loss_history = []
    for user_input, item_input, labels in dataloader:
        optimizer.zero_grad()

        output = model(user_input, item_input)

        loss = cost_fn(output, labels)

        loss.backward()

        loss_history.append(loss.item())

        optimizer.step()

    mean_loss = torch.tensor(loss_history).mean().item()

    print("epoch {}, loss {}".format(epoch + 1, mean_loss))

epoch 1/30


epoch 1, loss 0.12386198396520565
epoch 2/30
epoch 2, loss 0.11588322229885556
epoch 3/30
epoch 3, loss 0.11207546551710791
epoch 4/30
epoch 4, loss 0.10850509003639484
epoch 5/30
epoch 5, loss 0.1050365433147336
epoch 6/30
epoch 6, loss 0.10215143194096875
epoch 7/30
epoch 7, loss 0.09953888913430471
epoch 8/30
epoch 8, loss 0.09739094018975786
epoch 9/30
epoch 9, loss 0.09535210099681009
epoch 10/30
epoch 10, loss 0.09331831955625342
epoch 11/30
epoch 11, loss 0.0914626475524305
epoch 12/30
epoch 12, loss 0.08986181498263655
epoch 13/30
epoch 13, loss 0.08826774822369327
epoch 14/30
epoch 14, loss 0.08667868377619695
epoch 15/30
epoch 15, loss 0.08519385586650574
epoch 16/30
epoch 16, loss 0.08378955262510014
epoch 17/30
epoch 17, loss 0.08256704870262417
epoch 18/30
epoch 18, loss 0.08134393330429568
epoch 19/30
epoch 19, loss 0.08022223713899983
epoch 20/30
epoch 20, loss 0.07921746720806809
epoch 21/30
epoch 21, loss 0.07828227612691721
epoch 22/30
epoch 22, loss 0.077434806122637

### Evaluate the model


In [12]:
model.eval()
with torch.no_grad():
    output = model(user_test[:, u_s:], item_test[:, i_s:])

cost_fn(output, y_test)

tensor(0.0839)

## Predictions


In [13]:
new_user_id = 5000
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_rating_count = 3

user_vec = torch.tensor(
    [
        [
            new_user_id,
            new_rating_count,
            new_rating_ave,
            new_action,
            new_adventure,
            new_animation,
            new_childrens,
            new_comedy,
            new_crime,
            new_documentary,
            new_drama,
            new_fantasy,
            new_horror,
            new_mystery,
            new_romance,
            new_scifi,
            new_thriller,
        ]
    ]
)

In [14]:
# generate and replicate the user vector to match the number movies in the data set.
user_vecs = gen_user_vecs(user_vec, len(item_vecs))

# scale our user and item vectors
suser_vecs = torch.from_numpy(scalerUser.transform(user_vecs))
sitem_vecs = torch.from_numpy(scalerItem.transform(item_vecs))

with torch.no_grad():
    # make a prediction
    y_p = model(suser_vecs[:, u_s:], sitem_vecs[:, i_s:])

# unscale y prediction
y_pu = torch.from_numpy(scalerTarget.inverse_transform(y_p))

# sort the results, highest prediction first
sorted_index = (
    torch.argsort(-y_pu, axis=0).reshape(-1).tolist()
)  # negate to get largest rating first
sorted_ypu = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  # using unscaled vectors for display

format_predictions(sorted_ypu, sorted_items, movie_dict)[:10]

Unnamed: 0,y_p,movie_id,avg_rating,title,genres
0,4.135366,69844,3.887931,Harry Potter and the Half-Blood Prince (2009),Adventure|Fantasy|Mystery|Romance
1,4.134282,98809,3.8125,"Hobbit: An Unexpected Journey, The (2012)",Adventure|Fantasy
2,4.115751,106489,3.58,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy
3,4.107003,54001,3.862069,Harry Potter and the Order of the Phoenix (2007),Adventure|Drama|Fantasy
4,4.101139,112175,3.766667,How to Train Your Dragon 2 (2014),Action|Adventure|Animation
5,4.100491,137857,3.636364,The Jungle Book (2016),Adventure|Drama|Fantasy
6,4.059742,59387,3.954545,"Fall, The (2006)",Adventure|Drama|Fantasy
7,4.03602,88125,3.91,Harry Potter and the Deathly Hallows: Part 2 (...,Action|Adventure|Drama|Fantasy|Mystery
8,4.022757,27846,3.9,"Corporation, The (2003)",Documentary
9,4.020507,5816,3.598039,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy


## Finding Similar Items


In [22]:
def sq_dist(a, b):
    """
    Returns the squared distance between two vectors
    Args:
      a (ndarray (n,)): vector with n features
      b (ndarray (n,)): vector with n features
    Returns:
      d (float) : distance
    """
    ### START CODE HERE ###
    d = torch.sum((a - b) ** 2)
    ### END CODE HERE ###
    return d

In [19]:
item_model = model.item_NN

item_model

Sequential(
  (0): Linear(in_features=16, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=128, bias=True)
  (3): ReLU()
  (4): Linear(in_features=128, out_features=32, bias=True)
  (5): L2Normalize()
)

In [21]:
scaled_item_vecs = torch.from_numpy(scalerItem.transform(item_vecs))
with torch.no_grad():
    vms = item_model(scaled_item_vecs[:, i_s:])

print(f"size of all predicted movie feature vectors: {vms.shape}")

size of all predicted movie feature vectors: torch.Size([847, 32])


In [24]:
count = 50  # number of movies to display
dim = len(vms)
dist = torch.zeros((dim, dim))

for i in range(dim):
    for j in range(dim):
        dist[i, j] = sq_dist(vms[i, :], vms[j, :])

m_dist = np.ma.masked_array(dist, mask=np.identity(dist.shape[0]))  # mask the diagonal
m_dist
disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    movie1_id = int(item_vecs[i, 0])
    movie2_id = int(item_vecs[min_idx, 0])
    disp.append(
        [
            movie_dict[movie1_id]["title"],
            movie_dict[movie1_id]["genres"],
            movie_dict[movie2_id]["title"],
            movie_dict[movie1_id]["genres"],
        ]
    )

[['movie1', 'genres', 'movie2', 'genres'],
 ['Save the Last Dance (2001)',
  'Drama|Romance',
  'Mona Lisa Smile (2003)',
  'Drama|Romance'],
 ['Wedding Planner, The (2001)',
  'Comedy|Romance',
  'Mr. Deeds (2002)',
  'Comedy|Romance'],
 ['Hannibal (2001)',
  'Horror|Thriller',
  'Final Destination 2 (2003)',
  'Horror|Thriller'],
 ['Saving Silverman (Evil Woman) (2001)',
  'Comedy|Romance',
  'Down with Love (2003)',
  'Comedy|Romance'],
 ['Down to Earth (2001)',
  'Comedy|Fantasy|Romance',
  'Bewitched (2005)',
  'Comedy|Fantasy|Romance'],
 ['Mexican, The (2001)',
  'Action|Comedy',
  'Rush Hour 2 (2001)',
  'Action|Comedy'],
 ['15 Minutes (2001)', 'Thriller', 'Panic Room (2002)', 'Thriller'],
 ['Enemy at the Gates (2001)', 'Drama', 'Aviator, The (2004)', 'Drama'],
 ['Heartbreakers (2001)',
  'Comedy|Crime|Romance',
  'Fun with Dick and Jane (2005)',
  'Comedy|Crime|Romance'],
 ['Spy Kids (2001)',
  'Action|Adventure|Children|Comedy',
  'Scooby-Doo (2002)',
  'Action|Adventure|Child

In [25]:
pd.DataFrame(data=disp[1:], columns=disp[0])

Unnamed: 0,movie1,genres,movie2,genres.1
0,Save the Last Dance (2001),Drama|Romance,Mona Lisa Smile (2003),Drama|Romance
1,"Wedding Planner, The (2001)",Comedy|Romance,Mr. Deeds (2002),Comedy|Romance
2,Hannibal (2001),Horror|Thriller,Final Destination 2 (2003),Horror|Thriller
3,Saving Silverman (Evil Woman) (2001),Comedy|Romance,Down with Love (2003),Comedy|Romance
4,Down to Earth (2001),Comedy|Fantasy|Romance,Bewitched (2005),Comedy|Fantasy|Romance
5,"Mexican, The (2001)",Action|Comedy,Rush Hour 2 (2001),Action|Comedy
6,15 Minutes (2001),Thriller,Panic Room (2002),Thriller
7,Enemy at the Gates (2001),Drama,"Aviator, The (2004)",Drama
8,Heartbreakers (2001),Comedy|Crime|Romance,Fun with Dick and Jane (2005),Comedy|Crime|Romance
9,Spy Kids (2001),Action|Adventure|Children|Comedy,Scooby-Doo (2002),Action|Adventure|Children|Comedy
