In [1]:
import torch
import torch.nn.functional as F

from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import heapq
import math

import scipy.sparse as sp
import numpy as np
import pandas as pd

import random


https://grouplens.org/datasets/movielens/

In [2]:
movie_data = pd.read_csv('./datasets/movies/ml-latest-small/ratings.csv', 
                         sep=',', header=0)

movie_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
NUM_USERS = movie_data['userId'].max()

NUM_USERS

610

In [4]:
NUM_ITEMS = movie_data['movieId'].max()

NUM_ITEMS

193609

In [5]:
TEST_USER_IDS = [random.randint(1,NUM_USERS) for i in range(100)]
TEST_USER_IDS

[504,
 427,
 512,
 383,
 68,
 432,
 435,
 469,
 42,
 214,
 205,
 155,
 128,
 583,
 489,
 564,
 93,
 364,
 334,
 9,
 345,
 72,
 133,
 330,
 131,
 572,
 551,
 343,
 398,
 228,
 608,
 452,
 12,
 442,
 241,
 155,
 372,
 204,
 279,
 113,
 101,
 523,
 447,
 106,
 372,
 292,
 371,
 49,
 461,
 444,
 146,
 554,
 370,
 398,
 361,
 88,
 84,
 263,
 189,
 43,
 251,
 353,
 228,
 587,
 137,
 409,
 307,
 122,
 215,
 180,
 478,
 362,
 435,
 4,
 276,
 429,
 476,
 273,
 427,
 49,
 214,
 127,
 370,
 191,
 109,
 262,
 336,
 589,
 402,
 323,
 164,
 59,
 306,
 179,
 284,
 185,
 310,
 560,
 79,
 361]

In [6]:
test_movie_users = movie_data[movie_data['userId'].isin(TEST_USER_IDS)]

test_movie_users.head()

Unnamed: 0,userId,movieId,rating,timestamp
300,4,21,3.0,986935199
301,4,32,2.0,945173447
302,4,45,3.0,986935047
303,4,47,2.0,945173425
304,4,52,3.0,964622786


In [7]:
def load_ratings_matrix(movie_data):

    ratings_matrix = sp.dok_matrix((NUM_USERS + 1, NUM_ITEMS + 1), dtype=np.float32)

    for index, row in movie_data.iterrows():
        user, item, rating = int(row['userId']), int(row['movieId']), float(row['rating'])
        
        ratings_matrix[user, item] = rating
    
    random_user = np.random.randint(1, NUM_USERS)
    
    return ratings_matrix

In [8]:
ratings_matrix = load_ratings_matrix(movie_data)

ratings_matrix.shape

(611, 193610)

In [9]:
class RecommenderNN(nn.Module):

    def __init__(self, n_users, n_items, layers=[24, 16], dropout=0.2):

        super().__init__()

        assert (layers[0] % 2 == 0), "layers[0] must be an even number"

        self.dropout = dropout

        embedding_dim = int(layers[0] / 2)

        self.user_embedding = torch.nn.Embedding(n_users, embedding_dim)
        self.item_embedding = torch.nn.Embedding(n_items, embedding_dim)

        self.fc_layers = torch.nn.ModuleList()

        for _, (in_size, out_size) in enumerate(zip(layers[:-1], layers[1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        # Output of the last layer is just 1 for predicting ratings values
        self.output_layer = torch.nn.Linear(layers[-1], 1)

    def forward(self, users, items):
        user_embedding = self.user_embedding(users)
        item_embedding = self.item_embedding(items)

        # Concatenate user and item embeddings, this is the input to the NN
        x = torch.cat([user_embedding, item_embedding], 1)
        
        for idx, _ in enumerate(range(len(self.fc_layers))):
            x = self.fc_layers[idx](x)
            x = F.relu(x)
            x = F.dropout(x,  p=self.dropout, training=self.training)
        
        rating = self.output_layer(x)

        return rating

    def predict(self, users, items):
        output_scores = self.forward(users, items)

        return output_scores.cpu().detach().numpy()

In [10]:
def generate_training_instances(ratings_matrix):
    
    user_item_ratings = {}
    index = 0
    
    for user, item in ratings_matrix.keys():
        
        user_item_ratings[index] = (user, item, ratings_matrix[user, item])
        index += 1
    
    return user_item_ratings

In [11]:
train_user_item_ratings = generate_training_instances(ratings_matrix)

len(train_user_item_ratings)

100836

In [12]:
train_user_item_ratings[0], train_user_item_ratings[3]

((1, 1, 4.0), (1, 47, 5.0))

In [13]:
device = torch.device("mps" if torch.has_mps else "cpu")
device

device(type='mps')

In [14]:
def train(model, train_data_loader, criterion, optimizer, epoch):
    
    model.train()
    
    epoch_loss = []
    
    for users_items_rating in train_data_loader:
        
        users, items, ratings = users_items_rating
        users = users.to(device)
        items = items.to(device)
        ratings = ratings.to(device)
        predictions = model(users, items)
        

        ratings = ratings.float().view(predictions.size())
        
        loss = criterion(predictions, ratings)
        
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        
        epoch_loss.append(loss.item())
        
    epoch_loss = np.mean(epoch_loss)

    print("Epoch completed", epoch)
    
    print("Train Loss: {%.4f}" % (epoch_loss))

In [15]:
def load_zero_rated(ratings_matrix, user_id, user_item_ratings): 
    
    for i in range(100):
        potential_zero_item = np.random.randint(1, NUM_ITEMS)
        
        while (user_id, potential_zero_item) in ratings_matrix:
            potential_zero_item = np.random.randint(1, NUM_ITEMS)
            
        user_item_ratings['users'] = \
            np.append(user_item_ratings['users'], np.array([user_id]))
        
        user_item_ratings['items'] = \
            np.append(user_item_ratings['items'], np.array([potential_zero_item]))
        
        user_item_ratings['ratings'] = \
            np.append(user_item_ratings['ratings'], np.array([0]))

In [16]:
def generate_test_instances(ratings_matrix, test_movie_users):

    test_list = []

    for user_id in TEST_USER_IDS:
        
        user_item_ratings_df = test_movie_users[test_movie_users['userId'] == user_id]
        user_item_ratings_df = user_item_ratings_df[user_item_ratings_df['rating'] >= 4]
        
        user_item_ratings = {}
        
        user_item_ratings['users'] = user_item_ratings_df['userId'].values[5:15]
        user_item_ratings['items'] = user_item_ratings_df['movieId'].values[5:15]
        user_item_ratings['ratings'] = user_item_ratings_df['rating'].values[5:15]
        
        load_zero_rated(ratings_matrix, user_id, user_item_ratings)
        
        test_list.append(user_item_ratings)
        
    return test_list

In [17]:
def get_apk(arr1, arr2):
    sum, so_far, num_correct = 0, 0, 0
    
    for num in (arr2):
        so_far += 1
        if num in arr1:
            num_correct += 1
        sum += (num_correct/so_far)
    return sum/so_far

In [18]:
i = [1, 2, 3]
2 in i

True

In [19]:
def evaluate(model, test_list):

    model.eval()
    
    apks = []

    for user_item_ratings in test_list:
        
        users = torch.tensor(user_item_ratings['users'])
        items = torch.tensor(user_item_ratings['items'])
        ratings = user_item_ratings['ratings']

        users = users.to(device)
        items = items.to(device)
        #ratings = ratings.to(device)

        predictions = model.predict(users, items)
        
        item_score_map = {}
        
        for i, item in enumerate(user_item_ratings['items']):
            item_score_map[item] = predictions[i]
        
        rank_list = heapq.nlargest(100, item_score_map, key=item_score_map.get)
        
        items_list = items.detach().cpu().numpy().tolist()
        rank_list = list(rank_list)

        apk = get_apk(items_list[:10], rank_list[:10])
        
        apks.append(apk)
        
    print("Evaluation mean APK : " + str(np.mean(apks)))
    return np.mean(apks)

In [20]:
test_list = generate_test_instances(ratings_matrix, test_movie_users)
test_list

[{'users': array([504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504,
         504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504,
         504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504,
         504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504,
         504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504,
         504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504,
         504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504,
         504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504, 504,
         504, 504, 504, 504, 504, 504]),
  'items': array([   852,   1288,   1307,   1682,   1797,   1968,   2289,   2321,
           2353,   2671, 118331,  73682, 190990, 131516,  29350,   2472,
          79522,   6844, 106670,  70144,  20489, 178175, 141764,  87243,
         126512, 156930,  37075,  57575, 143962,  46067, 130537, 190552,
           5798, 130322,  49568,  60609,  12754,  25635, 

In [21]:
n_epochs = [5, 10, 15]
n_lrs = [0.00001, 0.0001, 0.001]
best_apk = -1
best_epochs = None
best_lr = None

criterion = torch.nn.MSELoss()
train_data_loader = DataLoader(train_user_item_ratings, batch_size=100, shuffle=True, num_workers=0)

for epochs in n_epochs:
    for lr in n_lrs:
        print("For lr = " + str(lr) + " and epochs = " + str(epochs))
        model = RecommenderNN(NUM_USERS + 1, NUM_ITEMS + 1, [32, 16, 8], dropout=0.2)
        model = model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.00001)
        for epoch in range(1, epochs + 1):
            train(model, train_data_loader, criterion, optimizer, epoch)
            apk = evaluate(model, test_list)
        if apk > best_apk:
            best_apk = apk
            best_epochs = epochs
            best_lr = lr
        model = model.cpu()
        del model
print("Best apk: " + str(best_apk) + " resulted from the best hyperparameters: " + "lr = " + str(best_lr) + ", epochs = " + str(best_epochs))

For lr = 1e-05 and epochs = 5
Epoch completed 1
Train Loss: {9.8467}
Evaluation mean APK : 0.11890357142857143
Epoch completed 2
Train Loss: {9.1809}
Evaluation mean APK : 0.11893134920634921
Epoch completed 3
Train Loss: {8.4699}
Evaluation mean APK : 0.1285690476190476
Epoch completed 4
Train Loss: {7.7365}
Evaluation mean APK : 0.1356357142857143
Epoch completed 5
Train Loss: {7.0080}
Evaluation mean APK : 0.13773650793650793
For lr = 0.0001 and epochs = 5
Epoch completed 1
Train Loss: {9.9622}
Evaluation mean APK : 0.1393373015873016
Epoch completed 2
Train Loss: {3.2316}
Evaluation mean APK : 0.20319603174603176
Epoch completed 3
Train Loss: {2.2665}
Evaluation mean APK : 0.2766011904761905
Epoch completed 4
Train Loss: {2.0511}
Evaluation mean APK : 0.3423976190476191
Epoch completed 5
Train Loss: {1.8953}
Evaluation mean APK : 0.4195912698412698
For lr = 0.001 and epochs = 5
Epoch completed 1
Train Loss: {2.7752}
Evaluation mean APK : 0.6778571428571429
Epoch completed 2
Train L