In [19]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm


torch.manual_seed(1234)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Data Loader Function

In [3]:
def load_data_100k(path='./', delimiter='\t'):
    
    # Get the name of movies for inference.
    movies_names = np.genfromtxt(path + 'u.item', dtype='str', usecols=1, delimiter='|', encoding='ISO-8859-1')
    
    # Train and test data contains: 
    # user id, movie id, rating and timestamp
    train = np.loadtxt(path + 'u1.base', skiprows=0, delimiter=delimiter).astype('int32')
    test = np.loadtxt(path + 'u1.test', skiprows=0, delimiter=delimiter).astype('int32')
    total = np.concatenate((train, test), axis=0)

    users_num = np.unique(total[:,0]).size
    movies_num = np.unique(total[:,1]).size
    train_ratings_num = train.shape[0]
    test_ratings_num = test.shape[0]
    
    # Create matrices of shape (num of movies, num of users)
    train_ratings = np.zeros((movies_num, users_num), dtype='float32')
    test_ratings = np.zeros((movies_num, users_num), dtype='float32')
        
    # In single matrix, maps each user's movie rating.
    # i.e. train[mov_id, user_id] = rating
    # Timestamp is discarded.
    for i in range(train_ratings_num):
        train_ratings[train[i, 1] - 1, train[i, 0] - 1] = train[i, 2]

    for i in range(test_ratings_num):
        test_ratings[test[i, 1] - 1, test[i, 0] - 1] = test[i, 2]
            
    # Masks indicating non-zero entries
    # Used for calculating loss because not all movies rated by every user.
    train_masks = np.greater(train_ratings, 1e-12).astype('float32')  
    test_masks = np.greater(test_ratings, 1e-12).astype('float32')
    
    print('num of users: {}'.format(users_num))
    print('num of movies: {}'.format(movies_num))
    print('num of training ratings: {}'.format(train_ratings_num))
    print('num of test ratings: {}'.format(test_ratings_num))

    return movies_names, movies_num, users_num, train_ratings, train_masks, test_ratings, test_masks

# Load Data

In [7]:
data_path = '../data/raw/ml_100k/'

In [8]:
# Data Load
try:
    movies_names, movies_num, users_num, train_ratings, train_masks, \
        test_ratings, test_masks = load_data_100k(path=data_path, delimiter='\t')
except Exception as e:
    print('Error: Unable to load data', e)

num of users: 943
num of movies: 1682
num of training ratings: 80000
num of test ratings: 20000


# Set hyperparameters

In [9]:
n_hid = 500 # size of hidden layers
n_dim = 5 # inner AE embedding size
n_layers = 2 # number of hidden layers
gk_size = 3 # width=height of kernel for convolution

# Hyperparameters to tune for specific case
max_epoch_p = 500 # max number of epochs for pretraining (local kernel)
max_epoch_f = 1000 # max number of epochs for finetuning (global kernel)
patience_p = 5 # number of consecutive rounds of early stopping condition before actual stop for pretraining
patience_f = 10 # and finetuning
tol_p = 1e-4 # minimum threshold for the difference between consecutive values of train rmse, used for early stopping, for pretraining
tol_f = 1e-5 # and finetuning
lambda_2 = 20. # regularisation of number or parameters
lambda_s = 0.006 # regularisation of sparsity of the final matrix
dot_scale = 1 # dot product weight for global kernel

# Network Functions

In [12]:
def local_kernel(u, v):
    dist = torch.norm(u - v, p=2, dim=2)
    hat = torch.clamp(1. - dist ** 2, min=0.)
    return hat


class KernelLayer(nn.Module):
    """
    Class for using local kernel. Local kernel maps higher dimensional space to lower one.
    """
    def __init__(self, n_in, n_hid, n_dim, lambda_s, lambda_2, activation=nn.Sigmoid()):
        super().__init__()
        self.W = nn.Parameter(torch.randn(n_in, n_hid))
        self.u = nn.Parameter(torch.randn(n_in, 1, n_dim))
        self.v = nn.Parameter(torch.randn(1, n_hid, n_dim))
        self.b = nn.Parameter(torch.randn(n_hid))

        self.lambda_s = lambda_s
        self.lambda_2 = lambda_2

        nn.init.xavier_uniform_(self.W, gain=torch.nn.init.calculate_gain("relu"))
        nn.init.xavier_uniform_(self.u, gain=torch.nn.init.calculate_gain("relu"))
        nn.init.xavier_uniform_(self.v, gain=torch.nn.init.calculate_gain("relu"))
        nn.init.zeros_(self.b)
        self.activation = activation

    def forward(self, x):
        """
        Check section 2.1 of paper for details.
        
        Returns:
        y: all movies' ratings of users * local kernelised weight matrix 
        with non-linear activation function. 
        
        second parameter: sum of losses.
        """
        
        w_hat = local_kernel(self.u, self.v)

        sparse_reg = torch.nn.functional.mse_loss(w_hat, torch.zeros_like(w_hat))
        sparse_reg_term = self.lambda_s * sparse_reg

        l2_reg = torch.nn.functional.mse_loss(self.W, torch.zeros_like(self.W))
        l2_reg_term = self.lambda_2 * l2_reg

        W_eff = self.W * w_hat  # Local kernelised weight matrix
        y = torch.matmul(x, W_eff) + self.b
        y = self.activation(y)

        return y, sparse_reg_term + l2_reg_term

    
class KernelNet(nn.Module):
    """
    Network that consists of local kernel layers, and propagates input to/from them.
    """
    
    def __init__(self, n_u, n_hid, n_dim, n_layers, lambda_s, lambda_2):
        super().__init__()
        layers = []
        
        for i in range(n_layers):
            if i == 0:
                layers.append(KernelLayer(n_u, n_hid, n_dim, lambda_s, lambda_2))
            else:
                layers.append(KernelLayer(n_hid, n_hid, n_dim, lambda_s, lambda_2))
                
        layers.append(KernelLayer(n_hid, n_u, n_dim, lambda_s, lambda_2, activation=nn.Identity()))
        self.layers = nn.ModuleList(layers)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        total_reg = None
        for i, layer in enumerate(self.layers):
            x, reg = layer(x)
            if i < len(self.layers)-1:
                x = self.dropout(x)
        if total_reg is None:
            total_reg = reg
        else:
            total_reg += reg
            
        return x, total_reg

In [13]:
class CompleteNet(nn.Module):
    """
    Final network which consists of local and global kernels.
    Global kernel is a CNN convolution layer, which gather features from all dimensions of the input.
    """
    
    def __init__(self, kernel_net, n_u, n_m, n_hid, n_dim, n_layers, lambda_s, lambda_2, gk_size, dot_scale):
        super().__init__()
        self.gk_size = gk_size
        self.dot_scale = dot_scale
        self.local_kernel_net = kernel_net
        self.global_kernel_net = KernelNet(n_u, n_hid, n_dim, n_layers, lambda_s, lambda_2)
        self.conv_kernel = torch.nn.Parameter(torch.randn(n_m, gk_size ** 2) * 0.1)
        nn.init.xavier_uniform_(self.conv_kernel, gain=torch.nn.init.calculate_gain("relu"))
      
    def forward(self, train_r):
        x, _ = self.local_kernel_net(train_r)
        gk = self.global_kernel(x, self.gk_size, self.dot_scale)
        x = self.global_conv(train_r, gk)
        x, global_reg_loss = self.global_kernel_net(x)
        return x, global_reg_loss

    def global_kernel(self, input, gk_size, dot_scale):
        avg_pooling = torch.mean(input, dim=1)  # Item (axis=1) based average pooling
        avg_pooling = avg_pooling.view(1, -1)

        gk = torch.matmul(avg_pooling, self.conv_kernel) * dot_scale  # Scaled dot product
        gk = gk.view(1, 1, gk_size, gk_size)

        return gk

    def global_conv(self, input, W):
        input = input.unsqueeze(0).unsqueeze(0)
        conv2d = nn.LeakyReLU()(F.conv2d(input, W, stride=1, padding=1))
        return conv2d.squeeze(0).squeeze(0)

    
class Loss(nn.Module):
    def forward(self, pred_p, reg_loss, train_m, train_r):
        # L2 loss
        diff = train_m * (train_r - pred_p)
        sqE = torch.nn.functional.mse_loss(diff, torch.zeros_like(diff))
        loss_p = sqE + reg_loss
        return loss_p

# Network Instantiation

## Pre-training

In [9]:
model = KernelNet(users_num, n_hid, n_dim, n_layers, lambda_s, lambda_2).double().to(device)

## Fine-tuning

In [10]:
complete_model = CompleteNet(model, users_num, movies_num, n_hid, n_dim, n_layers, lambda_s, lambda_2, gk_size, dot_scale).double().to(device)

# Evaluation code

In [11]:
def dcg_k(score_label, k):
    dcg, i = 0., 0
    for s in score_label:
        if i < k:
            dcg += (2**s[1]-1) / np.log2(2+i)
            i += 1
    return dcg

In [12]:
def ndcg_k(y_hat, y, k):
    score_label = np.stack([y_hat, y], axis=1).tolist()
    score_label = sorted(score_label, key=lambda d:d[0], reverse=True)
    score_label_ = sorted(score_label, key=lambda d:d[1], reverse=True)
    norm, i = 0., 0
    for s in score_label_:
        if i < k:
            norm += (2**s[1]-1) / np.log2(2+i)
            i += 1
    dcg = dcg_k(score_label, k)
    return dcg / norm

In [13]:
def call_ndcg(y_hat, y):
    ndcg_sum, num = 0, 0
    y_hat, y = y_hat.T, y.T
    n_users = y.shape[0]

    for i in range(n_users):
        y_hat_i = y_hat[i][np.where(y[i])]
        y_i = y[i][np.where(y[i])]

        if y_i.shape[0] < 2:
            continue

        ndcg_sum += ndcg_k(y_hat_i, y_i, y_i.shape[0])  # user-wise calculation
        num += 1

    return ndcg_sum / num

# Pretraining and testing local kernel network.

In [14]:
best_rmse_ep, best_mae_ep, best_ndcg_ep = 0, 0, 0
best_rmse, best_mae, best_ndcg = float("inf"), float("inf"), 0

# Pre-Training
optimizer = torch.optim.AdamW(complete_model.local_kernel_net.parameters(), lr=0.001)


def closure():
    """
    This function is a training step.
    Sends whole training data to model and calculates loss.
    """
    optimizer.zero_grad()
    
    # Send train ratings and masks to GPU
    x = torch.Tensor(train_ratings).double().to(device)
    m = torch.Tensor(train_masks).double().to(device)
    
    # Turn on the gradients (train mode)
    complete_model.local_kernel_net.train()
        
    # Get predicted ratings and calculate the loss
    pred, reg = complete_model.local_kernel_net(x)
    loss = Loss().to(device)(pred, reg, m, x)
    loss.backward()
    
    return loss


last_rmse = np.inf
counter = 0 # counter for early stopping

for i in tqdm(range(max_epoch_p)):
    
    # Train model
    optimizer.step(closure)
    
    # Validation
    complete_model.local_kernel_net.eval()

    pre, _ = model(torch.Tensor(train_ratings).double().to(device))

    pre = pre.float().cpu().detach().numpy()

    # Calculate loss with masks, that excludes entries with zeros.
    error = (test_masks * (np.clip(pre, 1., 5.) - test_ratings) ** 2).sum() / test_masks.sum()  # test error
    test_rmse = np.sqrt(error)

    error_train = (train_masks * (np.clip(pre, 1., 5.) - train_ratings) ** 2).sum() / train_masks.sum()  # train error
    train_rmse = np.sqrt(error_train)

    # Update counter for early stopping
    counter = counter + 1 if last_rmse - train_rmse < tol_p else 0

    last_rmse = train_rmse

    if patience_p == counter:
        print("EARLY STOPPING...")
        print('Epoch:', i+1, 'test rmse:', test_rmse, 'train rmse:', train_rmse)
        break

    if i % 50 != 0:
        continue
        
    print('Epoch:', i, 'test rmse:', test_rmse, 'train rmse:', train_rmse)

  1%|          | 5/500 [00:01<02:08,  3.85it/s]

Epoch: 0 test rmse: 2.7561524 train rmse: 2.7305737


  5%|▌         | 26/500 [00:02<00:44, 10.76it/s]

EARLY STOPPING...
Epoch: 27 test rmse: 1.147558 train rmse: 1.1200588





# Fine-tune the model with Global kernel.

In [15]:
# Fine-Tuning
optimizer = torch.optim.AdamW(complete_model.parameters(), lr=0.001)


def closure():
    optimizer.zero_grad()
    x = torch.Tensor(train_ratings).double().to(device)
    m = torch.Tensor(train_masks).double().to(device)
    complete_model.train()
    pred, reg = complete_model(x)
    loss = Loss().to(device)(pred, reg, m, x)
    loss.backward()
    return loss


last_rmse = np.inf
counter = 0

for i in tqdm(range(max_epoch_f)):
    optimizer.step(closure)
    complete_model.eval()

    pre, _ = complete_model(torch.Tensor(train_ratings).double().to(device))

    pre = pre.float().cpu().detach().numpy()
    
    error = (test_masks * (np.clip(pre, 1., 5.) - test_ratings) ** 2).sum() / test_masks.sum()  # test error
    test_rmse = np.sqrt(error)

    error_train = (train_masks * (np.clip(pre, 1., 5.) - train_ratings) ** 2).sum() / train_masks.sum()  # train error
    train_rmse = np.sqrt(error_train)

    test_mae = (test_masks * np.abs(np.clip(pre, 1., 5.) - test_ratings)).sum() / test_masks.sum()
    train_mae = (train_masks * np.abs(np.clip(pre, 1., 5.) - train_ratings)).sum() / train_masks.sum()

    test_ndcg = call_ndcg(np.clip(pre, 1., 5.), test_ratings)
    train_ndcg = call_ndcg(np.clip(pre, 1., 5.), train_ratings)

    if test_rmse < best_rmse:
        best_rmse = test_rmse
        best_rmse_ep = i+1

    if test_mae < best_mae:
        best_mae = test_mae
        best_mae_ep = i+1

    if best_ndcg < test_ndcg:
        best_ndcg = test_ndcg
        best_ndcg_ep = i+1

    if last_rmse-train_rmse < tol_f:
        counter += 1
    else:
        counter = 0

    last_rmse = train_rmse

    if patience_f == counter:
        print("EARLY STOPPING...")
        print('Epoch:', i+1, 'test rmse:', test_rmse, 'test mae:', test_mae, 'test ndcg:', test_ndcg)
        print('Epoch:', i+1, 'train rmse:', train_rmse, 'train mae:', train_mae, 'train ndcg:', train_ndcg)
        break


    if i % 50 != 0:
        continue

    print('Epoch:', i, 'test rmse:', test_rmse, 'test mae:', test_mae, 'test ndcg:', test_ndcg)
    print('Epoch:', i, 'train rmse:', train_rmse, 'train mae:', train_mae, 'train ndcg:', train_ndcg)

  0%|          | 1/1000 [00:04<1:08:48,  4.13s/it]

Epoch: 0 test rmse: 2.7452645 test mae: 2.4947066 test ndcg: 0.8314637703951842
Epoch: 0 train rmse: 2.7253463 train mae: 2.4872675 train ndcg: 0.8321904513528707


  5%|▌         | 51/1000 [00:49<15:02,  1.05it/s] 

Epoch: 50 test rmse: 1.0374023 test mae: 0.83987576 test ndcg: 0.8668892314651708
Epoch: 50 train rmse: 0.99231714 train mae: 0.8031081 train ndcg: 0.8712929394509384


 10%|█         | 101/1000 [01:35<13:31,  1.11it/s]

Epoch: 100 test rmse: 0.961977 test mae: 0.76432824 test ndcg: 0.8839483017462775
Epoch: 100 train rmse: 0.908272 train mae: 0.7221081 train ndcg: 0.8881494991480695


 15%|█▌        | 151/1000 [02:20<13:06,  1.08it/s]

Epoch: 150 test rmse: 0.94871056 test mae: 0.7516293 test ndcg: 0.8866630888436046
Epoch: 150 train rmse: 0.8810991 train mae: 0.69875777 train ndcg: 0.9001709883863989


 20%|██        | 201/1000 [03:06<11:56,  1.12it/s]

Epoch: 200 test rmse: 0.9382955 test mae: 0.74308777 test ndcg: 0.8906907019947309
Epoch: 200 train rmse: 0.86162966 train mae: 0.683177 train ndcg: 0.9094092465911764


 25%|██▌       | 251/1000 [03:52<11:29,  1.09it/s]

Epoch: 250 test rmse: 0.92899764 test mae: 0.7343174 test ndcg: 0.891988095541211
Epoch: 250 train rmse: 0.8476789 train mae: 0.67050797 train ndcg: 0.915443705229339


 30%|███       | 301/1000 [04:38<10:30,  1.11it/s]

Epoch: 300 test rmse: 0.92117256 test mae: 0.7258345 test ndcg: 0.8950496996769294
Epoch: 300 train rmse: 0.8376732 train mae: 0.6608189 train ndcg: 0.9184835424161315


 35%|███▌      | 351/1000 [05:23<09:50,  1.10it/s]

Epoch: 350 test rmse: 0.9160043 test mae: 0.7216417 test ndcg: 0.8963276096465423
Epoch: 350 train rmse: 0.83250195 train mae: 0.6568198 train ndcg: 0.9200619219281913


 40%|████      | 401/1000 [06:09<09:01,  1.11it/s]

Epoch: 400 test rmse: 0.91297907 test mae: 0.7152972 test ndcg: 0.8981369804396268
Epoch: 400 train rmse: 0.82926136 train mae: 0.6502835 train ndcg: 0.9202063810733202


 45%|████▌     | 451/1000 [06:55<08:17,  1.10it/s]

Epoch: 450 test rmse: 0.9085523 test mae: 0.71358335 test ndcg: 0.898317289506791
Epoch: 450 train rmse: 0.82503146 train mae: 0.6485659 train ndcg: 0.9216268945715564


 50%|█████     | 501/1000 [07:41<07:33,  1.10it/s]

Epoch: 500 test rmse: 0.9066267 test mae: 0.711418 test ndcg: 0.8993780922437968
Epoch: 500 train rmse: 0.8210894 train mae: 0.6451426 train ndcg: 0.9222166598621259


 55%|█████▌    | 551/1000 [08:27<06:52,  1.09it/s]

Epoch: 550 test rmse: 0.90560764 test mae: 0.70928246 test ndcg: 0.899179906904189
Epoch: 550 train rmse: 0.8187376 train mae: 0.6417172 train ndcg: 0.9227702650772963


 60%|██████    | 601/1000 [09:13<06:06,  1.09it/s]

Epoch: 600 test rmse: 0.90241504 test mae: 0.7079589 test ndcg: 0.8986305180895459
Epoch: 600 train rmse: 0.8144908 train mae: 0.6395276 train ndcg: 0.9249649231794719


 65%|██████▌   | 651/1000 [09:59<05:18,  1.09it/s]

Epoch: 650 test rmse: 0.9006282 test mae: 0.7059668 test ndcg: 0.9000956773819535
Epoch: 650 train rmse: 0.8115829 train mae: 0.6363241 train ndcg: 0.9253851996660867


 70%|███████   | 701/1000 [10:45<04:35,  1.09it/s]

Epoch: 700 test rmse: 0.9003111 test mae: 0.7052751 test ndcg: 0.9000724895636018
Epoch: 700 train rmse: 0.80879545 train mae: 0.6335227 train ndcg: 0.9262234100156218


 75%|███████▌  | 751/1000 [11:30<03:51,  1.08it/s]

Epoch: 750 test rmse: 0.89811337 test mae: 0.7043304 test ndcg: 0.9010473509476117
Epoch: 750 train rmse: 0.80543184 train mae: 0.63187903 train ndcg: 0.9273877719860271


 80%|████████  | 801/1000 [12:16<03:02,  1.09it/s]

Epoch: 800 test rmse: 0.8973036 test mae: 0.7028618 test ndcg: 0.9003603367416186
Epoch: 800 train rmse: 0.8036924 train mae: 0.6298713 train ndcg: 0.9282800801362534


 85%|████████▌ | 851/1000 [13:02<02:14,  1.10it/s]

Epoch: 850 test rmse: 0.89720446 test mae: 0.7028789 test ndcg: 0.9004019083780251
Epoch: 850 train rmse: 0.8005888 train mae: 0.62754965 train ndcg: 0.9285487188288548


 90%|█████████ | 901/1000 [13:48<01:30,  1.09it/s]

Epoch: 900 test rmse: 0.8973619 test mae: 0.7028559 test ndcg: 0.9013640783083982
Epoch: 900 train rmse: 0.79779553 train mae: 0.6250428 train ndcg: 0.9294491400553294


 95%|█████████▌| 951/1000 [14:34<00:44,  1.10it/s]

Epoch: 950 test rmse: 0.89637375 test mae: 0.70260984 test ndcg: 0.9004420351234291
Epoch: 950 train rmse: 0.7948206 train mae: 0.62368155 train ndcg: 0.9309851371228353


100%|██████████| 1000/1000 [15:19<00:00,  1.09it/s]


In [16]:
# Final result
print('Epoch:', best_rmse_ep, ' best rmse:', best_rmse)
print('Epoch:', best_mae_ep, ' best mae:', best_mae)
print('Epoch:', best_ndcg_ep, ' best ndcg:', best_ndcg)

Epoch: 999  best rmse: 0.89447355
Epoch: 1000  best mae: 0.70042264
Epoch: 998  best ndcg: 0.9036142702502777


# Save model

In [23]:
MODEL_PATH = "../models/"

# Save local and full models.
# torch.save(model.state_dict(), MODEL_PATH + 'han_local_kernel.pt')
# torch.save(complete_model.state_dict(), MODEL_PATH + 'han_complete_model.pt')

# Inference

In [24]:
# Load local kernel model
model = KernelNet(users_num, n_hid, n_dim, n_layers, lambda_s, lambda_2).double().to(device)
model.load_state_dict(torch.load(MODEL_PATH + 'han_local_kernel.pt', map_location=device))

# Load global model
complete_model = CompleteNet(model, users_num, movies_num, n_hid, n_dim, n_layers, lambda_s, lambda_2, gk_size, dot_scale).double().to(device)
complete_model.load_state_dict(torch.load(MODEL_PATH + 'han_complete_model.pt', map_location=device))
complete_model.eval()

CompleteNet(
  (local_kernel_net): KernelNet(
    (layers): ModuleList(
      (0-1): 2 x KernelLayer(
        (activation): Sigmoid()
      )
      (2): KernelLayer(
        (activation): Identity()
      )
    )
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (global_kernel_net): KernelNet(
    (layers): ModuleList(
      (0-1): 2 x KernelLayer(
        (activation): Sigmoid()
      )
      (2): KernelLayer(
        (activation): Identity()
      )
    )
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [22]:
user_ratings = torch.zeros(movies_num, users_num).double().to(device)

print("Input two integeres: index of movie and it's rating (1 to 5) separated by a single space.")
print("To stop writing, press ENTER.")

watched_movies = set()
while True:
    user_input = input()
    
    if user_input == '':
        break
    
    movie_index, rating = list(map(int, user_input.split()))
    watched_movies.add(movie_index)
    user_ratings[movie_index, 0] = rating

pred = complete_model(user_ratings)[0].to(device)

# Output top 10 movies with highest rating for this user.
i = 0
while True:
    if i == 10:
        break
    
    recommended_movie = pred[:, 0].argmax().item()
    if recommended_movie not in watched_movies:
        print(f'{i + 1}. {movies_names[recommended_movie]}')
        i += 1
        
    pred[recommended_movie] = 0

Input two integeres: index of movie and it's rating (1 to 5) separated by a single space.
To stop writing, press ENTER.
1. Get Shorty (1995)
2. Copycat (1995)
3. Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
4. Twelve Monkeys (1995)
5. Babe (1995)
6. Dead Man Walking (1995)
7. Richard III (1995)
8. Seven (Se7en) (1995)
9. Usual Suspects, The (1995)
10. Mighty Aphrodite (1995)
