In [1]:
import numpy as np
import torch
import torch.nn as nn
from time import time
import torch.nn.functional as F
torch.manual_seed(1234)
np.random.seed(1234)

In [2]:
train = np.loadtxt('ml-100k/u1.base', delimiter='\t').astype('int32')
test = np.loadtxt('ml-100k/u1.test', delimiter='\t').astype('int32')
total = np.concatenate((train, test), axis=0)

n_u = np.unique(total[:, 0]).size  # num of users
n_m = np.unique(total[:, 1]).size  # num of movies
n_train = train.shape[0]  # num of training ratings
n_test = test.shape[0]  # num of test ratings

train_r = np.zeros((n_m, n_u), dtype='float32')
test_r = np.zeros((n_m, n_u), dtype='float32')

for i in range(n_train):
    train_r[train[i, 1] - 1, train[i, 0] - 1] = train[i, 2]

for i in range(n_test):
    test_r[test[i, 1] - 1, test[i, 0] - 1] = test[i, 2]

train_m = np.greater(train_r, 1e-12).astype('float32')  # masks indicating non-zero entries
test_m = np.greater(test_r, 1e-12).astype('float32')

print('data matrix loaded')
print('num of users: {}'.format(n_u))
print('num of movies: {}'.format(n_m))
print('num of training ratings: {}'.format(n_train))
print('num of test ratings: {}'.format(n_test))

data matrix loaded
num of users: 943
num of movies: 1682
num of training ratings: 80000
num of test ratings: 20000


In [3]:
# Common hyperparameter settings
n_hid = 500
n_dim = 5
n_layers = 2
gk_size = 3

In [4]:
# Hyperparameters to tune for specific case
max_epoch_p = 501  # max number of epochs for pretraining
max_epoch_f = 1001  # max number of epochs for finetuning
patience_p = 5  # number of consecutive rounds of early stopping condition before actual stop for pretraining
patience_f = 10  # and finetuning
tol_p = 1e-4  # minimum threshold for the difference between consecutive values of train rmse, used for early stopping, for pretraining
tol_f = 1e-5  # and finetuning
lambda_2 = 20.  # regularisation of number or parameters
lambda_s = 0.006  # regularisation of sparsity of the final matrix
dot_scale = 1  # dot product weight for global kernel

In [5]:
def local_kernel(u, v):
    dist = torch.norm(u - v, p=2, dim=2)
    hat = torch.clamp(1. - dist ** 2, min=0.)
    return hat

In [6]:
class KernelLayer(nn.Module):
    def __init__(self, n_in, n_hid, n_dim, lambda_s, lambda_2, activation=nn.Sigmoid()):
        super().__init__()
        self.W = nn.Parameter(torch.randn(n_in, n_hid))
        self.u = nn.Parameter(torch.randn(n_in, 1, n_dim))
        self.v = nn.Parameter(torch.randn(1, n_hid, n_dim))
        self.b = nn.Parameter(torch.randn(n_hid))

        self.lambda_s = lambda_s
        self.lambda_2 = lambda_2

        nn.init.xavier_uniform_(self.W, gain=torch.nn.init.calculate_gain("relu"))
        nn.init.xavier_uniform_(self.u, gain=torch.nn.init.calculate_gain("relu"))
        nn.init.xavier_uniform_(self.v, gain=torch.nn.init.calculate_gain("relu"))
        nn.init.zeros_(self.b)
        self.activation = activation

    def forward(self, x):
        w_hat = local_kernel(self.u, self.v)

        sparse_reg = torch.nn.functional.mse_loss(w_hat, torch.zeros_like(w_hat))
        sparse_reg_term = self.lambda_s * sparse_reg

        l2_reg = torch.nn.functional.mse_loss(self.W, torch.zeros_like(self.W))
        l2_reg_term = self.lambda_2 * l2_reg

        W_eff = self.W * w_hat  # Local kernelised weight matrix
        y = torch.matmul(x, W_eff) + self.b
        y = self.activation(y)

        return y, sparse_reg_term + l2_reg_term

In [7]:
class KernelNet(nn.Module):
    def __init__(self, n_u, n_hid, n_dim, n_layers, lambda_s, lambda_2):
        super().__init__()
        layers = []
        for i in range(n_layers):
            if i == 0:
                layers.append(KernelLayer(n_u, n_hid, n_dim, lambda_s, lambda_2))
            else:
                layers.append(KernelLayer(n_hid, n_hid, n_dim, lambda_s, lambda_2))
        layers.append(KernelLayer(n_hid, n_u, n_dim, lambda_s, lambda_2, activation=nn.Identity()))
        self.layers = nn.ModuleList(layers)
        self.dropout = nn.Dropout(0.33)

    def forward(self, x):
        total_reg = None
        for i, layer in enumerate(self.layers):
            x, reg = layer(x)
            if i < len(self.layers) - 1:
                x = self.dropout(x)
            if total_reg is None:
                total_reg = reg
            else:
                total_reg += reg
        return x, total_reg

In [8]:
class CompleteNet(nn.Module):
    def __init__(self, kernel_net, n_u, n_m, n_hid, n_dim, n_layers, lambda_s, lambda_2, gk_size, dot_scale):
        super().__init__()
        self.gk_size = gk_size
        self.dot_scale = dot_scale
        self.local_kernel_net = kernel_net
        self.conv_kernel = torch.nn.Parameter(torch.randn(n_m, gk_size ** 2) * 0.1)
        nn.init.xavier_uniform_(self.conv_kernel, gain=torch.nn.init.calculate_gain('relu'))

    def forward(self, x, x_local):
        gk = self.global_kernel(x_local, self.gk_size, self.dot_scale)
        x = self.global_conv(x, gk)
        x, global_reg_loss = self.local_kernel_net(x)
        return x, global_reg_loss

    def global_kernel(self, input, gk_size, dot_scale):
        avg_pooling = torch.mean(input, dim=1)  # Item (axis=1) based average pooling
        avg_pooling = avg_pooling.view(1, -1)

        gk = torch.matmul(avg_pooling, self.conv_kernel) * dot_scale  # Scaled dot product
        gk = gk.view(1, 1, gk_size, gk_size)

        return gk

    def global_conv(self, input, W):
        input = input.unsqueeze(0).unsqueeze(0)
        conv2d = nn.LeakyReLU()(F.conv2d(input, W, stride=1, padding=1))
        return conv2d.squeeze(0).squeeze(0)


class Loss(nn.Module):
    def forward(self, pred_p, reg_loss, train_m, train_r):
        # L2 loss
        diff = train_m * (train_r - pred_p)
        sqE = torch.nn.functional.mse_loss(diff, torch.zeros_like(diff))
        loss_p = sqE + reg_loss
        return loss_p

In [9]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
model = KernelNet(n_u, n_hid, n_dim, n_layers, lambda_s, lambda_2).float().to(device)

In [10]:
complete_model = CompleteNet(model, n_u, n_m, n_hid, n_dim, n_layers, lambda_s, lambda_2, gk_size,
                             dot_scale).float().to(device)

In [11]:
best_rmse_ep, best_mae_ep, best_ndcg_ep = 0, 0, 0
best_rmse, best_mae, best_ndcg = float('inf'), float('inf'), 0

time_cumulative = 0
tic = time()

# Pre-Training
optimizer = torch.optim.AdamW(complete_model.local_kernel_net.parameters(), lr=0.001)


def closure():
    optimizer.zero_grad()
    x = torch.Tensor(train_r).float().to(device)
    m = torch.Tensor(train_m).float().to(device)
    complete_model.local_kernel_net.train()
    pred, reg = complete_model.local_kernel_net(x)
    loss = Loss().to(device)(pred, reg, m, x)
    loss.backward()
    return loss


last_rmse = np.inf
counter = 0

for i in range(max_epoch_p):
    optimizer.step(closure)
    complete_model.local_kernel_net.eval()
    t = time() - tic
    time_cumulative += t

    pre, _ = model(torch.Tensor(train_r).float().to(device))

    pre = pre.float().cpu().detach().numpy()

    error = (test_m * (np.clip(pre, 1., 5.) - test_r) ** 2).sum() / test_m.sum()  # test error
    test_rmse = np.sqrt(error)

    error_train = (train_m * (np.clip(pre, 1., 5.) - train_r) ** 2).sum() / train_m.sum()  # train error
    train_rmse = np.sqrt(error_train)

    if last_rmse - train_rmse < tol_p:
        counter += 1
    else:
        counter = 0

    last_rmse = train_rmse

    if patience_p == counter:
        print('PRE-TRAINING')
        print('Epoch:', i + 1, 'test rmse:', test_rmse, 'train rmse:', train_rmse)
        print('Time:', t, 'seconds')
        print('Time cumulative:', time_cumulative, 'seconds')
        break

    if i % 50 == 0:
        print('PRE-TRAINING')
        print('Epoch:', i, 'test rmse:', test_rmse, 'train rmse:', train_rmse)
        print('Time:', t, 'seconds')
        print('Time cumulative:', time_cumulative, 'seconds')

PRE-TRAINING
Epoch: 0 test rmse: 2.7551959 train rmse: 2.7292118
Time: 0.7320170402526855 seconds
Time cumulative: 0.7320170402526855 seconds
PRE-TRAINING
Epoch: 50 test rmse: 1.0251902 train rmse: 0.9841468
Time: 2.8504528999328613 seconds
Time cumulative: 90.85948324203491 seconds
PRE-TRAINING
Epoch: 100 test rmse: 0.963181 train rmse: 0.91872776
Time: 4.979596853256226 seconds
Time cumulative: 287.83330845832825 seconds
PRE-TRAINING
Epoch: 127 test rmse: 0.95846754 train rmse: 0.9138297
Time: 6.0169031620025635 seconds
Time cumulative: 431.11435651779175 seconds


In [12]:
# Fine-Tuning

train_r_local = np.clip(pre, 1., 5.)

optimizer = torch.optim.AdamW(complete_model.parameters(), lr=0.001)


def closure():
    optimizer.zero_grad()
    x = torch.Tensor(train_r).float().to(device)
    x_local = torch.Tensor(train_r_local).float().to(device)
    m = torch.Tensor(train_m).float().to(device)
    complete_model.train()
    pred, reg = complete_model(x, x_local)
    loss = Loss().to(device)(pred, reg, m, x)
    loss.backward()
    return loss


last_rmse = np.inf
counter = 0

for i in range(max_epoch_f):
    optimizer.step(closure)
    complete_model.eval()
    t = time() - tic
    time_cumulative += t

    pre, _ = complete_model(torch.Tensor(train_r).float().to(device), torch.Tensor(train_r_local).float().to(device))

    pre = pre.float().cpu().detach().numpy()

    error = (test_m * (np.clip(pre, 1., 5.) - test_r) ** 2).sum() / test_m.sum()  # test error
    test_rmse = np.sqrt(error)

    error_train = (train_m * (np.clip(pre, 1., 5.) - train_r) ** 2).sum() / train_m.sum()  # train error
    train_rmse = np.sqrt(error_train)

    test_mae = (test_m * np.abs(np.clip(pre, 1., 5.) - test_r)).sum() / test_m.sum()
    train_mae = (train_m * np.abs(np.clip(pre, 1., 5.) - train_r)).sum() / train_m.sum()

    if test_rmse < best_rmse:
        best_rmse = test_rmse
        best_rmse_ep = i + 1

    if test_mae < best_mae:
        best_mae = test_mae
        best_mae_ep = i + 1

    if last_rmse - train_rmse < tol_f:
        counter += 1
    else:
        counter = 0

    last_rmse = train_rmse

    if patience_f == counter:
        print('FINE-TUNING')
        print('Epoch:', i + 1, 'test rmse:', test_rmse, 'test mae:', test_mae)
        print('Epoch:', i + 1, 'train rmse:', train_rmse, 'train mae:', train_mae)
        print('Time:', t, 'seconds')
        break

    if i % 50 == 0:
        print('FINE-TUNING')
        print('Epoch:', i, 'test rmse:', test_rmse, 'test mae:', test_mae)
        print('Epoch:', i, 'train rmse:', train_rmse, 'train mae:', train_mae)
        print('Time:', t, 'seconds')

FINE-TUNING
Epoch: 0 test rmse: 1.0568719 test mae: 0.85774463
Epoch: 0 train rmse: 1.0274782 train mae: 0.8347582
Time: 6.147149085998535 seconds
FINE-TUNING
Epoch: 50 test rmse: 0.91829413 test mae: 0.7246843
Epoch: 50 train rmse: 0.8555746 train mae: 0.6759926
Time: 8.51752495765686 seconds
FINE-TUNING
Epoch: 100 test rmse: 0.91115785 test mae: 0.71845055
Epoch: 100 train rmse: 0.8490566 train mae: 0.67007416
Time: 10.976081132888794 seconds
FINE-TUNING
Epoch: 150 test rmse: 0.9085041 test mae: 0.7159648
Epoch: 150 train rmse: 0.8445192 train mae: 0.6664536
Time: 13.349500179290771 seconds
FINE-TUNING
Epoch: 200 test rmse: 0.9089046 test mae: 0.71515864
Epoch: 200 train rmse: 0.8410914 train mae: 0.66259533
Time: 15.778004884719849 seconds
FINE-TUNING
Epoch: 250 test rmse: 0.9091276 test mae: 0.71467775
Epoch: 250 train rmse: 0.8394677 train mae: 0.66124177
Time: 18.275057077407837 seconds
FINE-TUNING
Epoch: 300 test rmse: 0.9089401 test mae: 0.7156781
Epoch: 300 train rmse: 0.83796

In [13]:
# Final result
print('Epoch:', best_rmse_ep, ' best rmse:', best_rmse)
print('Epoch:', best_mae_ep, ' best mae:', best_mae)

Epoch: 881  best rmse: 0.9054057
Epoch: 930  best mae: 0.712293
