## Training and Testing

Let's start by manually defining some neccesary parameters

In [1]:
import numpy as np
import torch
import os
import re
import scipy.sparse as sp
import multiprocessing
import torch.nn.functional as F

from time import time
from functools import partial
from utils.dataset import Data
from utils.metrics import ranklist_by_heapq, get_performance
# from utils.parser import parse_args
from ngcf import NGCF_BPR
from multiprocessing import Pool

you will notice that I import `NGCF_BPR` not simply `NGCF`. This is because there is another class in the `ngcf` module called `NGCF_BCE` which is coded to simply predict 1/0 interaction or not "*a la traditional classification*" problem.

In [2]:
use_cuda = torch.cuda.is_available()

cores = multiprocessing.cpu_count()

Ks = [10, 20]

data_path = "Data/toy_data/"
batch_size = 64
data_generator = Data(data_path, batch_size, val=False)
n_users = data_generator.n_users
n_items = data_generator.n_items

_, _, mean_adj = data_generator.get_adj_mat()
adjacency_matrix = mean_adj + sp.eye(mean_adj.shape[0])

emb_size = 12
layers = [12, 6]
node_dropout = 0.1
mess_dropout = [0.1]*len(layers)
regularization = 1e-5
lr = 0.01
n_fold = 10

pretrain = 0

print_every, eval_every, save_every = 1, 1, 10

n_users=1000, n_items=2000
n_interactions=30780
n_train=24228, n_test=6552, sparsity=0.01539
already load adj matrix (3000, 3000) 0.013494253158569336


In [3]:
model = NGCF_BPR(n_users, n_items, emb_size, adjacency_matrix, layers,
    node_dropout, mess_dropout, regularization, n_fold, batch_size)
if use_cuda: 
    model = model.cuda()

In [4]:
for n,p in model.named_parameters():
    if p.requires_grad: print(n)

embeddings_user
embeddings_item
W1.0.weight
W1.0.bias
W1.1.weight
W1.1.bias
W2.0.weight
W2.0.bias
W2.1.weight
W2.1.bias


And this is it really regarding the parameters of the model. We only need the user/item embeddings and then a series of linear layers (that we could refer as graph layers). The embeddings will be concatenated over rows, multiplied by the Laplacian matrix, and then passed through a the graph/linear layers recursively. 

Let's now move to the training phase. The training phase is your typical pytorch training function, with the exception that the output of the forward pass is already the [BPR](https://arxiv.org/pdf/1205.2618.pdf) loss. It goes this way:

In [5]:
def train(model, data_generator, optimizer):
    model.train()
    n_batch = data_generator.n_train // data_generator.batch_size + 1
    running_loss=0
    for _ in range(n_batch):
        # tuple (users, positive items, negative items)     
        u, i, j = data_generator.sample()
        optimizer.zero_grad()
        loss = model(u,i,j)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss

We have not talked about the BPR loss yet, so let's have a look. The definition in the [paper](https://arxiv.org/pdf/1905.08108.pdf) is:

$$
Loss = \sum_{(u,i,j) \in \mathcal{O}} -ln \big(\sigma(\hat{y}_{ui} - \hat{y}_{uj})\big) + \lambda ||\Theta||^{2}_{2}
$$

Where $\mathcal{O} = \{ (u,i,j)|(u,i) \in  R^{+}, (u,j) \in R^{-} \}$ is the set of training tuples with $R^{+}$ and $R^{-}$ corresponding to observed and unobserved interactions (aka positive and negative) respectively. $\sigma$ is the sigmoid function and $||\Theta|| = \{ \text{E}, \{ \textbf{W}^{l}_{1},\textbf{W}^{l}_{2} \}^{L}_{l=1}  \}$ are all training parameters. 

In pytorch:

In [6]:
def bpr_loss(self, u, i, j):
    # first term
    y_ui = torch.mul(u, i).sum(dim=1)
    y_uj = torch.mul(u, j).sum(dim=1)
    log_prob = (torch.log(torch.sigmoid(y_ui-y_uj))).mean()

    # regularization
    l2norm = (torch.sum(u**2)/2. + torch.sum(i**2)/2. + torch.sum(j**2)/2.).mean()
    l2reg  = reg*l2norm

    # Loss
    return -log_prob + l2reg

okay, so now we now how the training happens, let's move to the validation/testing. Here, we will first use the authors `early_stopping` function. I am sure there are more "pytorchian" ways of doing it, but this function is simple and does the job, so let's use it 

In [7]:
def early_stopping(log_value, best_value, stopping_step, expected_order='asc', patience=10):

    # better is higher or lower
    assert expected_order in ['asc', 'dec']

    if (expected_order == 'asc' and log_value >= best_value) or (expected_order == 'dec' and log_value <= best_value):
        stopping_step = 0
        best_value = log_value
    else:
        stopping_step += 1

    if stopping_step >= patience:
        print("Early stopping is trigger at step: {} log:{}".format(patience, log_value))
        should_stop = True
    else:
        should_stop = False

    return best_value, stopping_step, should_stop

Now let's see how we test on one user

In [8]:
def test_one_user(x):
    """
    x will be a zip object where the 1st element will be the user id and the 2nd
    will be the scores for all items in the dataset
    """
    
    u = x[0]
    rating = x[1]

    try:
        training_items = data_generator.train_items[u]
    except Exception:
        training_items = []

    # items that the user did interact with during testing
    user_pos_test = data_generator.test_set[u]
    all_items = set(range(data_generator.n_items))
    # test_items include negative items and  user_pos_test
    test_items = list(all_items - set(training_items))

    # and now we compute the metrics as described in the notebook Chapter03_metrics.ipynb.
    r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks)

    return get_performance(user_pos_test, r, auc, Ks)

And now that we know how to test in one user, let's do it for the whole dataset

In [9]:
def test(model, data_generator):

    result = {
        'precision': np.zeros(len(Ks)),
        'recall': np.zeros(len(Ks)),
        'ndcg': np.zeros(len(Ks)),
        'hit_ratio': np.zeros(len(Ks)),
        'auc': 0.
        }

    # here we can use larger batches
    u_batch_size = data_generator.batch_size * 2

    # test users are all users really
    test_users = list(data_generator.test_set.keys())
    n_test_users = len(test_users)
    n_user_batchs = n_test_users // u_batch_size + 1
    
    # n_test_items are normally all items
    n_test_items = data_generator.n_items

    count = 0
    p = Pool(cores)
    for u_batch_id in range(n_user_batchs):
        start = u_batch_id * u_batch_size
        end = (u_batch_id + 1) * u_batch_size

        user_batch = test_users[start: end]
        item_batch = np.arange(n_test_items)
    
        # ratings are simply the matrix multiplication of the graph embeddings. One option 
        # could be wrap this up into a sigmoid, to keep all between 0,1
        rate_batch  = torch.mm(model.g_embeddings_user, model.g_embeddings_item.t())

        # detach and to CPU so it can be parallelised through the cores
        rate_batch_np = rate_batch.detach().cpu().numpy()
        batch_result = p.map(test_one_user, zip(user_batch,rate_batch_np))

        count += len(batch_result)

        for re in batch_result:
            result['precision'] += re['precision']/n_test_users
            result['recall'] += re['recall']/n_test_users
            result['ndcg'] += re['ndcg']/n_test_users
            result['hit_ratio'] += re['hit_ratio']/n_test_users
            result['auc'] += re['auc']/n_test_users
    assert count == n_test_users
    p.close()
    return result

Let's see how all comes together! (Note that the process here is **extremely** inefficient since we are splitting a 3000x3000 matrix into 10 folds and using a 32 batch for only 1000 users)

In [10]:
cur_best_pre = 0.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
stopping_step, should_stop = 0, False
for epoch in range(2):
    t1 = time()
    loss = train(model, data_generator, optimizer)
    if epoch % print_every  == (print_every - 1):
        print("Epoch:{} {:.2f}s, Loss = {:.4f}".
            format(epoch, time()-t1, loss))
    if epoch % eval_every  == (eval_every - 1):
        t2 = time()
        res = test(model, data_generator)
        print("VALIDATION.","\n"
            "Epoch: {}, {:.2f}s".format(epoch, time()-t2),"\n",
            "Recall@{}: {:.4f}, Recall@{}: {:.4f}".format(Ks[0], res['recall'][0],  Ks[-1], res['recall'][-1]), "\n"
            "Precision@{}: {:.4f}, Precision@{}: {:.4f}".format(Ks[0], res['precision'][0],  Ks[-1], res['precision'][-1]), "\n"
            "Hit_ratio@{}: {:.4f}, Hit_ratio@{}: {:.4f}".format(Ks[0], res['hit_ratio'][0],  Ks[-1], res['hit_ratio'][-1]), "\n"
            "NDCG@{}: {:.4f}, NDCG@{}: {:.4f}".format(Ks[0], res['ndcg'][0],  Ks[-1], res['ndcg'][-1])
            )
        cur_best_pre, stopping_step, should_stop = \
        early_stopping(res['recall'][0], cur_best_pre, stopping_step)
    if epoch % save_every == (save_every - 1):
        torch.save(model.state_dict(), modelpath)

Epoch:0 95.76s, Loss = 245.9971
VALIDATION. 
Epoch: 0, 0.66s 
 Recall@10: 0.0056, Recall@20: 0.0098 
Precision@10: 0.0032, Precision@20: 0.0029 
Hit_ratio@10: 0.0320, Hit_ratio@20: 0.0570 
NDCG@10: 0.0162, NDCG@20: 0.0224
Epoch:1 95.33s, Loss = 228.0495
VALIDATION. 
Epoch: 1, 0.67s 
 Recall@10: 0.0050, Recall@20: 0.0076 
Precision@10: 0.0032, Precision@20: 0.0026 
Hit_ratio@10: 0.0320, Hit_ratio@20: 0.0510 
NDCG@10: 0.0127, NDCG@20: 0.0175


If you remember, in the notebook `Chapter03_metrics.ipynb` I described another form of testing inspired by the code in [this repo](https://github.com/sh0416/bpr/blob/master/train.py). Let's revisit the code. A full explanation of the code flow is in that notebook

In [11]:
import scipy.sparse as sp

def split_mtx(X, n_folds=10):
    """
    Split a matrix/tensor in n_folds folds
    
    There is some redundancy with the split methods within the 
    NGCF_BPR class...I am ok with that, or almost.
    """
    X_folds = []
    fold_len = X.shape[0]//n_folds
    for i in range(n_folds):
        start = i * fold_len
        if i == n_folds -1:
            end = X.shape[0]
        else:
            end = (i + 1) * fold_len
        X_folds.append(X[start:end])
    return X_folds

# this was named "precision_and_recall_k"
def test_GPU(user_emb, item_emb, R_tr, R_te, Ks):

    tr_folds = split_mtx(R_tr)
    te_folds = split_mtx(R_te)
    ue_folds = split_mtx(user_emb)

    fold_prec, fold_rec = {}, {}
    for ue_fold, tr_fold, te_fold in zip(ue_folds, tr_folds, te_folds):

        result = torch.sigmoid(torch.mm(ue_fold, item_emb.t()))
        test_pred_mask = torch.from_numpy(1 - tr_fold.todense())
        test_true_mask = torch.from_numpy(te_fold.todense())
        if use_cuda:
            test_pred_mask, test_true_mask = test_pred_mask.cuda(), test_true_mask.cuda()
        test_pred = test_pred_mask * result
        test_true = test_true_mask * result

        _, test_indices = torch.topk(test_pred, dim=1, k=max(Ks))
        for k in Ks:
            topk_mask = torch.zeros_like(test_pred)
            source = torch.tensor(1.0).cuda() if use_cuda else torch.tensor(1.0)
            topk_mask.scatter_(dim=1, index=test_indices[:, :k], src=source)
            test_pred_topk = topk_mask * test_pred
            acc_result = (test_pred_topk != 0) & (test_pred_topk == test_true)
            pr_k = acc_result.sum().float() / (user_emb.shape[0] * k)
            rec_k = (acc_result.float().sum(dim=1) / test_true_mask.float().sum(dim=1))
            try:
                fold_prec[k].append(pr_k)
                fold_rec[k].append(rec_k)
            except KeyError:
                fold_prec[k] = [pr_k]
                fold_rec[k] = [rec_k]

    precision, recall = {}, {}
    for k in Ks:
        precision[k] = np.sum(fold_prec[k])
        recall[k] = torch.cat(fold_rec[k]).mean()
    return precision, recall

to use it, one would simply replace `test` with `test_GPU` and: 

In [12]:
pre, rec = test_GPU(
    model.g_embeddings_user, 
    model.g_embeddings_item, 
    data_generator.Rtr, 
    data_generator.Rte, 
    Ks)

In [13]:
pre

{10: tensor(0.0037, device='cuda:0'), 20: tensor(0.0040, device='cuda:0')}