In [None]:
# neural collaborative filtering

In [66]:
import numpy as np 
import pandas as pd 
import scipy.sparse as sp

import os
import time

import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn

In [67]:
def load_all(test_num=100):
    """ We load all the three file here to save time in each epoch. """
    train_data = pd.read_csv(
        '/home/dm/Downloads/github_ncf/ml-1m.train.rating', 
        sep='\t', header=None, names=['user', 'item'], 
        usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

    user_num = train_data['user'].max() + 1
    item_num = train_data['item'].max() + 1

    train_data = train_data.values.tolist()

    # load ratings as a dok matrix
    train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
    for x in train_data:
        train_mat[x[0], x[1]] = 1.0

    test_data = []
    with open('/home/dm/Downloads/github_ncf/ml-1m.test.negative', 'r') as fd:
        line = fd.readline()
        while line != None and line != '':
            arr = line.split('\t')
            u = eval(arr[0])[0]
            test_data.append([u, eval(arr[0])[1]])
            for i in arr[1:]:
                test_data.append([u, int(i)])
            line = fd.readline()
    return train_data, test_data, user_num, item_num, train_mat



In [68]:
class NCFData(data.Dataset):
    def __init__(self, features, num_item, train_mat=None, num_ng=0, is_training=None):
        super(NCFData, self).__init__()
        """ Note that the labels are only useful when training, we thus 
            add them in the ng_sample() function.
        """
        self.features_ps = features
        self.num_item = num_item
        self.train_mat = train_mat
        self.num_ng = num_ng
        self.is_training = is_training
        self.labels = [0 for _ in range(len(features))]

    def ng_sample(self):
        # 在paper中提到，要算objective function的时候需要negative instances
        # 一个postive instance对应4个negative instances(sampled from unobserved interactions)
        # 因此只有train dataset需要negative sample，test dataset只是单单让他变成一个data.Dataset的实例
        assert self.is_training, 'no need to sampling when testing'

        self.features_ng = [] #例如，positive instance is (0,25)，
                              #那么4个negative instances是(0,34)(0,9)(0,1778)(0,44) unobserved interactions
        for x in self.features_ps:
            u = x[0]
            for t in range(self.num_ng):
                j = np.random.randint(self.num_item)
                while (u, j) in self.train_mat:
                    j = np.random.randint(self.num_item)
                self.features_ng.append([u, j])

        labels_ps = [1 for _ in range(len(self.features_ps))]
        labels_ng = [0 for _ in range(len(self.features_ng))]

        self.features_fill = self.features_ps + self.features_ng
        self.labels_fill = labels_ps + labels_ng

    def __len__(self): # custom dataset class need both of _len_ and _getitem_ functions
        return (self.num_ng + 1) * len(self.labels)

    def __getitem__(self, idx):
        features = self.features_fill if self.is_training \
                    else self.features_ps
        labels = self.labels_fill if self.is_training \
                    else self.labels

        user = features[idx][0]
        item = features[idx][1]
        label = labels[idx]
        return user, item ,label

In [69]:
class NCF(nn.Module):
    def __init__(self, user_num, item_num, factor_num, num_layers,
                    dropout, model, GMF_model=None, MLP_model=None):
        super(NCF, self).__init__()
        """
        user_num: number of users;
        item_num: number of items;
        factor_num: number of predictive factors;
        num_layers: the number of layers in MLP model;
        dropout: dropout rate between fully connected layers;
        model: 'MLP', 'GMF', 'NeuMF-end', and 'NeuMF-pre';
        GMF_model: pre-trained GMF weights;
        MLP_model: pre-trained MLP weights.
        """
        self.dropout = dropout
        self.model = model
        self.GMF_model = GMF_model
        self.MLP_model = MLP_model

        self.embed_user_GMF = nn.Embedding(user_num, factor_num)
        self.embed_item_GMF = nn.Embedding(item_num, factor_num)
        self.embed_user_MLP = nn.Embedding(
                user_num, factor_num * (2 ** (num_layers - 1))) #注意这里是num_layers -1
        self.embed_item_MLP = nn.Embedding(
                item_num, factor_num * (2 ** (num_layers - 1)))

        MLP_modules = []
        for i in range(num_layers):
            # 注意这里是num_layers -i 因为输入是user embedding和 item embedding 的拼
            # 所以输入的维度是(1,factor_num * 2 ** (num_layers)， 这里i从0开始，所以维度一样
            input_size = factor_num * (2 ** (num_layers - i)) 
            MLP_modules.append(nn.Dropout(p=self.dropout))
            MLP_modules.append(nn.Linear(input_size, input_size//2))
            MLP_modules.append(nn.ReLU())
        self.MLP_layers = nn.Sequential(*MLP_modules)

        if self.model in ['MLP', 'GMF']:
            predict_size = factor_num 
        else:
            predict_size = factor_num * 2
        self.predict_layer = nn.Linear(predict_size, 1)

        self._init_weight_()

    def _init_weight_(self):
        """ We leave the weights initialization here. """
        if not self.model == 'NeuMF-pre':
            nn.init.normal_(self.embed_user_GMF.weight, std=0.01)
            nn.init.normal_(self.embed_user_MLP.weight, std=0.01)
            nn.init.normal_(self.embed_item_GMF.weight, std=0.01)
            nn.init.normal_(self.embed_item_MLP.weight, std=0.01)

            for m in self.MLP_layers:
                if isinstance(m, nn.Linear):
                    nn.init.xavier_uniform_(m.weight)
            nn.init.kaiming_uniform_(self.predict_layer.weight, 
                                    a=1, nonlinearity='sigmoid')

            for m in self.modules():
                if isinstance(m, nn.Linear) and m.bias is not None:
                    m.bias.data.zero_()
        else:
            # embedding layers
            self.embed_user_GMF.weight.data.copy_(
                            self.GMF_model.embed_user_GMF.weight)
            self.embed_item_GMF.weight.data.copy_(
                            self.GMF_model.embed_item_GMF.weight)
            self.embed_user_MLP.weight.data.copy_(
                            self.MLP_model.embed_user_MLP.weight)
            self.embed_item_MLP.weight.data.copy_(
                            self.MLP_model.embed_item_MLP.weight)

            # mlp layers
            for (m1, m2) in zip(
                self.MLP_layers, self.MLP_model.MLP_layers):
                if isinstance(m1, nn.Linear) and isinstance(m2, nn.Linear):
                    m1.weight.data.copy_(m2.weight)
                    m1.bias.data.copy_(m2.bias)

            # predict layers
            predict_weight = torch.cat([
                self.GMF_model.predict_layer.weight, 
                self.MLP_model.predict_layer.weight], dim=1)
            precit_bias = self.GMF_model.predict_layer.bias + \
                        self.MLP_model.predict_layer.bias

            self.predict_layer.weight.data.copy_(0.5 * predict_weight)
            self.predict_layer.bias.data.copy_(0.5 * precit_bias)

    def forward(self, user, item):
        # model = NeuMF-pre/end 既要算output_GMF也要output_MLP
        if not self.model == 'MLP':
            embed_user_GMF = self.embed_user_GMF(user)
            embed_item_GMF = self.embed_item_GMF(item)
            output_GMF = embed_user_GMF * embed_item_GMF #output dim is (1, factor_num)
        if not self.model == 'GMF':
            embed_user_MLP = self.embed_user_MLP(user) # (1 , factor_num * (2 ** (num_layers - 1)))
            embed_item_MLP = self.embed_item_MLP(item)
            interaction = torch.cat((embed_user_MLP, embed_item_MLP), -1) # dim =-1 相当于 dim = 1, 横着拼
            output_MLP = self.MLP_layers(interaction) # output dim is (1,factor_num)

        if self.model == 'GMF':
            concat = output_GMF
        elif self.model == 'MLP':
            concat = output_MLP
        else:
            concat = torch.cat((output_GMF, output_MLP), -1)

        prediction = self.predict_layer(concat)
        return prediction.view(-1)

In [74]:
def hit(gt_item, pred_items):
    if gt_item in pred_items:
        return 1
    return 0


def ndcg(gt_item, pred_items):
    if gt_item in pred_items:
        index = pred_items.index(gt_item)
        return np.reciprocal(np.log2(index+2))
    return 0


def metrics(model, test_loader, top_k):
    HR, NDCG = [], []

    for user, item, label in test_loader:
        user = user.cuda()
        item = item.cuda()

        predictions = model(user, item)
        _, indices = torch.topk(predictions, top_k)
        recommends = torch.take(
                item, indices).cpu().numpy().tolist()

        gt_item = item[0].item()
        HR.append(hit(gt_item, recommends))
        NDCG.append(ndcg(gt_item, recommends))

    return np.mean(HR), np.mean(NDCG)

In [71]:
def util_data():
    
    num_ng = 4
    batch_size = 256
    test_num_ng = 99


    model_path = './models/'
    GMF_model_path = model_path + 'GMF.pth'
    MLP_model_path = model_path + 'MLP.pth'
    NeuMF_model_path = model_path + 'NeuMF.pth'

    ############################## PREPARE DATASET ##########################
    train_data, test_data, user_num ,item_num, train_mat = load_all()

    # Construct the train and test datasets
    train_dataset = NCFData(train_data, item_num, train_mat, num_ng, True)

    test_dataset = NCFData(test_data, item_num, train_mat, 0, False)

    train_loader = data.DataLoader(train_dataset,batch_size=batch_size, shuffle=True, num_workers=4)

    test_loader = data.DataLoader(test_dataset,batch_size=test_num_ng+1, shuffle=False, num_workers=0)
    return train_loader, test_loader

In [76]:
def main(train_loader, test_loader, model_choose):
    lr = 0.01
    dropout = 0.0
    epochs = 20
    top_k  = 10
    factor_num = 32
    num_layers = 3
    out = True   #save models or not
    MODEL = model_choose  #'MLP', 'GMF', 'NeuMF-end', 'NeuMF-pre'

    model_path = './models/'
    GMF_model_path = model_path + 'GMF.pth'
    MLP_model_path = model_path + 'MLP.pth'
    NeuMF_model_path = model_path + 'NeuMF.pth'

    if MODEL == 'NeuMF-pre':
        #assert os.path.exists(GMF_model_path), 'lack of GMF model'
        #assert os.path.exists(MLP_model_path), 'lack of MLP model'
        GMF_model = torch.load(GMF_model_path)
        MLP_model = torch.load(MLP_model_path)
    else:
        GMF_model = None
        MLP_model = None

    model = NCF(user_num, item_num,factor_num,num_layers, dropout, MODEL, GMF_model, MLP_model)
    model.cuda()
    loss_function = nn.BCEWithLogitsLoss() #Binary Cross Entropy

    # choose the optimizer
    if MODEL == 'NeuMF-pre':
        optimizer = optim.SGD(model.parameters(), lr=lr)
    else:
        optimizer = optim.Adam(model.parameters(), lr=lr)

    ########################### TRAINING #####################################
    count, best_hr = 0, 0
    for epoch in range(epochs):
        model.train() # Enable dropout (if have).???????????
        start_time = time.time()
        train_loader.dataset.ng_sample() #??

        for user, item, label in train_loader:
            user = user.cuda()
            item = item.cuda()
            label = label.float().cuda()

            model.zero_grad()
            prediction = model(user, item)
            loss = loss_function(prediction, label)
            loss.backward()
            optimizer.step()
            # writer.add_scalar('data/loss', loss.item(), count)
            count += 1

        model.eval()
        HR, NDCG = metrics(model, test_loader, top_k)

        elapsed_time = time.time() - start_time
        print("The time elapse of epoch {:03d}".format(epoch) + " is: " + 
                time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
        print("HR: {:.3f}\tNDCG: {:.3f}".format(np.mean(HR), np.mean(NDCG)))

        if HR > best_hr:
            best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
            if out:
                if not os.path.exists(model_path):
                    os.mkdir(model_path)
                torch.save(model, 
                    '{}{}.pth'.format(model_path, MODEL))

        print("End. Best epoch {:03d}: HR = {:.3f}, NDCG = {:.3f}".format(best_epoch, best_hr, best_ndcg))

In [64]:
train_loader, test_loader = util_data()


In [77]:
main(train_loader, test_loader, 'GMF')

The time elapse of epoch 000 is: 00: 00: 52
HR: 0.614	NDCG: 0.351
End. Best epoch 000: HR = 0.614, NDCG = 0.351


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


The time elapse of epoch 001 is: 00: 00: 52
HR: 0.645	NDCG: 0.375
End. Best epoch 001: HR = 0.645, NDCG = 0.375
The time elapse of epoch 002 is: 00: 00: 50
HR: 0.658	NDCG: 0.384
End. Best epoch 002: HR = 0.658, NDCG = 0.384
The time elapse of epoch 003 is: 00: 00: 53
HR: 0.657	NDCG: 0.383
End. Best epoch 002: HR = 0.658, NDCG = 0.384
The time elapse of epoch 004 is: 00: 00: 51
HR: 0.659	NDCG: 0.391
End. Best epoch 004: HR = 0.659, NDCG = 0.391
The time elapse of epoch 005 is: 00: 00: 52
HR: 0.666	NDCG: 0.393
End. Best epoch 005: HR = 0.666, NDCG = 0.393
The time elapse of epoch 006 is: 00: 00: 49
HR: 0.679	NDCG: 0.404
End. Best epoch 006: HR = 0.679, NDCG = 0.404
The time elapse of epoch 007 is: 00: 00: 52
HR: 0.670	NDCG: 0.397
End. Best epoch 006: HR = 0.679, NDCG = 0.404
The time elapse of epoch 008 is: 00: 00: 53
HR: 0.678	NDCG: 0.407
End. Best epoch 006: HR = 0.679, NDCG = 0.404
The time elapse of epoch 009 is: 00: 00: 51
HR: 0.670	NDCG: 0.397
End. Best epoch 006: HR = 0.679, NDCG 

In [78]:
main(train_loader, test_loader, 'MLP')

The time elapse of epoch 000 is: 00: 01: 12
HR: 0.524	NDCG: 0.285
End. Best epoch 000: HR = 0.524, NDCG = 0.285
The time elapse of epoch 001 is: 00: 01: 12
HR: 0.581	NDCG: 0.320
End. Best epoch 001: HR = 0.581, NDCG = 0.320
The time elapse of epoch 002 is: 00: 01: 11
HR: 0.609	NDCG: 0.345
End. Best epoch 002: HR = 0.609, NDCG = 0.345
The time elapse of epoch 003 is: 00: 01: 10
HR: 0.631	NDCG: 0.356
End. Best epoch 003: HR = 0.631, NDCG = 0.356
The time elapse of epoch 004 is: 00: 01: 13
HR: 0.648	NDCG: 0.371
End. Best epoch 004: HR = 0.648, NDCG = 0.371
The time elapse of epoch 005 is: 00: 01: 14
HR: 0.639	NDCG: 0.367
End. Best epoch 004: HR = 0.648, NDCG = 0.371
The time elapse of epoch 006 is: 00: 01: 14
HR: 0.661	NDCG: 0.379
End. Best epoch 006: HR = 0.661, NDCG = 0.379
The time elapse of epoch 007 is: 00: 01: 15
HR: 0.656	NDCG: 0.375
End. Best epoch 006: HR = 0.661, NDCG = 0.379
The time elapse of epoch 008 is: 00: 01: 14
HR: 0.656	NDCG: 0.383
End. Best epoch 006: HR = 0.661, NDCG 