# 1. Config

In [1]:
import os
import urllib.request

# Benchmark to train on; only the two names listed in the assert are accepted.
dataset = 'ml-1m'
assert dataset in ['ml-1m', 'pinterest-20']

# The downloaded files are stored under ./data, so make sure it is there.
if not os.path.exists('./data'):
    os.makedirs('./data')

# Raw-file root of the GitHub repository that hosts the datasets.
base_url = 'https://raw.githubusercontent.com/hexiangnan/neural_collaborative_filtering/master/Data/'

# Remote location of each of the three dataset files.
train_rating_url = f'{base_url}{dataset}.train.rating'
test_rating_url = f'{base_url}{dataset}.test.rating'
test_negative_url = f'{base_url}{dataset}.test.negative'

# Where each file is kept locally.
train_rating = f'./data/{dataset}.train.rating'
test_rating = f'./data/{dataset}.test.rating'
test_negative = f'./data/{dataset}.test.negative'

# Function to download and save files
def download_dataset(url, file_path):
    """Download `url` to `file_path` unless `file_path` already exists.

    url: location to fetch (anything `urllib.request.urlretrieve` accepts).
    file_path: local destination.

    The payload is first written to `file_path + '.part'` and only renamed to
    `file_path` once the transfer has finished. An interrupted download
    therefore never leaves a truncated file that a later run would mistake
    for a complete one. Any error raised by the download is re-raised after
    the temporary file has been removed.
    """
    if os.path.exists(file_path):
        print(f"{file_path} already exists.")
        return

    print(f"Downloading {file_path}...")
    partial_path = file_path + '.part'
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Atomic rename: file_path only ever appears fully written.
        os.replace(partial_path, file_path)
    except BaseException:
        # Also covers KeyboardInterrupt, the usual way a notebook download is aborted.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print(f"Saved to {file_path}")

# Fetch the three dataset files, pairing each remote URL with its local path.
for remote_url, local_file in (
        (train_rating_url, train_rating),
        (test_rating_url, test_rating),
        (test_negative_url, test_negative)):
    download_dataset(remote_url, local_file)

# Folder for saved models; created if it is not there yet.
model_path = './models/'
if not os.path.exists(model_path):
    os.makedirs(model_path)
# NOTE(review): the file name says NeuMF although the variable is for the BPR
# model, and the training cell saves under 'BPR.pt' instead -- confirm which
# name is intended.
BPR_model_path = f'{model_path}NeuMF.pth'

print("Datasets downloaded and paths are set.")

Downloading ./data/ml-1m.train.rating...
Saved to ./data/ml-1m.train.rating
Downloading ./data/ml-1m.test.rating...
Saved to ./data/ml-1m.test.rating
Downloading ./data/ml-1m.test.negative...
Saved to ./data/ml-1m.test.negative
Datasets downloaded and paths are set.


# 2. Model

In [2]:
import torch
import torch.nn as nn


class BPR(nn.Module):
	"""Scores user-item pairs as the dot product of two learned embeddings."""

	def __init__(self, user_num, item_num, factor_num):
		"""
		user_num: number of users;
		item_num: number of items;
		factor_num: number of predictive factors.
		"""
		super().__init__()
		self.embed_user = nn.Embedding(user_num, factor_num)
		self.embed_item = nn.Embedding(item_num, factor_num)

		# Re-initialise both tables from a zero-mean normal with std 0.01.
		for table in (self.embed_user, self.embed_item):
			nn.init.normal_(table.weight, std=0.01)

	def forward(self, user, item_i, item_j):
		"""Return the scores of `user` for `item_i` and for `item_j`.

		All three arguments are index tensors; each score is the sum over the
		last dimension of the element-wise product of the looked-up embeddings.
		"""
		user_vec = self.embed_user(user)
		vec_i = self.embed_item(item_i)
		vec_j = self.embed_item(item_j)

		score_i = torch.sum(user_vec * vec_i, dim=-1)
		score_j = torch.sum(user_vec * vec_j, dim=-1)
		return score_i, score_j

# 3. Data utils

In [3]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

import torch.utils.data as data

def load_all(test_num=100, train_path=None, test_negative_path=None):
	"""Load the training interactions and the test candidates in one go.

	Everything is read up front so that no file access is needed per epoch.

	test_num: not used by this function; kept so existing calls keep working.
	train_path: tab-separated training file whose first two columns are user
		and item ids. Defaults to the notebook-level `train_rating` path.
	test_negative_path: tab-separated test file. The first field of each line
		is a "(user, item)" literal, the remaining fields are item ids.
		Defaults to the notebook-level `test_negative` path.

	Returns (train_data, test_data, user_num, item_num, train_mat):
		train_data: list of [user, item] pairs from the training file;
		test_data: list of [user, item] pairs; for every test line the pair
			from the first field comes first, followed by one pair per
			remaining field;
		user_num, item_num: largest training user / item id plus one (int);
		train_mat: scipy DOK matrix of shape (user_num, item_num) holding 1.0
			at every training (user, item) position.
	"""
	import ast  # local import: only used to parse the "(user, item)" field

	if train_path is None:
		train_path = train_rating
	if test_negative_path is None:
		test_negative_path = test_negative

	train_data = pd.read_csv(
		train_path,
		sep='\t', header=None, names=['user', 'item'],
		usecols=[0, 1], dtype={0: np.int32, 1: np.int32})

	# Convert to plain ints so callers do not receive numpy scalars.
	# Assumes ids are contiguous and 0-based, so max + 1 is the count -- TODO confirm.
	user_num = int(train_data['user'].max()) + 1
	item_num = int(train_data['item'].max()) + 1

	train_data = train_data.values.tolist()

	# load ratings as a dok matrix
	train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
	for u, i in train_data:
		train_mat[u, i] = 1.0

	test_data = []
	with open(test_negative_path, 'r') as fd:
		for line in fd:
			line = line.strip()
			if not line:
				# Tolerate blank lines (e.g. a trailing newline at end of file).
				continue
			fields = line.split('\t')
			# The file is downloaded content: literal_eval only accepts Python
			# literals, whereas eval() would execute arbitrary expressions.
			u, gt_item = ast.literal_eval(fields[0])
			test_data.append([u, gt_item])
			test_data.extend([u, int(neg)] for neg in fields[1:])
	return train_data, test_data, user_num, item_num, train_mat


class BPRData(data.Dataset):
	"""Dataset yielding (user, item_i, item_j) triples.

	When `is_training` is truthy the triples are the ones built by
	`ng_sample()`; otherwise each [user, item] row of `features` is returned
	with the item repeated in the third slot.
	"""

	def __init__(self, features, num_item, train_mat=None, num_ng=0, is_training=None):
		""" Note that the third element of each triple is only needed when
			training, so it is produced in ng_sample() rather than here.
		"""
		super().__init__()
		self.is_training = is_training
		self.num_ng = num_ng
		self.train_mat = train_mat
		self.num_item = num_item
		self.features = features

	def ng_sample(self):
		"""Rebuild `features_fill` with `num_ng` sampled triples per feature row."""
		assert self.is_training, 'no need to sampling when testing'

		self.features_fill = []
		fill = self.features_fill
		for row in self.features:
			u, i = row[0], row[1]
			for _ in range(self.num_ng):
				# Keep drawing until (u, j) is not an entry of train_mat.
				while True:
					j = np.random.randint(self.num_item)
					if (u, j) not in self.train_mat:
						break
				fill.append([u, i, j])

	def __len__(self):
		"""Number of samples: num_ng per feature row in training mode, else one."""
		if self.is_training:
			return self.num_ng * len(self.features)
		return len(self.features)

	def __getitem__(self, idx):
		"""Return the idx-th (user, item_i, item_j) triple."""
		if self.is_training:
			row = self.features_fill[idx]
			return row[0], row[1], row[2]
		# Outside training there is no sampled item, so item_i doubles as item_j.
		row = self.features[idx]
		return row[0], row[1], row[1]

# 4. Evaluate

In [4]:
import numpy as np
import torch

def hit(gt_item, pred_items):
    """Return 1 when `gt_item` is among `pred_items`, otherwise 0."""
    return 1 if gt_item in pred_items else 0

def ndcg(gt_item, pred_items):
    """Return 1 / log2(rank + 2) for the 0-based rank of `gt_item` in
    `pred_items`, or 0 when `gt_item` is not in the list.
    """
    if gt_item not in pred_items:
        return 0
    rank = pred_items.index(gt_item)
    return np.reciprocal(np.log2(rank + 2))

def metrics(model, test_loader, top_k, device):
    """Compute mean hit ratio and mean NDCG over the batches of `test_loader`.

    model: callable as model(user, item_i, item_j), returning two score tensors.
    test_loader: iterable of (user, item_i, item_j) tensor batches. The first
        element of each batch is taken as the ground-truth item, so every
        batch is expected to hold one user's candidates with the true item
        first.
    top_k: length of the recommendation list.
    device: device the batch tensors are moved to before the forward pass.

    Returns (mean HR, mean NDCG) across batches.
    """
    HR, NDCG = [], []

    # No gradients are needed for evaluation; without no_grad() every batch
    # would build an autograd graph, wasting memory and time.
    with torch.no_grad():
        for user, item_i, item_j in test_loader:
            # Move tensors to the specified device (CPU or GPU)
            user = user.to(device)
            item_i = item_i.to(device)
            item_j = item_j.to(device)  # only needed to satisfy the model signature

            # Forward pass; only the item_i scores are used for ranking
            prediction_i, _ = model(user, item_i, item_j)

            # torch.topk raises when k exceeds the number of candidates, so clamp it
            k = min(top_k, prediction_i.numel())
            _, indices = torch.topk(prediction_i, k)
            recommends = torch.take(item_i, indices).cpu().numpy().tolist()

            gt_item = item_i[0].item()  # Ground truth item
            HR.append(hit(gt_item, recommends))
            NDCG.append(ndcg(gt_item, recommends))

    return np.mean(HR), np.mean(NDCG)

# 5. Main

In [5]:
!pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


## Setup

In [6]:
import os
import time
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.backends.cudnn as cudnn
from tensorboardX import SummaryWriter

# Hyper-parameters and run options used by the cells below.
args = dict(
    lr=0.01,            # learning rate handed to the optimizer
    lamda=0.001,        # handed to the optimizer as weight_decay
    batch_size=4096,    # training batch size
    epochs=50,          # number of training epochs
    top_k=10,           # length of the recommendation list at evaluation
    factor_num=32,      # embedding size of the model
    num_ng=4,           # sampled negatives per training interaction
    test_num_ng=99,     # test batch size is test_num_ng + 1
    out=True,           # save the model whenever HR improves
    gpu="0",
)

# Run on CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    cudnn.benchmark = True
else:
    device = torch.device("cpu")

## Prepare dataset

In [7]:
# Read the training interactions and the test candidates from disk once.
train_data, test_data, user_num, item_num, train_mat = load_all()

# Training set: triples are sampled later via ng_sample(), num_ng per interaction.
train_dataset = BPRData(
    features=train_data, num_item=item_num, train_mat=train_mat,
    num_ng=args["num_ng"], is_training=True)
# Test set: rows are used as they are, no sampling.
test_dataset = BPRData(
    features=test_data, num_item=item_num, train_mat=train_mat,
    num_ng=0, is_training=False)

train_loader = data.DataLoader(
    dataset=train_dataset, batch_size=args["batch_size"],
    shuffle=True, num_workers=4)
# Unshuffled batches of test_num_ng + 1 rows; presumably one batch per test
# user (true item followed by its negatives) -- verify against the test file.
test_loader = data.DataLoader(
    dataset=test_dataset, batch_size=args["test_num_ng"] + 1,
    shuffle=False, num_workers=0)



## Create model

In [8]:
# Build the model and place it on the selected device.
model = BPR(user_num, item_num, args["factor_num"]).to(device)

# Plain SGD; args["lamda"] is applied as weight decay.
optimizer = optim.SGD(
    params=model.parameters(),
    lr=args["lr"],
    weight_decay=args["lamda"],
)
# TensorBoard event writer for the training loss.
writer = SummaryWriter()

## Training

In [9]:
# count: global step for the loss curve; best_*: best evaluation result so far.
count, best_hr = 0, 0
# Defined up front so the final summary cannot raise NameError when no epoch
# ever beats the initial best_hr.
best_ndcg, best_epoch = 0, -1
for epoch in range(args["epochs"]):
    model.train()
    start_time = time.time()
    # Draw a fresh set of negative items for this epoch.
    train_loader.dataset.ng_sample()

    for user, item_i, item_j in train_loader:
        user = user.to(device)
        item_i = item_i.to(device)
        item_j = item_j.to(device)

        model.zero_grad()
        prediction_i, prediction_j = model(user, item_i, item_j)
        # logsigmoid is the numerically stable form of sigmoid().log(): the
        # latter yields log(0) = -inf once the score gap becomes very negative.
        loss = -nn.functional.logsigmoid(prediction_i - prediction_j).sum()
        loss.backward()
        optimizer.step()
        writer.add_scalar('data/loss', loss.item(), count)
        count += 1

    model.eval()
    HR, NDCG = metrics(model, test_loader, top_k=args["top_k"], device=device)

    # The reported time covers both training and evaluation of the epoch.
    elapsed_time = time.time() - start_time
    print("The time elapsed of epoch {:03d}".format(epoch) + " is: " +
          time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
    # metrics() already returns means, so the values are printed directly.
    print("HR: {:.3f}\tNDCG: {:.3f}".format(HR, NDCG))

    # Keep (and optionally save) the model with the best hit ratio so far.
    if HR > best_hr:
        best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
        if args["out"]:
            os.makedirs(model_path, exist_ok=True)
            # os.path.join avoids the doubled slash when model_path ends in '/'.
            torch.save(model, os.path.join(model_path, 'BPR.pt'))

# Built on one line: a backslash continuation inside the literal would embed
# the next line's indentation in the printed text.
print("End. Best epoch {:03d}: HR = {:.3f}, NDCG = {:.3f}".format(
    best_epoch, best_hr, best_ndcg))



The time elapsed of epoch 000 is: 00:00:37
HR: 0.440	NDCG: 0.246
The time elapsed of epoch 001 is: 00:00:39
HR: 0.454	NDCG: 0.252
The time elapsed of epoch 002 is: 00:00:38
HR: 0.487	NDCG: 0.274
The time elapsed of epoch 003 is: 00:00:37
HR: 0.524	NDCG: 0.294
The time elapsed of epoch 004 is: 00:00:36
HR: 0.561	NDCG: 0.313
The time elapsed of epoch 005 is: 00:00:37
HR: 0.583	NDCG: 0.328
The time elapsed of epoch 006 is: 00:00:37
HR: 0.601	NDCG: 0.341
The time elapsed of epoch 007 is: 00:00:39
HR: 0.623	NDCG: 0.355
The time elapsed of epoch 008 is: 00:00:37
HR: 0.631	NDCG: 0.363
The time elapsed of epoch 009 is: 00:00:39
HR: 0.644	NDCG: 0.371
The time elapsed of epoch 010 is: 00:00:37
HR: 0.651	NDCG: 0.379
The time elapsed of epoch 011 is: 00:00:39
HR: 0.660	NDCG: 0.384
The time elapsed of epoch 012 is: 00:00:37
HR: 0.668	NDCG: 0.389
The time elapsed of epoch 013 is: 00:00:39
HR: 0.673	NDCG: 0.394
The time elapsed of epoch 014 is: 00:00:39
HR: 0.676	NDCG: 0.396
The time elapsed of epoch