In [1]:
import time
import numpy as np
from numpy import random


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import math
import heapq 

import multiprocessing as mp
import argparse

In [2]:
class MFbpr(nn.Module):
    '''
    BPR learning for MF model

    Matrix-factorisation recommender trained with the Bayesian Personalized
    Ranking loss: for every (user, positive item, negative item) triple the
    model is pushed to score the positive item above the negative one.

    The ``dataset`` object must expose ``train``, ``test``, ``num_user``,
    ``num_item`` and ``neg``; ``train[u]`` is a sequence of entries whose
    first element is an item id.

    NOTE: the ``train`` attribute shadows ``nn.Module.train()``. It is kept
    for backward compatibility, so ``model.train()`` / ``model.eval()`` must
    not be called on instances of this class.
    '''
    def __init__(self, dataset, factors, learning_rate, reg, init_mean, init_stdev):
        super(MFbpr, self).__init__()
        self.dataset = dataset
        self.train = dataset.train
        self.test = dataset.test
        self.num_user = dataset.num_user
        self.num_item = dataset.num_item
        self.neg = dataset.neg
        self.factors = factors
        self.learning_rate = learning_rate
        self.reg = reg
        self.init_mean = init_mean
        self.init_stdev = init_stdev

        # user & item latent vectors
        # The tensors are created on the target device *before* being wrapped
        # in nn.Parameter. Re-assigning `self.U = self.U.cuda()` afterwards is
        # rejected by nn.Module (a plain tensor cannot replace a registered
        # parameter) and would hand the optimizer a non-leaf tensor.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.U = nn.Parameter(self._init_factors(self.num_user).to(device))
        self.V = nn.Parameter(self._init_factors(self.num_item).to(device))
        # optim
        self.mf_optim = optim.Adam([self.U, self.V], lr=self.learning_rate)

        # Per-user set of training items (used to reject sampled negatives).
        self.items_of_user = [set(entry[0] for entry in ratings) for ratings in self.train]
        self.num_rating = sum(len(ratings) for ratings in self.train)     # number of ratings

    def _init_factors(self, rows):
        '''Draw a (rows, factors) matrix from N(init_mean, init_stdev).'''
        return torch.normal(mean=self.init_mean * torch.ones(rows, self.factors), std=self.init_stdev)

    @staticmethod
    def _as_index(batch, device):
        '''Move a batch of indices to ``device``.

        A single-element list/tuple wrapper is unwrapped first: that is what
        the default collate function produces when the dataset yields the
        positive item as a one-element list.
        '''
        if isinstance(batch, (list, tuple)) and len(batch) == 1:
            batch = batch[0]
        return batch.to(device, non_blocking=True)

    def forward(self, u, i, j):
        '''
        Score a batch of (user, positive, negative) index triples.

        Returns ``(y_ui, y_uj, loss)`` where ``y_ui`` / ``y_uj`` are the dot
        products of the user vectors with the positive / negative item
        vectors, and ``loss`` is the L2 regulariser minus the summed
        log2-sigmoid of ``y_ui - y_uj``.
        '''
        user_vec = self.U[u]
        pos_vec = self.V[i]
        neg_vec = self.V[j]
        # Row-wise dot products; same values as diag(U[u] @ V[i].T) without
        # materialising the full batch x batch matrix.
        y_ui = (user_vec * pos_vec).sum(dim=-1)
        y_uj = (user_vec * neg_vec).sum(dim=-1)
        regularizer = self.reg * (torch.sum(user_vec ** 2) + torch.sum(pos_vec ** 2) + torch.sum(neg_vec ** 2))
        # logsigmoid is the numerically stable form of log(sigmoid(x));
        # dividing by ln(2) keeps the original base-2 scale of the loss.
        loss = regularizer - torch.sum(F.logsigmoid(y_ui - y_uj)) / math.log(2)
        return y_ui, y_uj, loss

    def build_model(self, epoch=30, num_thread=4, batch_size=32):
        '''
        Train for ``epoch`` passes over ``self.dataset``.

        Every 20th epoch the model is evaluated with ``evaluate_model`` on
        ``self.test`` (top-20 hit ratio and NDCG) using ``num_thread`` workers.
        '''
        device = self.U.device
        # Pinned host memory only helps when batches are copied to a GPU.
        data_loader = DataLoader(self.dataset, batch_size=batch_size, pin_memory=(device.type == 'cuda'))
        print("Training MF-BPR with: learning_rate=%.4f, regularization=%.4f, factors=%d, #epoch=%d, batch_size=%d."
              % (self.learning_rate, self.reg, self.factors, epoch, batch_size))
        t1 = time.time()
        iter_loss = 0
        for epoc in range(epoch):
            for users, items_pos, items_neg in data_loader:
                self.mf_optim.zero_grad()
                # Move the index batches to wherever the parameters live.
                y_ui, y_uj, loss = self.forward(self._as_index(users, device),
                                                self._as_index(items_pos, device),
                                                self._as_index(items_neg, device))
                # Detach so the running sum does not keep autograd history alive.
                iter_loss += loss.detach()
                loss.backward()
                self.mf_optim.step()

            if epoc % 20 == 19:
                t2 = time.time()
                topK = 20
                (hits, ndcgs) = evaluate_model(self, self.test, topK, num_thread)
                hr_mean = np.array(hits).mean()
                ndcg_mean = np.array(ndcgs).mean()

                print("Epoch=%d [%.1f s] HitRatio@%d = %.4f, NDCG@%d = %.4f [%.1f s]"
                      % (epoc, (t2 - t1) / 20, topK, hr_mean, topK, ndcg_mean, time.time() - t2))
                t1 = time.time()
                iter_loss = 0

    def predict(self, u, i):
        '''Return the score of item ``i`` for user ``u`` as a CPU tensor.'''
        return torch.matmul(self.U[u].detach().cpu(), self.V[i].detach().cpu())

    def get_batch(self, batch_size):
        '''
        Sample ``batch_size`` random (user, positive item, negative item) triples.

        Users without any training item, or whose training items already cover
        every item id, cannot yield a triple and are re-drawn.

        Raises:
            ValueError: if a non-empty batch is requested but there is no
                training rating at all.
        '''
        if batch_size > 0 and self.num_rating == 0:
            raise ValueError("cannot sample a batch: the training set is empty")
        users, pos_items, neg_items = [], [], []
        while len(users) < batch_size:
            u = np.random.randint(0, self.num_user)
            ratings = self.train[u]
            seen = self.items_of_user[u]
            if not ratings or len(seen) >= self.num_item:
                continue
            pos = ratings[np.random.randint(0, len(ratings))][0]
            neg = np.random.randint(0, self.num_item)
            while neg in seen:
                neg = np.random.randint(0, self.num_item)
            users.append(u)
            pos_items.append(pos)
            neg_items.append(neg)
        return (users, pos_items, neg_items)

In [3]:
def LoadRatingFile_HoldKOut(filename, splitter, K):
    """
    Load a rating file and hold out the K most recent ratings of every user.

    Each usable line must split (on ``splitter``) into at least four fields;
    fields 0, 1 and 3 are parsed as user id, item id and timestamp. Lines
    with fewer fields are skipped.

    Returns:
        (train, test, num_user, num_item, num_ratings) where
        ``train[u]`` is the list of ``[item, time]`` pairs kept for user
        ``u`` (sorted by time), ``test`` is a list of ``[user, item, time]``
        triples sorted by time, ``num_user`` is ``len(train)``, ``num_item``
        is the largest item id plus one and ``num_ratings`` counts every
        parsed rating (before the hold-out split).
    """
    train = []
    test = []

    num_ratings = 0
    num_item = 0
    with open(filename, "r") as f:
        # Iterating the file advances on every pass, so skipping a malformed
        # line with `continue` can no longer spin forever on the same line.
        for line in f:
            arr = line.split(splitter)
            if len(arr) < 4:
                continue
            user, item, timestamp = int(arr[0]), int(arr[1]), int(arr[3])
            # Pad with empty lists so gaps in the user ids do not raise IndexError.
            while len(train) <= user:
                train.append([])
            train[user].append([item, timestamp])
            num_ratings += 1
            num_item = max(item, num_item)
    num_user = len(train)
    num_item = num_item + 1

    def getTime(entry):
        # The timestamp is the last field of both train and test entries.
        return entry[-1]

    for ratings in train:
        ratings.sort(key=getTime)

    for u, ratings in enumerate(train):
        for _ in range(K):
            if not ratings:
                break
            # Most recent rating sits at the end after the sort above.
            item, timestamp = ratings.pop()
            test.append([u, item, timestamp])

    test.sort(key=getTime)

    return train, test, num_user, num_item, num_ratings


class Pinterest(Dataset):
    """
    Implicit-feedback dataset read from ``<dir>pos.txt`` and ``<dir>neg.txt``.

    ``dir`` is used as a plain string prefix, so it must end with a path
    separator. ``K`` is accepted for signature compatibility but is not used.

    Attributes:
        train: ``train[u]`` is a list of one-element lists ``[item]`` with the
            positive items of user ``u`` read from ``pos.txt``.
        test: list of ``[user, item]`` pairs; the item is the first id on the
            user's line in ``neg.txt`` (users are numbered by line order).
        neg: ``neg[u]`` holds every id on that line, including the first one.
        num_user, num_item, num_ratings: sizes derived from ``pos.txt``.
    """
    def __init__(self, dir, splitter, K):
        self.train = []

        self.num_ratings = 0
        self.num_item = 0
        with open(dir+'pos.txt', "r") as f:
            # Iterating the file always advances, so skipping a short line
            # cannot loop forever on the same line.
            for line in f:
                arr = line.split(splitter)
                if len(arr) < 2:
                    continue
                user, item = int(arr[0]), int(arr[1])
                # Pad with empty lists so gaps in the user ids do not raise IndexError.
                while len(self.train) <= user:
                    self.train.append([])
                self.train[user].append([item])
                self.num_ratings += 1
                self.num_item = max(item, self.num_item)
        self.num_user = len(self.train)
        self.num_item = self.num_item + 1

        self.test = []
        self.neg = dict()
        user = 0
        with open(dir+'neg.txt', 'r') as f_neg:
            for line in f_neg:
                # A whitespace-only line carries no ids and is not a user row.
                if not line.strip():
                    continue
                arr = line.split(splitter)
                pos = int(arr[0])
                self.test.append([user, pos])
                # Skip blank tokens (e.g. the "\n" left by a trailing splitter).
                self.neg[user] = [int(token) for token in arr if token.strip()]
                user += 1
        print("#users: %d, #items: %d, #ratings: %d" %(self.num_user, self.num_item, self.num_ratings))

    def __len__(self):
        """One sample per user."""
        return self.num_user

    def __getitem__(self, idx):
        """
        Sample a ``(user, positive item, negative item)`` triple for user ``idx``.

        The positive is drawn uniformly from the user's training items and is
        returned as a plain int so that batches collate into tensors. The
        negative is drawn uniformly from the item ids the user has not
        interacted with.

        Raises:
            ValueError: if the user has no training item, or has interacted
                with every item id (no negative can be drawn).
        """
        u = idx
        positives = self.train[u]
        if not positives:
            raise ValueError("user %d has no training items to sample from" % u)
        # train entries are one-element lists; compare and return raw item ids.
        seen = {entry[0] for entry in positives}
        if len(seen) >= self.num_item:
            raise ValueError("user %d has interacted with every item; no negative available" % u)
        i = positives[np.random.randint(0, len(positives))][0]
        j = np.random.randint(0, self.num_item)
        while j in seen:
            j = np.random.randint(0, self.num_item)

        return (u, i, j)

# NOTE: the dataset is not moved to the GPU here. Pinterest keeps its data in
# plain Python lists/dicts and defines no `.cuda()` method, and the previous
# placeholder call (undefined K value, dummy directory) could not run.
# The dataset is instantiated in the entry-point cell with a real data
# directory; the `dir` prefix must end with a path separator, e.g. "./data/".


In [4]:
# Global variables that are shared across processes
# (worker processes only see these values when they are created with the
# "fork" start method, which copies the parent's memory).
_model = None
_testRatings = None
_K = None

def _can_fork_workers(model, num_thread):
    """Tell whether evaluation can safely run in a forked process pool."""
    if num_thread is not None and num_thread <= 1:
        return False
    # With "spawn"/"forkserver" the workers would not inherit the globals above.
    if mp.get_start_method() != "fork":
        return False
    # CUDA cannot be used in a forked child once the parent has initialised it.
    parameters = getattr(model, "parameters", None)
    if callable(parameters) and any(p.is_cuda for p in parameters()):
        return False
    return True

def evaluate_model(model, testRatings, K, num_thread):
    """
    Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation
    Return: score of each test rating.

    The ratings are scored by ``eval_one_rating``. A process pool with
    ``num_thread`` workers is used when that is safe (fork start method,
    model parameters not on CUDA, more than one worker requested);
    otherwise the ratings are scored one by one in the current process.
    """
    global _model
    global _testRatings
    global _K
    _model = model
    _testRatings = testRatings
    _K = K
    num_rating = len(testRatings)

    if _can_fork_workers(model, num_thread):
        # The context manager shuts the pool down even if a worker raises.
        with mp.Pool(processes=num_thread) as pool:
            res = pool.map(eval_one_rating, range(num_rating))
    else:
        res = [eval_one_rating(idx) for idx in range(num_rating)]

    hits = [r[0] for r in res]
    ndcgs = [r[1] for r in res]
    return (hits, ndcgs)

def eval_one_rating(idx):
    """
    Score one test rating and return its ``(hit_ratio, ndcg)`` at top-K.

    Reads the module-level ``_model``, ``_testRatings`` and ``_K`` set by
    ``evaluate_model``. The candidates are the items in ``_model.neg[u]``;
    the K best-scoring ones form the rank list that is checked against the
    ground-truth item.
    """
    rating = _testRatings[idx]
    u = rating[0]
    gtItem = rating[1]
    map_item_score = {}

    # Get the score of the test item first
    maxScore = float(_model.predict(u, gtItem))

    # Early stopping once K candidates score strictly higher than the test
    # item: it can then no longer appear in the top-K list.
    countLarger = 0
    for i in _model.neg[u]:
        score = float(_model.predict(u, i))
        map_item_score[i] = score

        if score > maxScore:
            countLarger += 1
            if countLarger >= _K:
                return (0, 0)

    # Generate topK rank list
    # Plain-Python selection: runs on machines without CUDA, handles an empty
    # candidate list and tolerates fewer than K candidates.
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)

    return (hr, ndcg)

def getHitRatio(ranklist, gtItem):
    """Return 1 if ``gtItem`` appears anywhere in ``ranklist``, else 0."""
    found = any(candidate == gtItem for candidate in ranklist)
    return 1 if found else 0

def getNDCG(ranklist, gtItem):
    """
    Return log(2) / log(position + 2) for the first 0-based position at
    which ``gtItem`` occurs in ``ranklist``, or 0 when it does not occur.
    """
    position = next(
        (rank for rank in range(len(ranklist)) if ranklist[rank] == gtItem),
        None,
    )
    if position is None:
        return 0
    return math.log(2) / math.log(position + 2)


In [None]:
def parse_args():
    """Return the fixed run settings (batch size and learning rate) as a Namespace."""
    return argparse.Namespace(batch_size=32, learning_rate=0.0003)

if __name__ == '__main__':
    import os  # needed only here, to make sure the output directory exists

    args = parse_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load data
    dataset = "./data/"
    splitter = " "
    hold_k_out = 1
    pinterest = Pinterest(dataset, splitter, hold_k_out)

    # MFbpr parameters
    factors = 64
    learning_rate = args.learning_rate
    reg = 0.01
    init_mean = 0
    init_stdev = 0.01
    epoch = 30
    batch_size = args.batch_size
    num_thread = mp.cpu_count()
    print("#factors: %d, lr: %f, reg: %f, batch_size: %d" % (factors, learning_rate, reg, batch_size))

    # Create the output directory before training so that a missing folder
    # cannot make np.save fail after the whole run has finished.
    os.makedirs("out", exist_ok=True)

    # Run model
    bpr = MFbpr(pinterest,
                factors, learning_rate, reg, init_mean, init_stdev).to(device)
    bpr.build_model(epoch, num_thread, batch_size=batch_size)

    # save model
    np.save("out/u"+str(learning_rate)+".npy", bpr.U.detach().cpu().numpy())
    np.save("out/v"+str(learning_rate)+".npy", bpr.V.detach().cpu().numpy())


#users: 14470, #items: 9396, #ratings: 50000
#factors: 64, lr: 0.000300, reg: 0.010000, batch_size: 32
Training MF-BPR with: learning_rate=0.0003, regularization=0.0100, factors=64, #epoch=10000, batch_size=32.
