### baseline

In [6]:
import numpy as np
import pandas as pd
import random as rd
import collections
import time
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from used_metric import get_performance

r_path = './douban/'

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [8]:
class Data(Dataset):
    def __init__(self, path = r_path, batch_size = 2048, step = 100):
        self.path = path
        self.batch_size = batch_size
        self.step = step
        self.n_users, self.n_items = 0, 0
        self.train_user_list = collections.defaultdict(list)
        self.train_item_list = collections.defaultdict(list)
        
        self.test_user_list = collections.defaultdict(list)
        self.test_item_list = collections.defaultdict(list)
        
        self.load_train_data()
        self.load_test_data()
        
        self.n_users = self.n_users + 1
        self.n_items = self.n_items + 1
        self.users = list(range(self.n_users))
        self.items = list(range(self.n_items))

        self.length = self.batch_size * self.step
        
    def load_train_data(self):          
        train_file = self.path + 'train.txt'
        with open(train_file) as f:
            for line in f.readlines():
                line = line.strip('\n').split(' ')
                if len(line) == 0:
                    continue
                line = [int(i) for i in line]
                user = line[0]
                items = line[1:]
                if (len(items)==0):
                    continue
                self.train_user_list[user] = items
                for item in items:
                    self.train_item_list[item].append(user)
                self.n_users = max(self.n_users, user)
                self.n_items = max(self.n_items, max(items))
        
    def load_test_data(self):
        test_file = self.path + 'test.txt'
        with open(test_file) as f:
            for line in f.readlines():
                line = line.strip('\n').split(' ')
                if len(line) == 0:
                    continue
                line = [int(i) for i in line]
                user = line[0]
                items = line[1:]
                if (len(items)==0):
                    continue
                self.test_user_list[user] = items
                for item in items:
                    self.test_item_list[item].append(user)
                self.n_users = max(self.n_users, user)
                self.n_items = max(self.n_items, max(items))
        
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        neg_item = None
        
        user = random.choice(self.users)   

        pos_items = list(self.train_user_list[user])
        pos_item = random.choice(pos_items)
        
        while neg_item is None or neg_item in pos_items:
            neg_item = random.choice(self.items)
            
        return user, pos_item, neg_item

In [9]:
class evaluation():
    def __init__(self, data, Ks, batch_size):
        self.data = data
        self.Ks = Ks
        self.batch_size = batch_size

    def test_one_batch(self, model, batch_user):
        batch_user = batch_user.to(device)
        batch_rec = model.do_recommendation(batch_user)
        batch_rec = torch.sigmoid(batch_rec)
        mask = torch.ones_like(batch_rec)
        for i in range(len(batch_user)):
            mask[i].scatter_(dim = 0, index=torch.tensor(list(self.data.train_user_list[batch_user[i].item()])).to(device), value=torch.tensor(0.0).to(device))
        batch_rec = torch.mul(mask, batch_rec)
        _, batch_rec = torch.sort(batch_rec, descending=True)
        batch_rec = batch_rec.cpu().numpy()
        result = {'precision': np.zeros(len(self.Ks)), 'recall': np.zeros(len(self.Ks)), 'ndcg': np.zeros(len(self.Ks)),
                  'hit_ratio': np.zeros(len(self.Ks))}
        for i in range(len(batch_user)):
            u = batch_user[i].item()
            r = batch_rec[i]
            u_target = self.data.test_user_list[u]
            one_user_result = get_performance(u_target, r, self.Ks)
            result['precision'] += one_user_result['precision']
            result['recall'] += one_user_result['recall']
            result['ndcg'] += one_user_result['ndcg']
            result['hit_ratio'] += one_user_result['hit_ratio']
        return result

    def eval(self, model):
        result = {'precision': np.zeros(len(self.Ks)), 'recall': np.zeros(len(self.Ks)), 'ndcg': np.zeros(len(self.Ks)),
                  'hit_ratio': np.zeros(len(self.Ks))}
        all_users = list(self.data.test_user_list.keys())
        tot_users = len(all_users)
        for i in range(0, tot_users, self.batch_size):
            end_idx = min(i + self.batch_size, tot_users)
            batch_user = torch.tensor(all_users[i:end_idx], dtype=torch.long)
            res = self.test_one_batch(model, batch_user)
            
            result['precision'] += res['precision']/tot_users
            result['recall'] += res['recall']/tot_users
            result['ndcg'] += res['ndcg']/tot_users
            result['hit_ratio'] += res['hit_ratio']/tot_users
        return result

In [10]:
class BPRMF(nn.Module):
    def __init__(self, n_users, n_items, emb_size, weight_decay):
        super(BPRMF, self).__init__()
        self.n_users = data.n_users
        self.n_items = data.n_items

        self.decay = weight_decay
        self.emb_dim = emb_size

        self.user_embedding = nn.Embedding(self.n_users, self.emb_dim)
        self.item_embedding = nn.Embedding(self.n_items, self.emb_dim) 
        
        self.user_embedding.weight.data.uniform_(0, 0.005)  # 0-0.005之间均匀分布
        self.user_embedding.weight.data.uniform_(0, 0.005)
    
    def forward(self, users, pos_items, neg_items):
        user_embedding = self.user_embedding(users)
        pos_item_embedding = self.item_embedding(pos_items)
        neg_item_embedding = self.item_embedding(neg_items)

        pos_scores = torch.sum(user_embedding * pos_item_embedding, dim=1)
        neg_scores = torch.sum(user_embedding * neg_item_embedding, dim=1)

        bpr_loss = -torch.mean(torch.log(torch.sigmoid(pos_scores - neg_scores)))
        
        regularizer = torch.norm(user_embedding, p=2) + torch.norm(pos_item_embedding, p=2) + torch.norm(neg_item_embedding, p=2)
        reg_loss = self.decay * regularizer
        return bpr_loss + reg_loss
    
    def do_recommendation(self, users):
        user_emb = self.user_embedding(users)
        item_embs = self.item_embedding.weight
        scores = torch.mm(user_emb, item_embs.t())
        return scores

In [11]:
data = Data(batch_size = 2048, step = 500)

In [12]:
dataloader = torch.utils.data.DataLoader(data, batch_size = 2048, shuffle = False, num_workers = 6)

In [13]:
evaluator = evaluation(data, [20], 2048)

In [14]:
model = BPRMF(data.n_users, data.n_items, emb_size = 32, weight_decay = 1e-04).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [15]:
start = time.perf_counter()

EPOCH = 200
for epoch in range(EPOCH):
    t_loss = 0.0
    for idx, (users, pos_items, neg_items) in enumerate(dataloader):
        users = users.to(device)
        pos_items = pos_items.to(device)
        neg_items = neg_items.to(device)
        optimizer.zero_grad()
        loss = model(users, pos_items, neg_items)
        loss.backward()
        optimizer.step()
        t_loss += loss.item()
        
    if (1 + epoch) % 5 == 0 and (1 + epoch) >= 100 :
        res = evaluator.eval(model)
        print(f'epoch: {epoch + 1}, loss: {t_loss}')
        print(res)
        end = time.perf_counter()
        print('Running time: %s Seconds\n' % (end - start))
        start = time.perf_counter()

epoch: 100, loss: 61.90420763194561
{'precision': array([0.02946663]), 'recall': array([0.02403053]), 'ndcg': array([0.03601531]), 'hit_ratio': array([0.26511832])}
Running time: 432.1645838320255 Seconds

epoch: 105, loss: 60.69356349110603
{'precision': array([0.02989233]), 'recall': array([0.02439008]), 'ndcg': array([0.03644326]), 'hit_ratio': array([0.26524352])}
Running time: 37.038741022348404 Seconds

epoch: 110, loss: 59.539621494710445
{'precision': array([0.03024289]), 'recall': array([0.02497485]), 'ndcg': array([0.0371705]), 'hit_ratio': array([0.26868662])}
Running time: 37.336487002670765 Seconds

epoch: 115, loss: 58.594766072928905
{'precision': array([0.03037436]), 'recall': array([0.0250056]), 'ndcg': array([0.03787942]), 'hit_ratio': array([0.27206711])}
Running time: 37.454474210739136 Seconds

epoch: 120, loss: 57.82446390390396
{'precision': array([0.02995493]), 'recall': array([0.0240781]), 'ndcg': array([0.03624025]), 'hit_ratio': array([0.26524352])}
Running t