In [1]:
import numpy as np
import pandas as pd
import random as rd
import collections
import time
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from used_metric import get_performance

# Root directory of the dataset files. It is joined to file names by plain
# string concatenation (e.g. r_path + "item_pop_seq_ori2.txt"), so the
# trailing slash is required.
r_path = './douban/'

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
class Data(Dataset):
    """Random sampler of (user, positive item, negative item) training triples.

    The dataset reads two files located under ``path`` (joined by string
    concatenation, so ``path`` must end with a separator):

    * ``train_with_time.txt`` - space separated, four columns that are named
      ``uid iid time stars`` after loading; only the first three are used.
    * ``test.txt`` - one line per user: the user id followed by that user's
      held-out item ids.

    ``__getitem__`` ignores its index and draws a fresh random triple on every
    call, so one "epoch" is simply ``batch_size * step`` random draws.

    Attributes:
        n_users, n_items: largest user / item id seen in either file, plus one.
        users, items: ``list(range(n_users))`` and ``list(range(n_items))``.
        train_user_list: user id -> list of training item ids.
        train_user_list_time: user id -> list of time slots, aligned
            position-by-position with ``train_user_list``.
        train_item_list: item id -> list of user ids from the training file.
        test_user_list / test_item_list: the same mappings for the test file.
        train_users: user ids that have at least one training interaction;
            these are the only users ``__getitem__`` samples.
        unique_times: distinct values of the ``time`` column in the train file.
        expo_popularity: popularity table installed by
            ``add_expo_popularity``; ``None`` until then.
    """

    def __init__(self, path=r_path, batch_size = 2048, step = 100):
        """Load both files and prepare the sampling structures.

        Args:
            path: directory prefix of the data files.
            batch_size: number of samples per batch; only used to derive
                ``len(self)``.
            step: number of batches per epoch; only used to derive
                ``len(self)``.
        """
        self.path = path
        self.batch_size = batch_size
        self.step = step
        self.n_users, self.n_items = 0, 0
        self.train_user_list = collections.defaultdict(list)
        self.train_user_list_time = collections.defaultdict(list)
        self.train_item_list = collections.defaultdict(list)

        self.test_user_list = collections.defaultdict(list)
        self.test_item_list = collections.defaultdict(list)

        # Installed later through add_expo_popularity(); __getitem__ refuses
        # to run while this is still None.
        self.expo_popularity = None

        self.load_train_data()
        self.load_test_data()

        # Ids are used directly as indices, so the counts are (max id + 1).
        self.n_users = self.n_users + 1
        self.n_items = self.n_items + 1
        self.users = list(range(self.n_users))
        self.items = list(range(self.n_items))

        # A user that only occurs in the test file has no positive item to
        # draw, so sampling is restricted to users present in the train file.
        self.train_users = [
            int(u) for u, items in self.train_user_list.items() if len(items) > 0
        ]
        # Set view of each user's training items for O(1) negative rejection.
        self._train_item_sets = {
            u: set(self.train_user_list[u]) for u in self.train_users
        }

        self.length = self.batch_size * self.step

    def add_expo_popularity(self,popularity):
        """Install the popularity table that ``__getitem__`` indexes as ``[item, time]``."""
        self.expo_popularity = popularity

    def load_train_data(self):
        """Read ``train_with_time.txt`` and build the per-user / per-item training maps.

        Raises:
            RuntimeWarning: raised (as an exception) when the file contains
                fewer than two distinct time slots.
        """
        train_file = self.path + 'train_with_time.txt'
        train_data = pd.read_csv(train_file, header=None, sep=' ')
        train_data.columns = ['uid','iid','time','stars']
        train_data = train_data[['uid','iid','time']]
        unique_time = train_data['time'].unique()
        print("time slot unique in train:",unique_time)
        self.unique_times = list(unique_time)
        if unique_time.shape[0] < 2:
            raise RuntimeWarning("there only one time slot for train...., this may cause our method not work")

        # Convert the whole frame at once: this returns a new frame instead of
        # assigning column-by-column into a slice of the original one.
        train_data = train_data.astype(int)
        user_item_time = train_data.groupby('uid')[['iid','time']].agg(list)
        self.train_user_list = dict(zip(user_item_time.index,user_item_time['iid']))
        self.train_user_list_time = dict(zip(user_item_time.index,user_item_time['time']))
        item_user = train_data.groupby('iid')[['uid','time']].agg(list)
        print(item_user.head(2))
        self.train_item_list = dict(zip(item_user.index,item_user['uid']))

        # Store plain Python ints rather than numpy scalars.
        self.n_users = max(self.n_users, int(train_data['uid'].max()))
        self.n_items = max(self.n_items, int(train_data['iid'].max()))

    def load_test_data(self):
        """Read ``test.txt`` and build the per-user / per-item test maps.

        Blank lines and lines that list a user without any item are skipped.
        """
        test_file = self.path + 'test.txt'
        with open(test_file) as f:
            for line in f:
                # split() with no argument drops the newline and collapses
                # repeated whitespace; a blank line yields an empty list
                # (split(' ') would yield [''] and crash in int('')).
                fields = line.split()
                if len(fields) < 2:
                    continue
                user = int(fields[0])
                items = [int(i) for i in fields[1:]]
                self.test_user_list[user] = items
                for item in items:
                    self.test_item_list[item].append(user)
                self.n_users = max(self.n_users, user)
                self.n_items = max(self.n_items, max(items))

    def __len__(self):
        """Number of random draws that make up one epoch (``batch_size * step``)."""
        return self.length

    def __getitem__(self, idx):
        """Draw one random training triple; ``idx`` is ignored.

        Returns:
            ``(user, pos_item, neg_item, pos_pop, neg_pop)`` where both
            popularity values are read from ``expo_popularity`` at the time
            slot of the sampled positive interaction.

        Raises:
            RuntimeError: if ``add_expo_popularity`` has not been called yet.
        """
        if self.expo_popularity is None:
            raise RuntimeError("call add_expo_popularity() before sampling from Data")

        user = random.choice(self.train_users)

        pos_items = self.train_user_list[user]
        clicked_times = self.train_user_list_time[user]

        # Pick one interaction; item and time share the same position.
        random_index = rd.randint(0,len(pos_items) - 1)
        pos_item = pos_items[random_index]
        pos_time = clicked_times[random_index]

        # Rejection-sample an item the user has not interacted with in
        # training. NOTE: this does not terminate for a user who interacted
        # with every item.
        seen = self._train_item_sets[user]
        neg_item = rd.choice(self.items)
        while neg_item in seen:
            neg_item = rd.choice(self.items)

        pos_pop = self.expo_popularity[pos_item,pos_time]
        neg_pop = self.expo_popularity[neg_item,pos_time]

        return user, pos_item, neg_item, pos_pop, neg_pop

In [4]:
class evaluation():
    """Batch-wise ranking evaluation of a recommender on the test users.

    Args:
        data: object exposing ``train_user_list`` and ``test_user_list``
            (user id -> list of item ids).
        Ks: forwarded unchanged to ``get_performance``; every metric array has
            ``len(Ks)`` entries.
        batch_size: number of test users scored per forward pass.
    """

    # Keys of the metric dictionaries produced by this class.
    _METRICS = ('precision', 'recall', 'ndcg', 'hit_ratio')

    def __init__(self, data, Ks, batch_size):
        self.data = data
        self.Ks = Ks
        self.batch_size = batch_size

    def _empty_result(self):
        """Return a fresh accumulator with one zero vector per metric."""
        return {name: np.zeros(len(self.Ks)) for name in self._METRICS}

    def test_one_batch(self, model, batch_user):
        """Score one batch of users and return the *summed* metrics.

        Args:
            model: object with ``do_recommendation(users)`` returning one score
                per (user, item) pair.
            batch_user: 1-D long tensor of user ids.

        Returns:
            dict of metric name -> array of length ``len(Ks)``, summed over
            the users of the batch (not averaged).
        """
        batch_user = batch_user.to(device)
        # Evaluation needs no gradients; without no_grad() an autograd graph
        # would be recorded for the whole (users x items) score matrix.
        with torch.no_grad():
            batch_rec = torch.sigmoid(model.do_recommendation(batch_user))
            for row, u in enumerate(batch_user.tolist()):
                # .get(): a test user may have no training interactions.
                seen = self.data.train_user_list.get(u, ())
                if len(seen) == 0:
                    continue
                seen_idx = torch.as_tensor(list(seen), dtype=torch.long, device=batch_rec.device)
                # Sigmoid outputs are positive, so zeroing the training items
                # pushes them to the bottom of the descending sort.
                batch_rec[row, seen_idx] = 0.0
            _, batch_rec = torch.sort(batch_rec, descending=True)
        batch_rec = batch_rec.cpu().numpy()

        result = self._empty_result()
        for i in range(len(batch_user)):
            u = batch_user[i].item()
            r = batch_rec[i]
            u_target = self.data.test_user_list[u]
            one_user_result = get_performance(u_target, r, self.Ks)
            for name in self._METRICS:
                result[name] += one_user_result[name]
        return result

    def eval(self, model):
        """Return each metric averaged over all users in ``data.test_user_list``."""
        result = self._empty_result()
        all_users = list(self.data.test_user_list.keys())
        tot_users = len(all_users)
        for i in range(0, tot_users, self.batch_size):
            end_idx = min(i + self.batch_size, tot_users)
            batch_user = torch.tensor(all_users[i:end_idx], dtype=torch.long)
            res = self.test_one_batch(model, batch_user)

            # Batch sums divided by the total user count add up to the mean.
            for name in self._METRICS:
                result[name] += res[name]/tot_users
        return result

In [5]:
def load_popularity():
    """Load the per-item popularity table from ``r_path + "item_pop_seq_ori2.txt"``.

    Every non-blank line is whitespace separated: the first field is an item
    id and the remaining fields are floats. Blank lines are skipped.

    Returns:
        numpy array with one row per non-blank line, in file order, holding
        the float fields of that line.
    """
    pop_save_path = r_path + "item_pop_seq_ori2.txt"
    print("popularity used:",pop_save_path)
    pop_item_all = []
    with open(pop_save_path) as f:
        print("pop save path: ", pop_save_path)
        for line in f:
            fields = line.split()
            if not fields:
                # A blank (e.g. trailing) line has no item id to read.
                continue
            # NOTE(review): rows are kept in file order and the id in
            # fields[0] is not used to place them. Row i is later indexed by
            # item id, so this presumably relies on the file being sorted by
            # a contiguous 0-based item id - confirm.
            pop_item_all.append([float(x) for x in fields[1:]])
    pop_item_all = np.array(pop_item_all)
    print("pop_item_all shape:", pop_item_all.shape)
    print("load pop information:",pop_item_all.mean(),pop_item_all.max(),pop_item_all.min())
    return pop_item_all

In [6]:
# Raw popularity table: one row per line of the popularity file.
pop_item_all = load_popularity()
# Exponent applied element-wise to the popularity values.
popularity_exp = 0.02
# Second-to-last column of the raw table (name kept as originally spelled).
last_stage_popualarity = pop_item_all[:, -2]
# Drop the final column, then raise every remaining value to popularity_exp.
popularity_matrix = np.power(pop_item_all[:, :-1], popularity_exp)

popularity used: ./autodl-tmp/douban_movie/popularity_distribution.txt
pop save path:  ./autodl-tmp/douban_movie/popularity_distribution.txt
pop_item_all shape: (26047, 10)
load pop information: 0.0037181866982990796 1.0 0.0


In [28]:
class ConditionalBPRMF(nn.Module):
    """Matrix-factorisation recommender trained with a popularity-scaled BPR loss.

    During training, the (ELU + 1)-transformed dot-product score of each
    user/item pair is multiplied by a popularity value supplied by the caller.
    ``do_recommendation`` returns the raw dot products without that factor.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        emb_size: embedding dimension.
        weight_decay: coefficient of the L2-norm regulariser added to the loss.
    """

    def __init__(self, n_users, n_items, emb_size, weight_decay):
        super(ConditionalBPRMF, self).__init__()
        # Size the tables from the constructor arguments, not from a
        # notebook-global `data` object.
        self.n_users = int(n_users)
        self.n_items = int(n_items)

        self.decay = weight_decay
        self.emb_dim = emb_size

        self.user_embedding = nn.Embedding(self.n_users, self.emb_dim)
        self.item_embedding = nn.Embedding(self.n_items, self.emb_dim)

        # Initialise both tables uniformly in [0, 0.005).
        nn.init.uniform_(self.user_embedding.weight, 0, 0.005)
        nn.init.uniform_(self.item_embedding.weight, 0, 0.005)

    def forward(self, users, pos_items, neg_items, pos_pop, neg_pop):
        """Return the scalar training loss for a batch of triples.

        Args:
            users, pos_items, neg_items: 1-D index tensors of equal length.
            pos_pop, neg_pop: popularity multipliers, one per triple, applied
                to the positive and negative scores respectively.

        Returns:
            BPR loss on the popularity-scaled scores plus ``weight_decay``
            times the sum of the L2 norms of the three embedding batches.
        """
        user_embedding = self.user_embedding(users)
        pos_item_embedding = self.item_embedding(pos_items)
        neg_item_embedding = self.item_embedding(neg_items)

        pos_scores = torch.sum(user_embedding * pos_item_embedding, dim=1)
        neg_scores = torch.sum(user_embedding * neg_item_embedding, dim=1)

        # ELU + 1 keeps the scores positive before they are scaled.
        pos_scores = torch.nn.functional.elu(pos_scores) + 1
        neg_scores = torch.nn.functional.elu(neg_scores) + 1
        pos_scores_with_pop = pos_scores * pos_pop
        neg_scores_with_pop = neg_scores * neg_pop

        # logsigmoid is the numerically stable form of log(sigmoid(x)); the
        # two-step version returns -inf once sigmoid underflows to 0.
        bpr_loss = -torch.mean(torch.nn.functional.logsigmoid(pos_scores_with_pop - neg_scores_with_pop))

        regularizer = torch.norm(user_embedding, p=2) + torch.norm(pos_item_embedding, p=2) + torch.norm(neg_item_embedding, p=2)
        reg_loss = self.decay * regularizer
        return bpr_loss + reg_loss

    def do_recommendation(self, users):
        """Return the (len(users), n_items) matrix of user-item dot products."""
        user_emb = self.user_embedding(users)
        item_embs = self.item_embedding.weight
        scores = torch.mm(user_emb, item_embs.t())
        return scores

In [29]:
# Build the sampler from the default r_path. len(data) is batch_size * step,
# i.e. 2048 * 500 random draws per pass over the dataset.
data = Data(batch_size = 2048, step = 500)

time slot unique in train: [0 1 2 5 3 4 6 7 8]
                                                   uid  \
iid                                                      
0    [0, 3, 6, 10, 11, 16, 20, 22, 25, 28, 30, 32, ...   
1    [0, 1, 40, 60, 80, 83, 91, 106, 108, 114, 124,...   

                                                  time  
iid                                                     
0    [0, 0, 8, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...  
1    [0, 7, 0, 0, 7, 4, 7, 6, 1, 7, 1, 3, 2, 0, 0, ...  


In [30]:
# Attach the exponent-scaled popularity table; Data.__getitem__ indexes it as
# [item, time] for both the positive and the negative item.
data.add_expo_popularity(popularity_matrix)

In [31]:
# Data.__getitem__ ignores its index and samples at random, so shuffle=False
# loses nothing. The batch size is taken from the dataset itself because
# len(data) is defined as batch_size * step: keeping the two tied guarantees
# exactly `step` full batches per epoch.
dataloader = torch.utils.data.DataLoader(data, batch_size=data.batch_size, shuffle=False, num_workers = 6)

In [32]:
# Ks=[20] is forwarded to get_performance (presumably the top-K cut-offs -
# confirm in used_metric); test users are scored 2048 at a time.
evaluator = evaluation(data,[20], 2048)

In [33]:
# Embedding tables are sized from the dataset; the module is then moved to the
# selected device before the optimizer collects its parameters.
model = ConditionalBPRMF(
    n_users=data.n_users,
    n_items=data.n_items,
    emb_size=32,
    weight_decay=1e-04,
)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [34]:
start = time.perf_counter()

EPOCH = 400
for epoch in range(EPOCH):
    # Running sum of the per-batch losses of this epoch.
    t_loss = 0.0
    for idx, batch in enumerate(dataloader):
        # Move the five batch tensors to the training device together.
        users, pos_items, neg_items, pos_pop, neg_pop = [t.to(device) for t in batch]

        optimizer.zero_grad()
        loss = model(users, pos_items, neg_items, pos_pop, neg_pop)
        loss.backward()
        optimizer.step()
        t_loss += loss.item()

    # Report only on every 20th epoch, starting with epoch 100.
    finished = epoch + 1
    if finished < 100 or finished % 20 != 0:
        continue

    res = evaluator.eval(model)
    print(f'epoch: {epoch+1}, loss: {t_loss}')
    print(res)
    end = time.perf_counter()
    print('Running time: %s Seconds\n' % (end - start))
    # Restart the clock so each report covers only the time since the last one.
    start = time.perf_counter()

epoch: 100, loss: 53.28260979806472
{'precision': array([0.04032177]), 'recall': array([0.03914023]), 'ndcg': array([0.05334784]), 'hit_ratio': array([0.36083636])}
Running time: 547.5436552911997 Seconds

epoch: 120, loss: 50.652926401691566
{'precision': array([0.03970828]), 'recall': array([0.03792956]), 'ndcg': array([0.05035846]), 'hit_ratio': array([0.35582822])}
Running time: 124.95018323510885 Seconds

epoch: 140, loss: 48.54335732552048
{'precision': array([0.04145799]), 'recall': array([0.04020134]), 'ndcg': array([0.05384049]), 'hit_ratio': array([0.3659071])}
Running time: 125.34669161587954 Seconds

epoch: 160, loss: 47.066732179190716
{'precision': array([0.04216539]), 'recall': array([0.04059176]), 'ndcg': array([0.055362]), 'hit_ratio': array([0.36797296])}
Running time: 119.3810528293252 Seconds

epoch: 180, loss: 45.8281084993628
{'precision': array([0.04208714]), 'recall': array([0.0413063]), 'ndcg': array([0.05463329]), 'hit_ratio': array([0.37104044])}
Running time

In [None]:
# Persist only the learned parameters (the state_dict), written to 'PD.pth'
# in the current working directory.
torch.save(model.state_dict(), 'PD.pth')