In [13]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data as Data
import pandas as pd
import numpy as np
from collections import defaultdict
import math

In [9]:
train_df = pd.read_csv("~/Thesis/Data/train.csv")
ratings = pd.read_csv('~/Data/clean_rating4.csv').drop(["id"], axis = 1)

idx_to_animes = list(set(ratings['anime_id'].tolist()))
idx_to_users = list(set(ratings['user_id'].tolist()))
anime_to_idx = {anime: idx for idx, anime in enumerate(idx_to_animes)}
user_to_idx = {user: idx for idx, user in enumerate(idx_to_users)}
num_users, num_animes = len(idx_to_users), len(idx_to_animes)

In [3]:
train_data_raw = train_df.values.tolist()
train_users = list(set(train_df.user))
user_item_dic = defaultdict(list)

train_data = []
for d in train_data_raw:
    train_data.append([user_to_idx[d[0]], anime_to_idx[d[1]]])
    user_item_dic[user_to_idx[d[0]]].append(anime_to_idx[d[1]])

In [4]:
class BPRDataset(Data.Dataset):
    def __init__(self, data, num_item, num_ng, dic):
        super(BPRDataset, self).__init__()
        self.data = data
        self.num_ng = num_ng
        self.dic = dic
        self.num_item = num_item
        
    def select_ng(self):
        self.new_data = []
        for idx, d in enumerate(self.data):
            if not idx % 100000: print(idx)
            ng_num = 0
            while ng_num < self.num_ng:
                item = np.random.randint(self.num_item)   
                if item not in self.dic[d[0]]:
                    self.new_data.append([d[0], d[1], item])
                    ng_num += 1
                    
    def __len__(self):
        return self.num_ng * len(self.data)
    
    def __getitem__(self, idx):
        user = self.new_data[idx][0]
        item_i = self.new_data[idx][1]
        item_j = self.new_data[idx][2]
        
        return user, item_i, item_j

In [2]:
class BPR(nn.Module):
    def __init__(self, num_users, num_animes, num_hidden):
        super(BPR, self).__init__()
        self.user_embed = nn.Embedding(num_users, num_hidden)
        self.anime_embed = nn.Embedding(num_animes, num_hidden)
        
        nn.init.normal_(self.user_embed.weight, std = 0.01)
        nn.init.normal_(self.anime_embed.weight, std = 0.01)
        
    def forward(self, user, anime_i, anime_j):
        point_i = torch.mm(self.user_embed(user), self.anime_embed(anime_i).permute(1, 0)).sum(dim = -1)
        point_j = torch.mm(self.user_embed(user), self.anime_embed(anime_j).permute(1, 0)).sum(dim = -1)
        
        return point_i, point_j

In [6]:
batch_size = 10000
train_dataset = BPRDataset(train_data, num_animes, 5, user_item_dic)
train_dataset.select_ng()
data_iter = Data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
for x, y, z in data_iter:
    print(x.shape, y.shape, z.shape)
    break

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
torch.Size([10000]) torch.Size([10000]) torch.Size([10000])


In [7]:
def train(net, lr, num_epochs):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("train on ", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr = lr)
    for epoch in range(num_epochs):
        l_sum, n = 0, 0
        for user, item_i, item_j in data_iter:
            user = user.to(device)
            item_i = item_i.to(device)
            item_j = item_j.to(device)
            
            point_i, point_j = net(user, item_i, item_j)
#             print(point_i.shape, point_j.shape)
            loss = - (point_i - point_j).sigmoid().log().sum()
            for name, v in net.named_parameters():
                loss += torch.mm(torch.t(v), v).sum()
#             print(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            l_sum += loss.item()
            n += 1
            
        print(epoch + 1, l_sum / n)

In [8]:
bpr_net = BPR(num_users, num_animes, 100)
train(bpr_net, 0.001, 30)

train on  cuda
1 30.648999848172195
2 0.0007467528257032308
3 0.000663203525019216
4 0.0007771549425718985
5 0.002070564635652908
6 0.00456850003264617
7 0.012098252700507567
8 0.029975381198820178
9 0.062191725159302734
10 0.10858001306161776
11 0.17155738844466878
12 0.21775755772005506
13 0.27114282481108537
14 0.3023840341721553
15 0.3083470133525548
16 0.31004916497202584
17 0.2990703439210361
18 0.2916379367275407
19 0.27460222861414574
20 0.2738083366745849
21 0.2924947790501319
22 0.2946966233343723
23 0.2866062854163085
24 0.2890528145818192
25 0.2763640492863416
26 0.2603687576321892
27 0.2843275551612561
28 0.28042879442301133
29 0.26883271361117833
30 0.26601723223124085


In [9]:
torch.save(bpr_net.state_dict(), "BPR2.pt")

In [7]:
test_df = pd.read_csv("~/Thesis/Data/test.csv")
users = list(set(test_df.user))

In [3]:
bpr_net = BPR(4701, 9775, 100).cuda()
bpr_net.load_state_dict(torch.load("BPR2.pt"))

<All keys matched successfully>

In [4]:
def ndcg(k, ranklist, testlist):
    if not testlist: return 0
    idcg_k, dcg_k = 0, 0
    if len(testlist) < k:
        k = len(testlist)
    for i in range(k):
        idcg_k += 1 / math.log(i + 2, 2)
        
    s = set(testlist)
    hits = [idx for idx, val in enumerate(ranklist) if val in s]
    count = len(hits)
    
    for i in range(count):
        dcg_k += 1 / math.log(hits[i] + 2, 2)
        
    return float(dcg_k / idcg_k)

In [11]:
def metric(net, bound):
    precision = []
    for user in users:
        animes = test_df[test_df.user == user].anime.tolist()
        user_input = torch.LongTensor([user_to_idx[user] for _ in range(len(animes))]).cuda()
        anime_input = torch.LongTensor([anime_to_idx[i] for i in animes]).cuda()
        
#         print(user_input, anime_input)
        point_i, point_j = net(user_input, anime_input, anime_input)
#         print(point_i.shape)
        _, idx = torch.topk(point_i, 5)
        
        target = test_df[(test_df.user == user) & (test_df.rating > bound)].anime.tolist()
        idx = [animes[i] for i in idx]
        
#         precision.append(len(set(idx) & set(target)) / len(idx))
        precision.append(ndcg(10, idx, target))
        
    return np.mean(precision)

In [14]:
result = []
for k in range(9, 10):
    result.append(metric(bpr_net, k))
    print(result[-1])

0.0026816075910204686


In [9]:
print(result)

[1.0, 0.9069559668155712, 0.7984258668368431, 0.6593065305254201, 0.44981918740693466, 0.2057009146990002, 0.05207402680280791, 0.011231652839821315, 0.005360561582641991, 0.0043395022335673255]
