In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.utils.data as Data
import random

In [2]:
ratings = pd.read_csv('~/Data/clean_rating3.csv')

idx_to_animes = list(set(ratings['anime_id'].tolist()))
idx_to_users = list(set(ratings['user_id'].tolist()))
anime_to_idx = {anime: idx for idx, anime in enumerate(idx_to_animes)}
user_to_idx = {user: idx for idx, user in enumerate(idx_to_users)}
num_users, num_animes = len(idx_to_users), len(idx_to_animes)

train_ratio = 0.9
# all_users = list(set(ratings.user_id))
# train_sample = random.sample(all_users, int(num_users * train_ratio))
# train_data = ratings[ratings.user_id.isin(train_sample)]
# test_data = list(set(all_users) - set(train_sample))
# print(train_data.shape)

data_ps = np.array(ratings.values.tolist())[:, :-1]
train_data = data_ps[:int(train_ratio * len(data_ps))]
test_data = data_ps[int(train_ratio * len(data_ps)):]
print(train_data.shape, test_data.shape)

from collections import defaultdict
user_item_dic, data_ps_list = defaultdict(list), []
for d in train_data:
    user_item_dic[user_to_idx[d[0]]].append(anime_to_idx[d[1]])
    data_ps_list.append([user_to_idx[d[0]], anime_to_idx[d[1]]])
    
user_item_dic, data_ps = dict(user_item_dic), data_ps_list

(2084859, 2) (231651, 2)


In [3]:
def get_hot_items():
    animes = pd.read_csv("~/Data/anime.csv")
    animes = animes[animes["anime_id"].isin(idx_to_animes)].loc[:, ["anime_id", "rating", "members"]].fillna(0)

    scalar = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    animes["rating_norm"] = animes[["rating"]].apply(scalar)
    animes["members_norm"] = animes[["members"]].apply(scalar)
    animes["weight"] = 0.6 * animes["rating_norm"] + 0.4 * animes["members_norm"]
    animes = animes.sort_values(by = "weight", ascending = False)
    
    return [anime_to_idx[i] for i in animes.anime_id.tolist()]

In [4]:
class NCFDataset(Data.Dataset):
    def __init__(self, data_ps, users, num_items, hot_items, dic, num_ng = 100):
        super(NCFDataset, self).__init__()
        self.data_ps = data_ps
#         self.train_sample = train_sample
        self.users = users
        self.num_items = num_items
        self.num_ng = num_ng
        self.hot_items = hot_items
        self.dic = dic

    def select_ng(self):
        self.data_ng, num = [], 0
        for u in self.users:
            for item in self.hot_items:
                if item not in self.dic[u]:
                    self.data_ng.append([u, item])
                    num += 1
                if num == self.num_ng: break
            
            num = 0
                
        print(len(self.data_ng))
        self.label_ps = [1 for i in range(len(self.data_ps))]
        self.label_ng = [0 for i in range(len(self.data_ng))]
        self.data = self.data_ps + self.data_ng
        self.label = self.label_ps + self.label_ng

    def __getitem__(self, idx):
        user = self.data[idx][0]
        item = self.data[idx][1]
        label = self.label[idx]

        return user, item, label

    def __len__(self):
        return self.num_ng * len(self.users) + len(self.data_ps)

In [7]:
def train_data_iter(train_data, batch_size = 5000):
    user_inputs, item_inputs, labels = [], [], []
    for u in range(len(train_data)):
        for i in range(len(train_data[0])):
            user_inputs.append(u)
            item_inputs.append(i)
            labels.append(train_data[u, i])
            
            if len(user_inputs) == batch_size:
                yield torch.LongTensor(user_inputs), torch.LongTensor(item_inputs), torch.LongTensor(labels)
            user_inputs, item_inputs, labels = [], [], []

In [8]:
def test_data_iter(test_data):
    ng_items, ps_items = [], []
    for u in range(len(test_data)):
        for i in range(len(test_data[0])):
            if test_data[u, i] == 0:
                ng_items.append(i)
            else:
                ps_items.append(i)

        yield torch.LongTensor([u for _ in range(len(ng_items))]), torch.LongTensor(ng_items), torch.LongTensor(ps_items)
        ng_items, ps_items = [], []

In [5]:
batch_size = 1024
users = [user_to_idx[i] for i in list(set(train_data[:, 0]))]
dataset = NCFDataset(data_ps, users, num_animes, get_hot_items(), user_item_dic, num_ng = 100)
dataset.select_ng()
data_iter = Data.DataLoader(dataset, batch_size = batch_size, shuffle = True)

424400


In [6]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, model, factor_num = 8, num_layers = 3,
               MLP_model = None, GMF_model = None, alpha = 0.5, dropout = 0.5):
        super(NCF, self).__init__()
        self.MLP_model = MLP_model
        self.GMF_model = GMF_model
        self.alpha = alpha
        self.dropout = dropout
        self.user_embed_GMF = nn.Embedding(num_users, factor_num)
        self.item_embed_GMF = nn.Embedding(num_items, factor_num)
        self.user_embed_MLP = nn.Embedding(num_users, factor_num * (2 ** (num_layers - 1)))
        self.item_embed_MLP = nn.Embedding(num_items, factor_num * (2 ** (num_layers - 1)))

        self.MLP = nn.Sequential(
            nn.Dropout(p = self.dropout),
            nn.Linear(factor_num * (2 ** num_layers), factor_num * (2 ** (num_layers - 1))),
            nn.ReLU()
        )
        for layer in range(num_layers - 1, 0, -1):
            self.MLP.add_module('dropout' + str(num_layers - layer), nn.Dropout(p = self.dropout))
            self.MLP.add_module('linear' + str(num_layers - layer), nn.Linear(factor_num * (2 ** layer), factor_num * (2 ** (layer - 1))))
            self.MLP.add_module('relu' + str(num_layers - layer), nn.ReLU())

        self.model = model
        if self.model in ['GMF', 'MLP']:
            self.NeuMF = nn.Linear(factor_num, 1)
        else:
            self.NeuMF = nn.Linear(2 * factor_num, 1)

        self.__init_weights__()

    def __init_weights__(self):
        if self.model in ['GMF', 'MLP']:
            nn.init.normal_(self.user_embed_GMF.weight, std = 0.01)
            nn.init.normal_(self.item_embed_GMF.weight, std = 0.01)
            nn.init.normal_(self.user_embed_MLP.weight, std = 0.01)
            nn.init.normal_(self.item_embed_MLP.weight, std = 0.01)

            for layer in self.MLP:
                if isinstance(layer, nn.Linear):
                    nn.init.xavier_uniform_(layer.weight)
#                   nn.init.normal_(layer.weight, std = 0.01)

            nn.init.kaiming_uniform_(self.NeuMF.weight, a = 1, nonlinearity = 'sigmoid')
#             nn.init.normal_(self.NeuMF.weight, std = 0.01)

        elif self.GMF_model and self.MLP_model:
            self.user_embed_GMF.weight.data.copy_(self.GMF_model.user_embed_GMF.weight)
            self.item_embed_GMF.weight.data.copy_(self.GMF_model.item_embed_GMF.weight)
            self.user_embed_MLP.weight.data.copy_(self.MLP_model.user_embed_MLP.weight)
            self.item_embed_MLP.weight.data.copy_(self.MLP_model.item_embed_MLP.weight)

            for (m1, m2) in zip(self.MLP, self.MLP_model.MLP):
                if isinstance(m1, nn.Linear) and isinstance(m2, nn.Linear):
                    m1.weight.data.copy_(m2.weight)
                    m1.bias.data.copy_(m2.bias)

            NeuMF_weight = torch.cat((self.alpha * self.GMF_model.NeuMF.weight, (1 - self.alpha) * self.MLP_model.NeuMF.weight), 1)
            NeuMF_bias = self.GMF_model.NeuMF.bias + self.MLP_model.NeuMF.bias

            self.NeuMF.weight.data.copy_(NeuMF_weight)
            self.NeuMF.bias.data.copy_(NeuMF_bias)
            
    def forward(self, user, item):
        if self.model is 'GMF' or 'NCF':
            user_embed_GMF = self.user_embed_GMF(user)
            item_embed_GMF = self.item_embed_GMF(item)
            
#             print(user_embed_GMF.device, item_embed_GMF.decive)
            GMF_output = user_embed_GMF * item_embed_GMF

        if self.model is 'MLP' or 'NCF':
            user_embed_MLP = self.user_embed_MLP(user)
            item_embed_MLP = self.item_embed_MLP(item)

            MLP_input = torch.cat((user_embed_MLP, item_embed_MLP), 1)
            MLP_output = self.MLP(MLP_input)

        if self.model is 'NCF':
            return self.NeuMF(torch.cat((MLP_output, GMF_output), 1))
        elif self.model is 'MLP':
            return self.NeuMF(MLP_output)
        elif self.model is 'GMF':
            return self.NeuMF(GMF_output)

In [7]:
def train(net, num_epochs, lr, train_type = 'NCF'):
    print(train_type)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     device = torch.device('cpu')
    print('train on', device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr = lr)
    loss = nn.BCEWithLogitsLoss()
    for epoch in range(num_epochs):
        l_sum,n = 0, 0
        for user, item, label in data_iter:
            user = user.to(device)
            item = item.to(device)
            label = label.to(device)
            pred = net(user, item)
            l = loss(pred.view(label.shape), label.float())

            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1

        print(epoch + 1, l_sum / n)

In [8]:
MLP_net = NCF(num_users, num_animes, model = 'MLP')
train(MLP_net, 30, lr = 0.0001, train_type = 'MLP')

GMF_net = NCF(num_users, num_animes, model = 'GMF')
train(GMF_net, 30, lr = 0.0001, train_type = 'GMF')

NCF_net = NCF(num_users, num_animes, model = 'NCF', GMF_model = GMF_net, MLP_model = MLP_net)
train(NCF_net, 30, lr = 0.0001)

MLP
train on cuda
1 0.37157237279274086
2 0.2477973664976635
3 0.2396407093030489
4 0.2336932306129657
5 0.22844408198990465
6 0.22506443206532445
7 0.22338746511196905
8 0.22189184373395873
9 0.2209419065935181
10 0.22026824099416395
11 0.21958490476614112
12 0.21896676738035528
13 0.21835950474771176
14 0.21774004235018618
15 0.21702001713480767
16 0.21648280844791526
17 0.21591884981778534
18 0.21541594328782063
19 0.21482789030686927
20 0.2144672526529204
21 0.21395322281041956
22 0.21353701458615412
23 0.21312855110514753
24 0.21258850201131468
25 0.2123978354830199
26 0.21200651642936727
27 0.21180320479757006
28 0.2116168748476417
29 0.21132563668603754
30 0.2107019572112571
GMF
train on cuda
1 0.6627772962837889
2 0.5444680874394379
3 0.4088423880617359
4 0.31827214697048356
5 0.271397169003045
6 0.24677128124387834
7 0.2321793955900094
8 0.22263663271552248
9 0.21607857659270646
10 0.2114692259593576
11 0.2082077394281782
12 0.20589140191315536
13 0.20423908490324352
14 0.2030

In [28]:
def metrics(net, test_data, bound):
    users = list(set(test_data[:,0]))
    recall, precise = [], []
    for user in users:
        all_items = torch.LongTensor([i for i in range(num_animes)]).cuda()
        test_user = torch.LongTensor([user for i in range(num_animes)]).cuda()
        pred = net(test_user, all_items)
        _, idx = torch.topk(pred, k = 5, dim = 0)
        
        target = [anime_to_idx[i[1]] for i in ratings[ratings.user_id == user].values if i[2] > bound]
        if not target: continue
        idx = idx.cpu().numpy().flatten()
        overlap = list(set(target) & set(idx))
#         print(overlap)
        recall.append(len(overlap) / len(target))
        precise.append(len(overlap) / len(idx))
        
    return np.mean(precise)

In [97]:
torch.save(NCF_net.state_dict(), "NCF.pt")
torch.save(GMF_net.state_dict(), "GMF.pt")
torch.save(MLP_net.state_dict(), "MLP.pt")

In [29]:
# model = NCF(num_users, num_animes, model = 'NCF', GMF_model = GMF_net, MLP_model = MLP_net).cuda()
# model.load_state_dict(torch.load("NCF.pt"))
precise = []
for bound in range(10):
    precise.append(metrics(NCF_net, test_data, bound))

print(precise)

[0.18340611353711792, 0.17292576419213973, 0.16331877729257643, 0.1646288209606987, 0.15938864628820962, 0.14890829694323146, 0.1427947598253275, 0.07554585152838426, 0.03449781659388646, 0.011920529801324504]
