In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.utils.data as Data
import random
import math

In [14]:
def get_data(ratings, ratio):
    uids, aids = list(set(ratings.user_id.tolist())), list(set(ratings.anime_id.tolist()))
    train_data, test_data = [], []
    for u in uids:
        if not uids.index(u) % 100: print(uids.index(u))
        for r in range(1, 11):
            temp = ratings[(ratings.user_id == u) & (ratings.rating == r)].values.tolist()
            if not temp: 
                continue
            
            test_data += temp[int(len(temp) * ratio):]
            train_data += temp[:int(len(temp) * ratio)]
    
    train_data += ratings[ratings.rating == -1].values.tolist()
    return train_data, test_data

In [2]:
ratings = pd.read_csv('~/Data/clean_rating4.csv').drop(["id"], axis = 1)

idx_to_animes = list(set(ratings['anime_id'].tolist()))
idx_to_users = list(set(ratings['user_id'].tolist()))
anime_to_idx = {anime: idx for idx, anime in enumerate(idx_to_animes)}
user_to_idx = {user: idx for idx, user in enumerate(idx_to_users)}
num_users, num_animes = len(idx_to_users), len(idx_to_animes)

train_ratio = 0.9

In [3]:
train_data, test_data = get_data(ratings, 0.7)

In [4]:
train_df = pd.DataFrame(train_data, columns = ["user", "anime", "rating"])
test_df = pd.DataFrame(test_data, columns = ["user", "anime", "rating"])

train_df.to_csv("train2.csv", index = False)
test_df.to_csv("test2.csv", index = False)

In [7]:
print(len(train_data), len(test_data))

1636407 680100


In [11]:
from collections import defaultdict
train_data = pd.read_csv("train.csv").values.tolist()
user_item_dic, data_ps_list = defaultdict(list), []
for d in train_data:
    user_item_dic[user_to_idx[d[0]]].append(anime_to_idx[d[1]])
    data_ps_list.append([user_to_idx[d[0]], anime_to_idx[d[1]]])
    
user_item_dic, data_ps = dict(user_item_dic), data_ps_list

In [12]:
def get_hot_items():
    animes = pd.read_csv("~/Data/anime.csv")
    animes = animes[animes["anime_id"].isin(idx_to_animes)].loc[:, ["anime_id", "rating", "members"]].fillna(0)

    scalar = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    animes["rating_norm"] = animes[["rating"]].apply(scalar)
    animes["members_norm"] = animes[["members"]].apply(scalar)
    animes["weight"] = 0.6 * animes["rating_norm"] + 0.4 * animes["members_norm"]
    animes = animes.sort_values(by = "weight", ascending = False)
    
    return [anime_to_idx[i] for i in animes.anime_id.tolist()]

In [13]:
class NCFDataset(Data.Dataset):
    def __init__(self, data_ps, users, num_items, hot_items, dic, num_ng = 100):
        super(NCFDataset, self).__init__()
        self.data_ps = data_ps
#         self.train_sample = train_sample
        self.users = users
        self.num_items = num_items
        self.num_ng = num_ng
        self.hot_items = hot_items
        self.dic = dic

    def select_ng(self):
        self.data_ng, num = [], 0
        for u in self.users:
            for item in self.hot_items:
                if item not in self.dic[u]:
                    self.data_ng.append([u, item])
                    num += 1
                if num == self.num_ng: break
            
            num = 0
                
        print(len(self.data_ng))
        self.label_ps = [1 for i in range(len(self.data_ps))]
        self.label_ng = [0 for i in range(len(self.data_ng))]
        self.data = self.data_ps + self.data_ng
        self.label = self.label_ps + self.label_ng

    def __getitem__(self, idx):
        user = self.data[idx][0]
        item = self.data[idx][1]
        label = self.label[idx]

        return user, item, label

    def __len__(self):
        return self.num_ng * len(self.users) + len(self.data_ps)

In [7]:
def train_data_iter(train_data, batch_size = 5000):
    user_inputs, item_inputs, labels = [], [], []
    for u in range(len(train_data)):
        for i in range(len(train_data[0])):
            user_inputs.append(u)
            item_inputs.append(i)
            labels.append(train_data[u, i])
            
            if len(user_inputs) == batch_size:
                yield torch.LongTensor(user_inputs), torch.LongTensor(item_inputs), torch.LongTensor(labels)
            user_inputs, item_inputs, labels = [], [], []

In [8]:
def test_data_iter(test_data):
    ng_items, ps_items = [], []
    for u in range(len(test_data)):
        for i in range(len(test_data[0])):
            if test_data[u, i] == 0:
                ng_items.append(i)
            else:
                ps_items.append(i)

        yield torch.LongTensor([u for _ in range(len(ng_items))]), torch.LongTensor(ng_items), torch.LongTensor(ps_items)
        ng_items, ps_items = [], []

In [14]:
batch_size = 4096
users = [user_to_idx[i] for i in list(set(ratings.user_id.tolist()))]
dataset = NCFDataset(data_ps, users, num_animes, get_hot_items(), user_item_dic, num_ng = 100)
dataset.select_ng()
data_iter = Data.DataLoader(dataset, batch_size = batch_size, shuffle = True)

470100


In [15]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, model, factor_num = 8, num_layers = 3,
               MLP_model = None, GMF_model = None, alpha = 0.5, dropout = 0.5):
        super(NCF, self).__init__()
        self.MLP_model = MLP_model
        self.GMF_model = GMF_model
        self.alpha = alpha
        self.dropout = dropout
        self.user_embed_GMF = nn.Embedding(num_users, factor_num)
        self.item_embed_GMF = nn.Embedding(num_items, factor_num)
        self.user_embed_MLP = nn.Embedding(num_users, factor_num * (2 ** (num_layers - 1)))
        self.item_embed_MLP = nn.Embedding(num_items, factor_num * (2 ** (num_layers - 1)))

        self.MLP = nn.Sequential(
            nn.Dropout(p = self.dropout),
            nn.Linear(factor_num * (2 ** num_layers), factor_num * (2 ** (num_layers - 1))),
            nn.ReLU()
        )
        for layer in range(num_layers - 1, 0, -1):
            self.MLP.add_module('dropout' + str(num_layers - layer), nn.Dropout(p = self.dropout))
            self.MLP.add_module('linear' + str(num_layers - layer), nn.Linear(factor_num * (2 ** layer), factor_num * (2 ** (layer - 1))))
            self.MLP.add_module('relu' + str(num_layers - layer), nn.ReLU())

        self.model = model
        if self.model in ['GMF', 'MLP']:
            self.NeuMF = nn.Linear(factor_num, 1)
        else:
            self.NeuMF = nn.Linear(2 * factor_num, 1)

        self.__init_weights__()

    def __init_weights__(self):
        if self.model in ['GMF', 'MLP']:
            nn.init.normal_(self.user_embed_GMF.weight, std = 0.01)
            nn.init.normal_(self.item_embed_GMF.weight, std = 0.01)
            nn.init.normal_(self.user_embed_MLP.weight, std = 0.01)
            nn.init.normal_(self.item_embed_MLP.weight, std = 0.01)

            for layer in self.MLP:
                if isinstance(layer, nn.Linear):
                    nn.init.xavier_uniform_(layer.weight)
#                   nn.init.normal_(layer.weight, std = 0.01)

            nn.init.kaiming_uniform_(self.NeuMF.weight, a = 1, nonlinearity = 'sigmoid')
#             nn.init.normal_(self.NeuMF.weight, std = 0.01)

        elif self.GMF_model and self.MLP_model:
            self.user_embed_GMF.weight.data.copy_(self.GMF_model.user_embed_GMF.weight)
            self.item_embed_GMF.weight.data.copy_(self.GMF_model.item_embed_GMF.weight)
            self.user_embed_MLP.weight.data.copy_(self.MLP_model.user_embed_MLP.weight)
            self.item_embed_MLP.weight.data.copy_(self.MLP_model.item_embed_MLP.weight)

            for (m1, m2) in zip(self.MLP, self.MLP_model.MLP):
                if isinstance(m1, nn.Linear) and isinstance(m2, nn.Linear):
                    m1.weight.data.copy_(m2.weight)
                    m1.bias.data.copy_(m2.bias)

            NeuMF_weight = torch.cat((self.alpha * self.GMF_model.NeuMF.weight, (1 - self.alpha) * self.MLP_model.NeuMF.weight), 1)
            NeuMF_bias = self.GMF_model.NeuMF.bias + self.MLP_model.NeuMF.bias

            self.NeuMF.weight.data.copy_(NeuMF_weight)
            self.NeuMF.bias.data.copy_(NeuMF_bias)
            
    def forward(self, user, item):
        if self.model is 'GMF' or 'NCF':
            user_embed_GMF = self.user_embed_GMF(user)
            item_embed_GMF = self.item_embed_GMF(item)
            
#             print(user_embed_GMF.device, item_embed_GMF.decive)
            GMF_output = user_embed_GMF * item_embed_GMF

        if self.model is 'MLP' or 'NCF':
            user_embed_MLP = self.user_embed_MLP(user)
            item_embed_MLP = self.item_embed_MLP(item)

            MLP_input = torch.cat((user_embed_MLP, item_embed_MLP), 1)
            MLP_output = self.MLP(MLP_input)

        if self.model is 'NCF':
            return self.NeuMF(torch.cat((MLP_output, GMF_output), 1))
        elif self.model is 'MLP':
            return self.NeuMF(MLP_output)
        elif self.model is 'GMF':
            return self.NeuMF(GMF_output)

In [16]:
def train(net, num_epochs, lr, train_type = 'NCF'):
    print(train_type)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     device = torch.device('cpu')
    print('train on', device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr = lr)
    loss = nn.BCEWithLogitsLoss()
    for epoch in range(num_epochs):
        l_sum,n = 0, 0
        for user, item, label in data_iter:
            user = user.to(device)
            item = item.to(device)
            label = label.to(device)
            pred = net(user, item)
            l = loss(pred.view(label.shape), label.float())

            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1

        print(epoch + 1, l_sum / n)

In [17]:
MLP_net = NCF(num_users, num_animes, model = 'MLP')
train(MLP_net, 30, lr = 0.0001, train_type = 'MLP')

GMF_net = NCF(num_users, num_animes, model = 'GMF')
train(GMF_net, 30, lr = 0.0001, train_type = 'GMF')

NCF_net = NCF(num_users, num_animes, model = 'NCF', GMF_model = GMF_net, MLP_model = MLP_net)
train(NCF_net, 30, lr = 0.0001)


MLP
train on cuda
1 0.6772717644288702
2 0.3193732900526917
3 0.24635250559709604
4 0.2297241909411347
5 0.21995399863395876
6 0.2138198908960935
7 0.21006114584149665
8 0.20722362905451394
9 0.20497039515995286
10 0.20336203013808982
11 0.20182187062444037
12 0.20056911189000584
13 0.19908592521565632
14 0.19753121701837745
15 0.19595400225190282
16 0.1943134763576452
17 0.1930201905155645
18 0.19164339275036044
19 0.19067543284985625
20 0.18951160930892796
21 0.18850933050067678
22 0.1876950127407185
23 0.1864992302887648
24 0.1856056925741214
25 0.18477319924576768
26 0.18394603989656688
27 0.18291025257226332
28 0.1820479704917056
29 0.18118463861710818
30 0.18042890614676244
GMF
train on cuda
1 0.6850779712778851
2 0.6692479678728048
3 0.6405720547564979
4 0.5980559568960689
5 0.5463525998939588
6 0.4908029453847015
7 0.4366613899041148
8 0.38788245488139034
9 0.34657379371448627
10 0.3131496467636627
11 0.2868168946608756
12 0.26631812840989494
13 0.2504369759154551
14 0.23805833

In [15]:
torch.save(NCF_net.state_dict(), "NCF2.pt")

In [4]:
MLP_net = NCF(num_users, num_animes, model = 'MLP')
# train(MLP_net, epochs, lr = 0.0001, train_type = 'MLP')

GMF_net = NCF(num_users, num_animes, model = 'GMF')
# train(GMF_net, epochs, lr = 0.0001, train_type = 'GMF')


model = NCF(num_users, num_animes, model = 'NCF', GMF_model = GMF_net, MLP_model = MLP_net).cuda()
model.load_state_dict(torch.load("NCF2.pt"))

<All keys matched successfully>

In [5]:
df = pd.read_csv("test.csv")
users = list(set(df.user))

In [29]:
df = pd.read_csv("test.csv")
users = list(set(df.user))
precise = []
for u in users:
#     print(u)
    all_item = [anime_to_idx[i] for i in df[df.user == u].anime.tolist()]
    all_items = torch.LongTensor(all_item).cuda()
    test_user = torch.LongTensor([user_to_idx[u] for i in range(len(all_item))]).cuda()
    
#     print(all_items.shape, test_user.shape)
    
    pred = NCF_net(test_user, all_items)
    pred = pred.view(1, -1).detach().cpu().numpy()[0]
    index = [i for i in range(len(pred))]
    idx = sorted(dict(zip(pred, index)).items(), key = lambda x: x[0], reverse = True)[:5]
    idx = [all_item[d[1]] for d in idx]
    
        
    target = [anime_to_idx[i] for i in df[(df.user == u) & (df.rating > 9)].anime.tolist()]
    
    
    overlap = list(set(target) & set(idx))
        
    precise.append(len(overlap) / len(idx))
#     print(precise[-1])
    
print(np.mean(precise))

0.08389704318230165


In [9]:
def ndcg(k, ranklist, testlist):
    if not testlist: return 0
    idcg_k, dcg_k = 0, 0
    if len(testlist) < k:
        k = len(testlist)
    for i in range(k):
        idcg_k += 1 / math.log(i + 2, 2)
        
    s = set(testlist)
    hits = [idx for idx, val in enumerate(ranklist) if val in s]
    count = len(hits)
    
    for i in range(count):
        dcg_k += 1 / math.log(hits[i] + 2, 2)
        
    return float(dcg_k / idcg_k)

In [7]:
def metrics(net, df, bound):
    precise, ndcg_k = [], []
    users = list(set(df.user))
    for u in users:
        all_item = [anime_to_idx[i] for i in df[df.user == u].anime.tolist()]
    #     print(all_item)
        all_items = torch.LongTensor(all_item).cuda()
        test_user = torch.LongTensor([user_to_idx[u] for i in range(len(all_item))]).cuda()

        pred = net(test_user, all_items)

        pred = pred.view(1, -1).detach().cpu().numpy()[0]
        index = [i for i in range(len(pred))]
        idx = sorted(dict(zip(pred, index)).items(), key = lambda x: x[0], reverse = True)[:5]
        idx = [all_item[d[1]] for d in idx]

        target = [anime_to_idx[i] for i in df[(df.user == u) & (df.rating == bound)].anime.tolist()]

    #     idx = [all_item[i] for i in idx.cpu().numpy().flatten()]
        overlap = list(set(target) & set(idx))

        precise.append(ndcg(10, idx, target))
    #     print(precise[-1])

    return np.mean(precise)

In [18]:
# model = NCF(num_users, num_animes, model = 'NCF', GMF_model = GMF_net, MLP_model = MLP_net).cuda()
# model.load_state_dict(torch.load("NCF.pt"))
precise = []
for bound in range(9, 10):
    precise.append(metrics(NCF_net, df, bound))
    print(precise[-1])
    
print(precise)

0.09185195697295856
[0.09185195697295856]
