In [1]:
import numpy as np
import pandas as pd
import math
import random

In [2]:
data = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)
print(len(set(data.anime_id.tolist())))

9775


In [3]:
data = pd.read_csv('~/Data/clean_rating4.csv').drop(["id"], axis = 1)

idx_to_user = list(set(data.user_id))
idx_to_item = list(set(data.anime_id))

user_to_idx = {user: idx for idx, user in enumerate(idx_to_user)}
item_to_idx = {item: idx for idx, item in enumerate(idx_to_item)}

In [5]:
# ratings = data[data.rating > 0].rating.values.tolist()
# data = data[data.rating > 0].values

F, alpha, lam_bda, batch_size = 100, 0.02, 0.01, 512000
num_epochs, k = 30, 1 / math.sqrt(F)

P = np.array([[random.random() * k for _ in range(F)] for _ in range(len(idx_to_user))])
Q = np.array([[random.random() * k for _ in range(len(idx_to_item))] for _ in range(F)])
print(P)

[[0.04186862 0.03501887 0.05150125 ... 0.06878365 0.06032171 0.0523683 ]
 [0.07666143 0.06644667 0.04408552 ... 0.09449302 0.07899567 0.03497836]
 [0.06091234 0.00315532 0.02958437 ... 0.0836783  0.03427751 0.00404149]
 ...
 [0.02151923 0.00907406 0.02159938 ... 0.01591178 0.08004413 0.08474887]
 [0.01553068 0.07358949 0.00415209 ... 0.05967594 0.04992596 0.03977697]
 [0.02950404 0.09832602 0.09536331 ... 0.01518481 0.02261967 0.09897285]]


In [6]:
def sigmoid(x):
#     print(x)
    return 1.0 / (1 + math.exp(-x))

In [7]:
def train(train_data, alpha, lam_bda):
    index = [i for i in range(train_data.shape[0])]
    for epoch in range(num_epochs):
        sample = random.sample(index, batch_size)
        loss = 0
        for i in range(batch_size):
            user = user_to_idx[train_data[sample[i], 0]]
            item = item_to_idx[train_data[sample[i], 1]]
            rui = train_data[sample[i], 2]

            eui = rui - sigmoid(P[user, :].dot(Q[:, item]))
            loss += eui
            for f in range(F):
                P[user, f] += alpha * (eui * Q[f, item] - lam_bda * P[user, f])
                Q[f, item] += alpha * (eui * P[user, f] - lam_bda * Q[f, item])

        alpha *= 0.9
        print(epoch + 1, loss / batch_size)

In [8]:
def get_hot():
    animes = pd.read_csv("~/Data/anime.csv")
    animes = animes[animes["anime_id"].isin(idx_to_item)].loc[:, ["anime_id", "rating", "members"]].fillna(0)

    scalar = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    animes["rating_norm"] = animes[["rating"]].apply(scalar)
    animes["members_norm"] = animes[["members"]].apply(scalar)
    animes["weight"] = 0.6 * animes["rating_norm"] + 0.4 * animes["members_norm"]
    animes = animes.sort_values(by = "weight", ascending = False)
    
    return animes.anime_id.tolist()

In [9]:
def negative_sample():
    sample_num = 100
    hot_items, neg_sample = get_hot(), []
    data = pd.read_csv("~/Data/clean_rating4.csv")
    grouped = data.groupby(["user_id"])
    for gp in grouped.groups:
        p, num = grouped.get_group(gp).anime_id.tolist(), 0
        for item in hot_items:
            if item not in p: 
                neg_sample.append([gp, item, 0])
                num += 1
            if num == sample_num:
                break

    return np.array(neg_sample)

In [10]:
train_data = np.array(pd.read_csv("~/Thesis/Data/train.csv"))
train_data[:, -1] = np.array([1 for _ in range(train_data.shape[0])])
train_data = np.concatenate((train_data, negative_sample()))
print(train_data)
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(data, test_size = 0.3)
# train_ratio = 0.9
# train_data = data[:int(len(data) * train_ratio)]
# test_data = data[int(len(data) * train_ratio):]
# print(train_data.shape, test_data.shape)

[[    1    24     1]
 [    1    30     1]
 [    1    63     1]
 ...
 [ 4701 17895     0]
 [ 4701 26243     0]
 [ 4701 25013     0]]


In [55]:
train(train_data, alpha, lam_bda)

1 0.18030898087022273
2 0.10848601644591498
3 0.0709805096193708
4 0.0527580640736663
5 0.043247173574194504
6 0.03734086560867708
7 0.0335774526962298
8 0.030060293921356455
9 0.028271445582630114
10 0.026635097122290084
11 0.025345283139165883
12 0.024212969759301893
13 0.023434833138787258
14 0.02255711258725834
15 0.0223114990172638
16 0.02200024780382737
17 0.020914185943746003
18 0.020875434378049923
19 0.02028482928001763
20 0.020124117336000058
21 0.01994991355278006
22 0.020115463519858216
23 0.019422970193320926
24 0.019307259660603076
25 0.01928419077902894
26 0.018771222958243548
27 0.018768842033407184
28 0.01869189932446103
29 0.019206361817647653
30 0.019255725324471465


In [10]:
def ndcg(k, ranklist, testlist):
    if not testlist: return 0
    idcg_k, dcg_k = 0, 0
    if len(testlist) < k:
        k = len(testlist)
    for i in range(k):
        idcg_k += 1 / math.log(i + 2, 2)
        
    s = set(testlist)
    hits = [idx for idx, val in enumerate(ranklist) if val in s]
    count = len(hits)
    
    for i in range(count):
        dcg_k += 1 / math.log(hits[i] + 2, 2)
        
    return float(dcg_k / idcg_k)

In [65]:
df = pd.read_csv("~/Thesis/Data/test.csv")
test_users = list(set(df.user))
# df = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)
user_dict, final, precision = {}, [], []
recall, f1, ndcg_k = [], [], []
predict = P.dot(Q)
for k in range(10):
    ndcg_k, precision = [], []
    for user in test_users:
        dic = {}
        test_items = [item_to_idx[i] for i in df[df.user == user].anime.tolist()]
        target = [item_to_idx[data] for data in df[(df.user == user) & (df.rating > k)].anime.tolist()]
        
        user_idx = user_to_idx[user]
        pred = predict[user_idx, :]
        
        for item in test_items:
            dic[item] = pred[item]
        
#         print(set(dic.keys()) == set(target))
        topk = [i[0] for i in sorted(dic.items(), key = lambda x: x[1], reverse = True)[:5]]
        both = list(set(topk) & set(target))
#         print(len(both))

        precision.append(len(both) / len(topk))


    print(np.mean(precision))
    final.append(np.mean(precision))

1.0
0.9927674962773878
0.9833652414379922
0.9658796000850883
0.9320144650074453
0.865049989363965
0.7365241437991916
0.48810891299723463
0.23731121038077002
0.08959795788130184


In [66]:
print(final)

[1.0, 0.9927674962773878, 0.9833652414379922, 0.9658796000850883, 0.9320144650074453, 0.865049989363965, 0.7365241437991916, 0.48810891299723463, 0.23731121038077002, 0.08959795788130184]


In [2]:
a, b, c = 0, 0, 0

print(a, b, c)

SyntaxError: illegal expression for augmented assignment (<ipython-input-2-b5e5c79f0a1a>, line 2)