In [1]:
import numpy as np
import pandas as pd
import math
import random

In [2]:
data = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)
print(len(set(data.anime_id.tolist())))

9775


In [3]:
data = pd.read_csv('~/Data/clean_rating4.csv').drop(["id"], axis = 1)

idx_to_user = list(set(data.user_id))
idx_to_item = list(set(data.anime_id))

user_to_idx = {user: idx for idx, user in enumerate(idx_to_user)}
item_to_idx = {item: idx for idx, item in enumerate(idx_to_item)}

In [4]:
# ratings = data[data.rating > 0].rating.values.tolist()
# data = data[data.rating > 0].values

F, alpha, lam_bda, batch_size = 100, 0.02, 0.01, 512000
num_epochs, k = 30, 1 / math.sqrt(F)

P = np.array([[random.random() * k for _ in range(F)] for _ in range(len(idx_to_user))])
Q = np.array([[random.random() * k for _ in range(len(idx_to_item))] for _ in range(F)])
print(P)

[[0.05125325 0.08288313 0.07094636 ... 0.03449181 0.00810814 0.08796441]
 [0.09622844 0.09788483 0.00331065 ... 0.07016896 0.0413372  0.04071853]
 [0.09847842 0.03703292 0.04571082 ... 0.06038724 0.03800482 0.05239737]
 ...
 [0.07059714 0.07077892 0.08051576 ... 0.04653505 0.01941656 0.08299024]
 [0.05212735 0.05231705 0.03301792 ... 0.02747236 0.03613743 0.05168683]
 [0.04654265 0.00717057 0.04587552 ... 0.01531769 0.05228433 0.07105499]]


In [5]:
def sigmoid(x):
#     print(x)
    return 1.0 / (1 + math.exp(-x))

In [6]:
def train(train_data, alpha, lam_bda):
    index = [i for i in range(train_data.shape[0])]
    for epoch in range(num_epochs):
        sample = random.sample(index, batch_size)
        loss = 0
        for i in range(batch_size):
            user = user_to_idx[train_data[sample[i], 0]]
            item = item_to_idx[train_data[sample[i], 1]]
            rui = train_data[sample[i], 2]

            eui = rui - sigmoid(P[user, :].dot(Q[:, item]))
            loss += eui
            for f in range(F):
                P[user, f] += alpha * (eui * Q[f, item] - lam_bda * P[user, f])
                Q[f, item] += alpha * (eui * P[user, f] - lam_bda * Q[f, item])

        alpha *= 0.9
        print(epoch + 1, loss / batch_size)

In [7]:
def get_hot():
    animes = pd.read_csv("~/Data/anime.csv")
    animes = animes[animes["anime_id"].isin(idx_to_item)].loc[:, ["anime_id", "rating", "members"]].fillna(0)

    scalar = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    animes["rating_norm"] = animes[["rating"]].apply(scalar)
    animes["members_norm"] = animes[["members"]].apply(scalar)
    animes["weight"] = 0.6 * animes["rating_norm"] + 0.4 * animes["members_norm"]
    animes = animes.sort_values(by = "weight", ascending = False)
    
    return animes.anime_id.tolist()

In [8]:
def negative_sample():
    sample_num = 100
    hot_items, neg_sample = get_hot(), []
    data = pd.read_csv("~/Data/clean_rating4.csv")
    grouped = data.groupby(["user_id"])
    for gp in grouped.groups:
        p, num = grouped.get_group(gp).anime_id.tolist(), 0
        for item in hot_items:
            if item not in p: 
                neg_sample.append([gp, item, 0])
                num += 1
            if num == sample_num:
                break

    return np.array(neg_sample)

In [9]:
train_data = np.array(pd.read_csv("~/Thesis/Data/train.csv"))
train_data[:, -1] = np.array([1 for _ in range(train_data.shape[0])])
train_data = np.concatenate((train_data, negative_sample()))
print(train_data)
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(data, test_size = 0.3)
# train_ratio = 0.9
# train_data = data[:int(len(data) * train_ratio)]
# test_data = data[int(len(data) * train_ratio):]
# print(train_data.shape, test_data.shape)

[[    1    24     1]
 [    1    30     1]
 [    1    63     1]
 ...
 [ 4701 17895     0]
 [ 4701 26243     0]
 [ 4701 25013     0]]


In [10]:
train(train_data, alpha, lam_bda)

1 0.1804374085628686
2 0.10870383309928956
3 0.0712548110561285
4 0.05269657543122271
5 0.04332135649849241
6 0.036903072671099026
7 0.032909514628665745
8 0.03061615095529045
9 0.02829323482949638
10 0.026591735669735477
11 0.02531232580010883
12 0.024125545700460733
13 0.0234375086243116
14 0.023275330404514162
15 0.021803502535388684
16 0.0218736852141911
17 0.020900048719539214
18 0.021065231858990043
19 0.02038518764173666
20 0.02055196623006795
21 0.019743105341430128
22 0.019808590611309625
23 0.020038395077434774
24 0.01892769689075691
25 0.01937427975256554
26 0.018946923494932585
27 0.018857164051523462
28 0.01870345454169497
29 0.019421548598391073
30 0.018868820656171462


In [11]:
def ndcg(k, ranklist, testlist):
    if not testlist: return 0
    idcg_k, dcg_k = 0, 0
    if len(testlist) < k:
        k = len(testlist)
    for i in range(k):
        idcg_k += 1 / math.log(i + 2, 2)
        
    s = set(testlist)
    hits = [idx for idx, val in enumerate(ranklist) if val in s]
    count = len(hits)
    
    for i in range(count):
        dcg_k += 1 / math.log(hits[i] + 2, 2)
        
    return float(dcg_k / idcg_k)

In [12]:
df = pd.read_csv("~/Thesis/Data/test.csv")
test_users = list(set(df.user))
# df = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)
user_dict, final, precision = {}, [], []
recall, f1, ndcg_k = [], [], []
predict = P.dot(Q)
for k in range(9, 10):
    ndcg_k, precision = [], []
    for user in test_users:
        dic = {}
        test_items = [item_to_idx[i] for i in df[df.user == user].anime.tolist()]
        target = [item_to_idx[data] for data in df[(df.user == user) & (df.rating > k)].anime.tolist()]
        
        user_idx = user_to_idx[user]
        pred = predict[user_idx, :]
        
        for item in test_items:
            dic[item] = pred[item]
        
#         print(set(dic.keys()) == set(target))
        topk = [i[0] for i in sorted(dic.items(), key = lambda x: x[1], reverse = True)[:5]]
        both = list(set(topk) & set(target))
#         print(len(both))

        precision.append(ndcg(10, topk, target))


    print(np.mean(precision))
    final.append(np.mean(precision))

0.06415911245016132


In [66]:
print(final)

[1.0, 0.9927674962773878, 0.9833652414379922, 0.9658796000850883, 0.9320144650074453, 0.865049989363965, 0.7365241437991916, 0.48810891299723463, 0.23731121038077002, 0.08959795788130184]


In [2]:
a, b, c = 0, 0, 0

print(a, b, c)

SyntaxError: illegal expression for augmented assignment (<ipython-input-2-b5e5c79f0a1a>, line 2)