In [2]:
import numpy as np
import pandas as pd
import math
import random

In [2]:
data = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)
print(len(set(data.anime_id.tolist())))

9775


In [4]:
data = pd.read_csv('~/Data/clean_rating4.csv').drop(["id"], axis = 1)

idx_to_user = list(set(data.user_id))
idx_to_item = list(set(data.anime_id))

user_to_idx = {user: idx for idx, user in enumerate(idx_to_user)}
item_to_idx = {item: idx for idx, item in enumerate(idx_to_item)}

In [12]:
# ratings = data[data.rating > 0].rating.values.tolist()
# data = data[data.rating > 0].values

F, alpha, lam_bda, batch_size = 100, 0.02, 0.01, 512000
num_epochs, k = 30, 1 / math.sqrt(F)

P = np.array([[random.random() * k for _ in range(F)] for _ in range(len(idx_to_user))])
Q = np.array([[random.random() * k for _ in range(len(idx_to_item))] for _ in range(F)])
print(P)

[[0.00837757 0.07145481 0.05360349 ... 0.06733769 0.06129742 0.07747173]
 [0.00456755 0.04984179 0.05882774 ... 0.09882802 0.08714208 0.06718382]
 [0.04382356 0.08646178 0.09176005 ... 0.07711914 0.03064835 0.08595886]
 ...
 [0.08032087 0.08022453 0.04225595 ... 0.01407782 0.00485343 0.01694809]
 [0.07562786 0.08026737 0.06079744 ... 0.0477532  0.05579164 0.08609135]
 [0.00103994 0.05371762 0.09992921 ... 0.06650408 0.00012493 0.04377633]]


In [6]:
def sigmoid(x):
#     print(x)
    return 1.0 / (1 + math.exp(-x))

In [13]:
def train(train_data, alpha, lam_bda):
    index = [i for i in range(train_data.shape[0])]
    for epoch in range(num_epochs):
        sample = random.sample(index, batch_size)
        loss = 0
        for i in range(batch_size):
            user = user_to_idx[train_data[sample[i], 0]]
            item = item_to_idx[train_data[sample[i], 1]]
            rui = train_data[sample[i], 2]

            eui = rui - sigmoid(P[user, :].dot(Q[:, item]))
            loss += eui
            for f in range(F):
                P[user, f] += alpha * (eui * Q[f, item] - lam_bda * P[user, f])
                Q[f, item] += alpha * (eui * P[user, f] - lam_bda * Q[f, item])

        alpha *= 0.9
        print(epoch + 1, loss / batch_size)

In [8]:
def get_hot():
    animes = pd.read_csv("~/Data/anime.csv")
    animes = animes[animes["anime_id"].isin(idx_to_item)].loc[:, ["anime_id", "rating", "members"]].fillna(0)

    scalar = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    animes["rating_norm"] = animes[["rating"]].apply(scalar)
    animes["members_norm"] = animes[["members"]].apply(scalar)
    animes["weight"] = 0.6 * animes["rating_norm"] + 0.4 * animes["members_norm"]
    animes = animes.sort_values(by = "weight", ascending = False)
    
    return animes.anime_id.tolist()

In [9]:
def negative_sample():
    sample_num = 100
    hot_items, neg_sample = get_hot(), []
    data = pd.read_csv("~/Data/clean_rating4.csv")
    grouped = data.groupby(["user_id"])
    for gp in grouped.groups:
        p, num = grouped.get_group(gp).anime_id.tolist(), 0
        for item in hot_items:
            if item not in p: 
                neg_sample.append([gp, item, 0])
                num += 1
            if num == sample_num:
                break

    return np.array(neg_sample)

In [10]:
train_data = np.array(pd.read_csv("~/Thesis/Data/train.csv"))
train_data[:, -1] = np.array([1 for _ in range(train_data.shape[0])])
train_data = np.concatenate((train_data, negative_sample()))
print(train_data)
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(data, test_size = 0.3)
# train_ratio = 0.9
# train_data = data[:int(len(data) * train_ratio)]
# test_data = data[int(len(data) * train_ratio):]
# print(train_data.shape, test_data.shape)

[[    1    24     1]
 [    1    30     1]
 [    1    63     1]
 ...
 [ 4701 17895     0]
 [ 4701 26243     0]
 [ 4701 25013     0]]


In [14]:
train(train_data, alpha, lam_bda)

1 0.1805125371192668
2 0.10802295006503719
3 0.07061943493590289
4 0.0527743867225866
5 0.043354288663434284
6 0.037106950795542284
7 0.0331560186651994
8 0.030424067390972188
9 0.028197914545233203
10 0.02707026362977587
11 0.025151407649559383
12 0.024520688793599286
13 0.02320414520207114
14 0.022549380207388593
15 0.022576024754730556
16 0.022046508344297885
17 0.02100827288890135
18 0.020683177142277732
19 0.0207888087263882
20 0.020233985930775616
21 0.019852637419754543
22 0.019735136642961667
23 0.01959093455686906
24 0.01925241058823292
25 0.01939202481947437
26 0.01925370317180438
27 0.018931980016117202
28 0.019010014452250878
29 0.018774330907021106
30 0.018809070428013556


In [11]:
def ndcg(k, ranklist, testlist):
    if not testlist: return 0
    idcg_k, dcg_k = 0, 0
    if len(testlist) < k:
        k = len(testlist)
    for i in range(k):
        idcg_k += 1 / math.log(i + 2, 2)
        
    s = set(testlist)
    hits = [idx for idx, val in enumerate(ranklist) if val in s]
    count = len(hits)
    
    for i in range(count):
        dcg_k += 1 / math.log(hits[i] + 2, 2)
        
    return float(dcg_k / idcg_k)

In [15]:
def NDCG(ratings):
    idcg, dcg = 0, 0
    for idx, r in enumerate(ratings):
        dcg += (2 ** r - 1) / math.log(idx + 2, 2)
        
#     print(sorted(ratings, reverse = True))
    for idx, r in enumerate(sorted(ratings, reverse = True)):
        idcg += (2 ** r - 1) / math.log(idx + 2, 2)
        
    return float(dcg / idcg)

In [17]:
df = pd.read_csv("~/Thesis/Data/test.csv")
test_users = list(set(df.user))
# df = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)
user_dict, final, precision = {}, [], []
recall, f1, ndcg_k = [], [], []
predict = P.dot(Q)
for k in range(9, 10):
    ndcg_k, precision = [], []
    for user in test_users:
        dic = {}
        test_items = [item_to_idx[i] for i in df[df.user == user].anime.tolist()]
        target = [item_to_idx[data] for data in df[(df.user == user) & (df.rating > k)].anime.tolist()]
        
        user_idx = user_to_idx[user]
        pred = predict[user_idx, :]
        
        for item in test_items:
            dic[item] = pred[item]
        
#         print(set(dic.keys()) == set(target))
        topk = [i[0] for i in sorted(dic.items(), key = lambda x: x[1], reverse = True)[:5]]
        idx = [idx_to_item[d] for d in topk]
        rates = [df[(df.user == user) & (df.anime == st)].rating.tolist()[0] for st in idx]
#         both = list(set(topk) & set(target))
#         print(len(both))

        precision.append(NDCG(rates))


    print(np.mean(precision))
    final.append(np.mean(precision))

0.8401839398098622


In [66]:
print(final)

[1.0, 0.9927674962773878, 0.9833652414379922, 0.9658796000850883, 0.9320144650074453, 0.865049989363965, 0.7365241437991916, 0.48810891299723463, 0.23731121038077002, 0.08959795788130184]


In [2]:
a, b, c = 0, 0, 0

print(a, b, c)

SyntaxError: illegal expression for augmented assignment (<ipython-input-2-b5e5c79f0a1a>, line 2)