In [1]:
import numpy as np
import pandas as pd
import math
import random

In [2]:
data = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)
print(len(set(data.anime_id.tolist())))

9775


In [3]:
data = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)

idx_to_user = list(set(data.user_id))
idx_to_item = list(set(data[data.rating > 0].anime_id))

user_to_idx = {user: idx for idx, user in enumerate(idx_to_user)}
item_to_idx = {item: idx for idx, item in enumerate(idx_to_item)}

ratings = data[data.rating > 0].rating.values.tolist()
data = data[data.rating > 0].values

F, alpha, lam_bda, batch_size = 100, 0.02, 0.01, 512000
num_epochs, n, k = 30, len(ratings), 1 / math.sqrt(F)

P = np.array([[random.random() * k for _ in range(F)] for _ in range(len(idx_to_user))])
Q = np.array([[random.random() * k for _ in range(len(idx_to_item))] for _ in range(F)])

In [4]:
def sigmoid(x):
    return 1.0 / (1 + math.exp(-x))

In [5]:
def train(train_data, alpha, lam_bda):
    index = [i for i in range(train_data.shape[0])]
    for epoch in range(num_epochs):
        sample = random.sample(index, batch_size)
        loss = 0
        for i in range(batch_size):
            user = user_to_idx[train_data[sample[i], 0]]
            item = item_to_idx[train_data[sample[i], 1]]
            rui = train_data[sample[i], 2]

            eui = rui - sigmoid(P[user, :].dot(Q[:, item]))
            loss += eui
            for f in range(F):
                P[user, f] += alpha * (eui * Q[f, item] - lam_bda * P[user, f])
                Q[f, item] += alpha * (eui * P[user, f] - lam_bda * Q[f, item])

        alpha *= 0.9
        print(epoch + 1, loss / batch_size)

In [6]:
def get_hot():
    animes = pd.read_csv("~/Data/anime.csv")
    animes = animes[animes["anime_id"].isin(idx_to_item)].loc[:, ["anime_id", "rating", "members"]].fillna(0)

    scalar = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    animes["rating_norm"] = animes[["rating"]].apply(scalar)
    animes["members_norm"] = animes[["members"]].apply(scalar)
    animes["weight"] = 0.6 * animes["rating_norm"] + 0.4 * animes["members_norm"]
    animes = animes.sort_values(by = "weight", ascending = False)
    
    return animes.anime_id.tolist()

In [7]:
def negative_sample():
    sample_num = 100
    hot_items, neg_sample = get_hot(), []
    data = pd.read_csv("~/Data/clean_rating4.csv")
    grouped = data.groupby(["user_id"])
    for gp in grouped.groups:
        p, num = grouped.get_group(gp).anime_id.tolist(), 0
        for item in hot_items:
            if item not in p: 
                neg_sample.append([gp, item, 0])
                num += 1
            if num == sample_num:
                break

    return np.array(neg_sample)

In [8]:
data[:, -1] = np.array([1 for _ in range(data.shape[0])])
data = np.concatenate((data, negative_sample()))
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(data, test_size = 0.3)
train_ratio = 0.9
train_data = data[:int(len(data) * train_ratio)]
test_data = data[int(len(data) * train_ratio):]
print(train_data.shape, test_data.shape)

(2411043, 3) (267894, 3)


In [9]:
train(train_data, alpha, lam_bda)

1 0.25555402224789586
2 0.11768047158088206
3 0.06988390356009232
4 0.0498546014280664
5 0.03879834465430846
6 0.03167170343577947
7 0.027983559424568778
8 0.024431335325246713
9 0.02232596384266356
10 0.02123481204885261
11 0.02002578105046055
12 0.019234978376210916
13 0.01869379106775868
14 0.019169978291203798
15 0.01817782463717157
16 0.018633872222880352
17 0.018474578669134732
18 0.018496947476068393
19 0.018218864660697325
20 0.018460623520009677
21 0.018578304382262517
22 0.01836879926304485
23 0.01828924723231968
24 0.01841833640908466
25 0.018320572018831725
26 0.018399330378176468
27 0.018195561897799313
28 0.018011473451270743
29 0.017993123198943622
30 0.017682960586546777


In [11]:
test_users = list(set(test_data[:, 0].tolist()))
df = pd.read_csv("~/Data/clean_rating4.csv").drop(["id"], axis = 1)
user_dict, final, precision = {}, [], []
recall, f1 = [], []
predict = P.dot(Q)
for k in range(10):
    for user in test_users:
        user_idx = user_to_idx[user]
        pred = predict[user_idx, :]
        dic = {yui: idx for idx, yui in enumerate(pred)}
        topk = [i[1] for i in sorted(dic.items(), key = lambda x: x[0], reverse = True)[1:6]]
        target = [data[1] for data in df[df.user_id == user].values if data[2] > k]
        
#         print(topk, target)
        both = list(set(topk) & set(target))
        if not both:
            f1.append(0)
            continue
        precision.append(len(both) / len(topk))
        recall.append(len(both) / len(target))
        f1.append((2 * recall[-1] * precision[-1]) / (recall[-1] + precision[-1]))

    print(np.mean(f1))
    final.append(np.mean(precision))

0.0006414840402932892
0.0006424496614325545
0.0006432293297745433
0.0006449825033630561
0.0006459255643350805
0.0006468108600326396
0.0006446370329482179
0.0006388234745975467
0.0006284423305507102
0.0006109836720050726


In [26]:
print(final)

[0.04088397790055249, 0.04077348066298343, 0.0407182320441989, 0.04055248618784531, 0.04022099447513812, 0.03968692449355433, 0.038587213891081294, 0.036505524861878455, 0.03378759975445058, 0.03087292817679558]


In [2]:
a, b, c = 0, 0, 0

print(a, b, c)

SyntaxError: illegal expression for augmented assignment (<ipython-input-2-b5e5c79f0a1a>, line 2)