In [1]:
import numpy as np
import pandas as pd
import math
import random

In [2]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same initial
# factors and the same mini-batches (train() samples with the `random` module).
SEED = 42

data = pd.read_csv("~/Data/clean_rating3.csv")

# Rows with rating > 0 are the ones used as interactions; the filter is
# evaluated once here and reused for the item table, the rating list and the
# array that replaces `data` below.
rated = data[data.rating > 0]

# Raw id <-> dense index tables. Users are taken from every row of the file,
# items only from the rated rows.
idx_to_user = list(set(data.user_id))
idx_to_item = list(set(rated.anime_id))

user_to_idx = {user: idx for idx, user in enumerate(idx_to_user)}
item_to_idx = {item: idx for idx, item in enumerate(idx_to_item)}

ratings = rated.rating.values.tolist()
# From here on `data` is a plain ndarray holding only the rated rows.
# NOTE(review): train() reads columns 0/1/2 as user/item/label, which assumes
# the CSV column order is user_id, anime_id, rating -- confirm.
data = rated.values

# F: number of latent factors (P is users x F, Q is F x items).
# alpha: initial SGD step size, lam_bda: L2 penalty coefficient,
# batch_size: rows sampled per epoch, k: upper bound of the initial factors.
F, alpha, lam_bda, batch_size = 100, 0.02, 0.01, 512000
num_epochs, n, k = 30, len(ratings), 1 / math.sqrt(F)

random.seed(SEED)
rng = np.random.default_rng(SEED)

# Factors start uniformly in [0, k); one vectorised draw per matrix replaces
# the nested Python loops over random.random().
P = rng.random((len(idx_to_user), F)) * k
Q = rng.random((F, len(idx_to_item))) * k

In [3]:
def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of a scalar ``x``.

    The two branches are algebraically identical.  Each one only ever passes
    a non-positive argument to ``math.exp``, so a large negative ``x``
    underflows to 0.0 instead of raising ``OverflowError``.
    """
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    z = math.exp(x)
    return z / (1.0 + z)

In [4]:
def train(train_data, alpha, lam_bda):
    """Fit the module-level factor matrices ``P`` and ``Q`` in place with SGD.

    Each epoch draws a fresh batch of rows without replacement and, for every
    row, takes one gradient step on the L2-regularised log loss of
    ``sigmoid(P[user] . Q[:, item])`` against the row's label.  After each
    epoch the step size is multiplied by 0.9 and the epoch number and the
    mean log loss of the batch are printed.

    Reads the module-level ``num_epochs``, ``batch_size``, ``user_to_idx``,
    ``item_to_idx``, ``P`` and ``Q``.

    Parameters
    ----------
    train_data : 2-D array
        Column 0 is a raw user id, column 1 a raw item id, column 2 the label.
        The reported log loss assumes labels are 0 or 1 -- TODO confirm
        against the cell that builds the training set.
    alpha : float
        Initial step size.
    lam_bda : float
        L2 penalty coefficient.

    Raises
    ------
    KeyError
        If a row holds a user or item id missing from the lookup tables.
    """
    n_rows = train_data.shape[0]
    # random.sample raises ValueError when asked for more rows than exist.
    size = min(batch_size, n_rows)
    eps = 1e-12  # keeps math.log away from 0 when the prediction saturates

    for epoch in range(num_epochs):
        batch = random.sample(range(n_rows), size)
        loss = 0.0
        for row in batch:
            user = user_to_idx[train_data[row, 0]]
            item = item_to_idx[train_data[row, 1]]
            rui = train_data[row, 2]

            # Snapshot both factor vectors so each update below uses the
            # pre-step value of the other one.
            p_u = P[user, :].copy()
            q_i = Q[:, item].copy()

            pred = sigmoid(p_u.dot(q_i))
            eui = rui - pred

            # Accumulate the log loss; summing the signed residual instead
            # would let positive and negative errors cancel out.
            clipped = min(max(pred, eps), 1.0 - eps)
            loss -= rui * math.log(clipped) + (1 - rui) * math.log(1.0 - clipped)

            # One vectorised step over all factors.
            P[user, :] += alpha * (eui * q_i - lam_bda * p_u)
            Q[:, item] += alpha * (eui * p_u - lam_bda * q_i)

        alpha *= 0.9
        print(epoch + 1, loss / max(size, 1))

In [9]:
def get_hot():
    """Return the anime ids known to ``idx_to_item``, highest weight first.

    Reads ``~/Data/anime.csv``, keeps the rows whose ``anime_id`` appears in
    the module-level ``idx_to_item``, replaces missing values with 0, min-max
    scales ``rating`` and ``members``, and ranks the rows by
    ``0.6 * rating_norm + 0.4 * members_norm`` in descending order.
    """
    def _min_max(column):
        # Rescale a column linearly so that its minimum maps to 0 and its maximum to 1.
        lowest = np.min(column)
        return (column - lowest) / (np.max(column) - lowest)

    catalogue = pd.read_csv("~/Data/anime.csv")
    is_known = catalogue["anime_id"].isin(idx_to_item)
    hot = catalogue.loc[is_known, ["anime_id", "rating", "members"]].fillna(0)

    hot["rating_norm"] = _min_max(hot["rating"])
    hot["members_norm"] = _min_max(hot["members"])
    hot["weight"] = 0.6 * hot["rating_norm"] + 0.4 * hot["members_norm"]

    ranked = hot.sort_values(by="weight", ascending=False)
    return ranked["anime_id"].tolist()

In [7]:
def negative_sample(sample_num=100):
    """Build label-0 rows from hot items each user has no row for.

    For every user in ``~/Data/clean_rating3.csv`` the items returned by
    ``get_hot()`` are walked in order, and the first ``sample_num`` items that
    do not appear among that user's rows in the file are emitted as
    ``[user_id, anime_id, 0]``.

    Parameters
    ----------
    sample_num : int, default 100
        Maximum number of negative rows generated per user.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number_of_negatives, 3)``; the shape is ``(0, 3)``
        when nothing was sampled, so it can still be concatenated with other
        three-column arrays.
    """
    hot_items = get_hot()
    data = pd.read_csv("~/Data/clean_rating3.csv")

    neg_sample = []
    # Grouping by the column name (not a one-element list) keeps each group
    # label a plain user id; newer pandas versions hand back 1-tuples for
    # list keys.  Iterating the groupby also avoids one get_group() lookup
    # per user.
    for user, seen_ids in data.groupby("user_id")["anime_id"]:
        seen = set(seen_ids)  # constant-time membership instead of a list scan
        picked = 0
        for item in hot_items:
            if picked >= sample_num:
                break
            if item not in seen:
                neg_sample.append([user, item, 0])
                picked += 1

    return np.array(neg_sample).reshape(-1, 3)

In [10]:
from sklearn.model_selection import train_test_split

# Every row currently in `data` becomes a positive example: its last column is
# overwritten with the label 1 (scalar broadcast, no temporary list needed).
data[:, -1] = 1
# Append the label-0 rows produced by negative_sample().
# NOTE: this rebinds `data`, so re-running the cell appends the negatives a
# second time; re-run the loading cell first to start from a clean array.
data = np.concatenate((data, negative_sample()))

# A fixed random_state makes the 70/30 split identical on every run.
train_data, test_data = train_test_split(data, test_size = 0.3, random_state = 42)
print(train_data.shape, test_data.shape)

(1875255, 3) (803682, 3)


In [8]:
# Run the SGD loop on the training split; train() updates the module-level
# P and Q in place and prints one progress line per epoch.
train(train_data, alpha, lam_bda)

(1875255, 3) (803682, 3)


KeyboardInterrupt: 

In [5]:
# Column 0 of the held-out split holds the raw user ids.
test_users = test_data[:, 0].tolist()
# Show the size and a short preview only; printing the whole list would flood
# the cell output with one entry per test row.
print(len(test_users), test_users[:10])

NameError: name 'test_data' is not defined