In [1]:
from collections import defaultdict
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import myfm
from myfm import RelationBlock
import pandas as pd
from scipy import sparse as sps
# read movielens 100k data.
from myfm.utils.benchmark_data import MovieLens100kDataManager

In [2]:
# Load the full rating matrix and the two arrays that select the train / test
# entries (presumably 0/1 indicator masks of the same shape -- TODO confirm).
# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary objects;
# only keep it for files that come from a trusted source.
ratings = np.load('../data/douban/data/douban.npy', allow_pickle=True)
train_mask = np.load('../data/douban/data/otraining.npy', allow_pickle=True)
val_mask = np.load('../data/douban/data/otest.npy', allow_pickle=True)

# Element-wise product with the rating matrix gives the per-split ratings.
train = train_mask * ratings
val = val_mask * ratings

In [3]:
def _nonzero_to_frame(matrix):
    """Convert the non-zero cells of a 2-D rating matrix to a long-format frame.

    Parameters
    ----------
    matrix : 2-D array supporting ``nonzero()`` and integer-array indexing
        (here: the ``train`` and ``val`` arrays).

    Returns
    -------
    pandas.DataFrame
        One row per non-zero cell, with columns ``user_id`` (row index),
        ``movie_id`` (column index) and ``rating`` (the cell value), in the
        order returned by ``matrix.nonzero()``.
    """
    # nonzero() is evaluated once and the values are gathered with a single
    # fancy-indexing call instead of one Python-level lookup per cell.
    rows, cols = matrix.nonzero()
    return pd.DataFrame({
        'user_id': rows,
        'movie_id': cols,
        # .tolist() hands pandas plain Python scalars, so the column dtype is
        # inferred from the values rather than copied from the matrix dtype.
        'rating': matrix[rows, cols].tolist(),
    })


df_train = _nonzero_to_frame(train)
df_test = _nonzero_to_frame(val)

In [4]:
# Ids seen in the training frame get dense indices starting at 1.
# Index 0 is the fallback returned by the defaultdict for any id that is
# not in the training frame.
known_user_ids = np.unique(df_train.user_id)
known_movie_ids = np.unique(df_train.movie_id)

user_to_index = defaultdict(
    lambda: 0,
    zip(known_user_ids, range(1, len(known_user_ids) + 1)),
)
movie_to_index = defaultdict(
    lambda: 0,
    zip(known_movie_ids, range(1, len(known_movie_ids) + 1)),
)

# +1 accounts for the reserved "unknown" slot at index 0.
USER_ID_SIZE = 1 + len(user_to_index)
MOVIE_ID_SIZE = 1 + len(movie_to_index)

In [5]:
# Side information in the flavor of SVD++:
# a user can additionally be described by "all movies this user rated" and a
# movie by "all users who rated this movie", both taken from the train set.
use_date = False        # add a date feature block
use_iu = True           # add the implicit feature of users (movies they rated)
use_ii = True           # add the implicit feature of items (users who rated them)
use_user_info = False   # add user attribute columns
use_movie_info = False  # add movie attribute columns

# Lookups built from the training interactions only:
#   movie_vs_watched[movie_id] -> list of user ids that rated the movie
#   user_vs_watched[user_id]   -> list of movie ids rated by the user
movie_vs_watched = dict()
user_vs_watched = dict()
for user_id, movie_id in zip(df_train.user_id, df_train.movie_id):
    movie_vs_watched.setdefault(movie_id, []).append(user_id)
    user_vs_watched.setdefault(user_id, []).append(movie_id)

# NOTE(review): categorize_date is not defined in the visible cells, so
# switching use_date on would presumably fail here -- confirm before enabling.
X_date_train = categorize_date(df_train) if use_date else None
X_date_test = categorize_date(df_test) if use_date else None

In [6]:
# Width of every feature group, in the same order the columns are stacked:
# [date] + user block (id, implicit, attributes) + movie block (id, implicit,
# attributes, genres).
# NOTE(review): date_be, user_info_ohe, movie_info_ohe and movie_genres are not
# defined in the visible cells; they are only touched when the matching flag
# is switched on.
feature_group_sizes = []

if use_date:
    feature_group_sizes += [len(date_be.categories_[0])]  # date categories

# --- user side ---
feature_group_sizes += [USER_ID_SIZE]  # one-hot user id
if use_iu:
    feature_group_sizes += [MOVIE_ID_SIZE]  # movies rated by the user
if use_user_info:
    feature_group_sizes += [len(cats) for cats in user_info_ohe.categories_]  # user attributes

# --- movie side ---
feature_group_sizes += [MOVIE_ID_SIZE]  # one-hot movie id
if use_ii:
    feature_group_sizes += [USER_ID_SIZE]  # users who rated the movie
if use_movie_info:
    feature_group_sizes += [len(cats) for cats in movie_info_ohe.categories_]  # movie attributes
    feature_group_sizes += [len(movie_genres)]  # genre columns


In [7]:
# given user/movie ids, add additional infos and return it as sparse
def augment_user_id(user_ids):
    """Build the sparse user-side feature matrix for the given user ids.

    One output row per entry of ``user_ids``. Columns are stacked in this order:

    1. one-hot user id (``USER_ID_SIZE`` columns; ids missing from
       ``user_to_index`` fall into column 0),
    2. if ``use_iu``: one column per movie index, set to
       ``1 / sqrt(max(n, 1))`` for each of the ``n`` movies listed for the
       user in ``user_vs_watched``,
    3. if ``use_user_info``: the output of ``user_info_ohe.transform`` on the
       re-indexed ``user_info`` (both defined outside the visible cells).

    Returns a ``scipy.sparse`` matrix in CSR format.
    """
    n_rows = len(user_ids)

    id_block = sps.lil_matrix((n_rows, USER_ID_SIZE))
    for row, user in enumerate(user_ids):
        id_block[row, user_to_index[user]] = 1
    blocks = [id_block]

    if use_iu:
        implicit_block = sps.lil_matrix((n_rows, MOVIE_ID_SIZE))
        for row, user in enumerate(user_ids):
            rated_movies = user_vs_watched.get(user, [])
            # max(..., 1) avoids a division by zero for users without ratings
            weight = 1 / max(len(rated_movies), 1) ** 0.5
            for movie in rated_movies:
                implicit_block[row, movie_to_index[movie]] = weight
        blocks.append(implicit_block)

    if use_user_info:
        blocks.append(user_info_ohe.transform(user_info.reindex(user_ids)))

    return sps.hstack(blocks, format='csr')

def augment_movie_id(movie_ids):
    """Build the sparse movie-side feature matrix for the given movie ids.

    One output row per entry of ``movie_ids``. Columns are stacked in this order:

    1. one-hot movie id (``MOVIE_ID_SIZE`` columns; ids missing from
       ``movie_to_index`` fall into column 0),
    2. if ``use_ii``: one column per user index, set to
       ``1 / sqrt(max(n, 1))`` for each of the ``n`` users listed for the
       movie in ``movie_vs_watched``,
    3. if ``use_movie_info``: the output of ``movie_info_ohe.transform`` on the
       non-genre columns of ``movie_info``, followed by the raw genre columns
       (``movie_info``, ``movie_info_ohe`` and ``movie_genres`` are defined
       outside the visible cells).

    Returns a ``scipy.sparse`` matrix in CSR format.
    """
    n_rows = len(movie_ids)

    id_block = sps.lil_matrix((n_rows, MOVIE_ID_SIZE))
    for row, movie in enumerate(movie_ids):
        id_block[row, movie_to_index[movie]] = 1
    blocks = [id_block]

    if use_ii:
        implicit_block = sps.lil_matrix((n_rows, USER_ID_SIZE))
        for row, movie in enumerate(movie_ids):
            raters = movie_vs_watched.get(movie, [])
            # max(..., 1) avoids a division by zero for movies without ratings
            weight = 1 / max(len(raters), 1) ** 0.5
            for user in raters:
                implicit_block[row, user_to_index[user]] = weight
        blocks.append(implicit_block)

    if use_movie_info:
        attributes = movie_info.drop(columns=movie_genres).reindex(movie_ids)
        blocks.append(movie_info_ohe.transform(attributes))
        genres = movie_info.reindex(movie_ids)[movie_genres].values
        blocks.append(sps.csr_matrix(genres))

    return sps.hstack(blocks, format='csr')

## Use Relation Block to express data
See [\[Rendle 2013\]](http://www.vldb.org/pvldb/vol6/p337-rendle.pdf) for how the complexity decreases drastically in this case (and in most cases with a bipartite graph structure).

In [8]:
# Build the relation blocks for the train and test frames.
# np.unique(..., return_inverse=True) yields the distinct ids plus, for every
# row of the frame, the position of its id among them; features are therefore
# computed once per distinct id and the inverse array maps rows onto them.
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html
train_blocks, test_blocks = [], []
for source, target in ((df_train, train_blocks), (df_test, test_blocks)):
    # user block first, movie block second
    for column, augment in (('user_id', augment_user_id), ('movie_id', augment_movie_id)):
        unique_ids, row_to_unique = np.unique(source[column], return_inverse=True)
        target.append(RelationBlock(row_to_unique, augment(unique_ids)))

## Regression

In [9]:
# Rank-10 factorization machine regressor fitted on the relation blocks.
# X_date_train / X_date_test are None unless use_date is switched on.
fm = myfm.MyFMRegressor(rank=10)
fm.fit(
    X_date_train,
    df_train.rating.values,
    X_rel=train_blocks,
    X_test=X_date_test,
    X_rel_test=test_blocks,
    y_test=df_test.rating.values,
    n_iter=300,
    n_kept_samples=295,
);

alpha = 2.74 w0 = 3.65  rmse_this: 0.81 mae_this: 0.64: 100%|██████████| 300/300 [00:19<00:00, 15.14it/s]


In [10]:
# Predict the test ratings and clip them to the [1, 5] interval.
test_predictions = np.clip(fm.predict(X_date_test, test_blocks), 1., 5.)

# Root mean squared error against the held-out ratings.
squared_error = (test_predictions - df_test.rating.values) ** 2
rmse = squared_error.mean() ** 0.5

print(f'rmse={rmse:.4f}')

rmse=0.7212


## Ordered Probit Regression

In [23]:
# Ordered probit variant of the rank-10 model.
# The targets are shifted by -1 so that the class labels start at 0.
probit_targets = df_train.rating.values - 1

fm_probit = myfm.MyFMOrderedProbit(rank=10)
fm_probit.fit(
    X_date_train,
    probit_targets,
    X_rel=train_blocks,
    n_iter=300,
    n_kept_samples=295,
);

w0= 0.226519, cutpoint = ['-2.971', '-1.836', '0.045', '1.648'] : 100%|██████████| 300/300 [00:40<00:00,  7.45it/s]


In [24]:
# Per-class probabilities for every test pair (one column per ordinal class).
test_prediction_ordered_prob = fm_probit.predict_proba(X_date_test, test_blocks)

# Expected rating = 1 + sum_k k * P(class k); class 0 corresponds to rating 1.
# The class count is read from the probability matrix instead of being
# hard-coded, so the cell keeps working if the number of rating levels changes.
n_classes = test_prediction_ordered_prob.shape[1]
test_prediction_ordered_mean = 1 + test_prediction_ordered_prob.dot(np.arange(n_classes))

# Both metrics use the same plain ndarray of residuals.
residuals = test_prediction_ordered_mean - df_test.rating.values
rmse = (residuals ** 2).mean() ** 0.5
mae = np.abs(residuals).mean()

# This cell evaluates only the ordered-probit model; the regression model's
# `test_predictions` are intentionally left untouched here.
print('rmse={}, mae={}'.format(rmse, mae))



ValueError: Relation blocks have inconsistent mapper size with case_size