In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from polara.datasets.movielens import get_movielens_data

In [2]:
split_seed, rand_seed = 0, 99

# Data

In [3]:
#Load row Data file

DATA_NAME = 'ml-1m'
DATA_FILE = 'D:/datasets/recsys/movielens/{}.zip'.format(DATA_NAME)

ml_data = get_movielens_data(local_file=DATA_FILE, get_genres=False)

In [4]:
# encode users and items to [0, N] interval with no missing indices
useridx, all_users = pd.factorize(ml_data.userid)
itemidx, all_items = pd.factorize(ml_data.movieid)

In [5]:
all_ratings = csr_matrix( # for efficient storage and some computations
    (
        ml_data.rating.values,
        (useridx, itemidx)
    )
)
all_ratings

<6040x3706 sparse matrix of type '<class 'numpy.int64'>'
	with 1000209 stored elements in Compressed Sparse Row format>

## Train-test split

In [6]:
def test_split(matrix, seed=None):
    '''
    Uses CSR format to efficiently access non-zero elements.
    Can be easily wrapped by numba jit with minor changes.
    '''
    test_items = []
    indptr = matrix.indptr
    indices = matrix.indices
    data = matrix.data
    np.random.seed(seed)
    for i in range(len(indptr)-1): # for every user i
        head = indptr[i]
        tail = indptr[i+1]
        vals = data[head:tail] # user ratings
        pos_max, = np.where(vals == vals.max())
        top_items = indices[head + pos_max]
        test_items.append(np.random.choice(top_items))
    return np.array(test_items)

In [7]:
test_items = test_split(all_ratings, seed=split_seed)

In [8]:
train_matrix = all_ratings.copy() # avoid mutating original data
train_matrix[np.arange(len(test_items)), test_items] = 0 # exclude test items
train_matrix.eliminate_zeros()
train_matrix = (train_matrix > 0).astype('f8') # make data implicit

# Model

In [9]:
rank = 50
_, s, vh = svds(train_matrix, k=rank, return_singular_vectors='vh')

In [10]:
s.shape, vh.shape

((50,), (50, 3706))

In [11]:
s = s[::-1] # sort in decreasing order of singular values
item_factors = vh[::-1, :].T

## Evaluation

In [12]:
def sample_unseen(pool_size, sample_size, exclude):
    '''Efficient sampling from a range with exclusion'''
    assert (pool_size-len(exclude)) >= sample_size 
    src = np.random.rand(pool_size)
    np.put(src, exclude, -1) # will never get to the top
    return np.argpartition(src, -sample_size)[-sample_size:]

def topk_idx(arr, topk, unsorted=False):
    'Select top-k elements. Sort for raniking metrics.'
    top_unsorted = np.argpartition(arr, -topk)[-topk:]
    if unsorted:
        return top_unsorted
    return top_unsorted[np.argsort(-arr[top_unsorted])]


def evaluate(observations, holdout, item_factors, rand_size=999, topk=10, seed=None):
    '''
    Calculate Hit-Rate@topk with randomly sampled unseen items.
    For further speedups can be wrapped by numba jit with minor changes.
    '''
    n_users, n_items = train_matrix.shape
    user_factors = observations.dot(item_factors)
    indptr = observations.indptr
    indices = observations.indices
    
    hr = 0
    arhr = 0
    for i in range(len(indptr)-1):
        head = indptr[i]
        tail = indptr[i+1]
        
        seen_items = np.r_[holdout[i], indices[head:tail]]
        rand_items = sample_unseen(n_items, rand_size, seen_items)
        
        holdout_prediction = item_factors[holdout[i], :] @ user_factors[i, :]
        random_predictions = item_factors[rand_items, :] @ user_factors[i, :]
        merged_predictions = np.r_[holdout_prediction, random_predictions] # test item is first
        
        top_recs = topk_idx(merged_predictions, topk)
        recs_pos, = np.where(top_recs == 0) # holdout item has index 0 (it was the first)
        if len(recs_pos): # array with a single element
            hr += 1
            arhr += 1. / (recs_pos[0]+1) # ranking starts from 1
    
    hr /= len(holdout)
    arhr /= len(holdout)
    return hr, arhr

In [13]:
hr_puresvd, arhr_puresvd = evaluate(train_matrix, test_items, item_factors, seed=rand_seed)
print(f"Hit Rate PureSVD({rank}):\nHR: {hr_puresvd} ARHR: {arhr_puresvd}")

Hit Rate PureSVD(50):
HR: 0.5362582781456954 ARHR: 0.2862381083780081


# Appendix

## Verification tests

In [14]:
assert train_matrix.nnz + len(test_items) == all_ratings.nnz

In [15]:
# check there's only 1 item per user
assert len(test_items) == all_ratings.shape[0]
 # verify max rating of the test items
assert (
    ml_data
    .groupby('userid')
    .apply(lambda x:
           x.loc[ # select item from test and its rating
               x.movieid == all_items[test_items[all_users.get_loc(x.name)]],
               'rating'
           ] >= x.rating.max() # compare with max user rating
          )
    .all()
)

In [16]:
# check test items are not present in train
assert all([train_matrix[i, test_items[i]] == 0 for i in range(train_matrix.shape[0])])

In [17]:
# verify sampler function
unobs = np.random.choice(1000, 500, replace=False)
assert not np.in1d(sample_unseen(1000, 500, unobs), unobs).any()

## Profiling report

In [18]:
%load_ext line_profiler

In [19]:
%lprun -f evaluate evaluate(train_matrix, test_items, item_factors, seed=rand_seed)

Timer unit: 1e-07 s

Total time: 2.33417 s
File: <ipython-input-12-df93d0691969>
Function: evaluate at line 16

Line #      Hits         Time  Per Hit   % Time  Line Contents
    16                                           def evaluate(observations, holdout, item_factors, rand_size=999, topk=10, seed=None):
    17                                               '''
    18                                               Calculate Hit-Rate@topk with randomly sampled unseen items.
    19                                               For further speedups can be wrapped by numba jit with minor changes.
    20                                               '''
    21         1        101.0    101.0      0.0      n_users, n_items = train_matrix.shape
    22         1     296908.0 296908.0      1.3      user_factors = observations.dot(item_factors)
    23         1         29.0     29.0      0.0      indptr = observations.indptr
    24         1          7.0      7.0      0.0      indices = observ