In [1]:
import pandas as pd
import numpy as np

from polara.datasets.movielens import get_movielens_data

In [2]:
# Load the raw MovieLens data file

DATA_NAME = 'ml-1m'
# NOTE(review): absolute, machine-specific path -- point it at the local copy of the archive.
DATA_FILE = 'D:/datasets/recsys/movielens/{}.zip'.format(DATA_NAME)

# Later cells rely on the `userid`, `movieid` and `rating` columns of this table.
ml_data = get_movielens_data(local_file=DATA_FILE)

In [3]:
# Reshape the long (userid, movieid, rating) records into a table
# with one row per user and one column per movie.
rating_matrix = ml_data.pivot(index='userid', columns='movieid', values='rating')
# User/movie pairs without a rating come out of the pivot as NaN; store them as 0.0.
rating_matrix = rating_matrix.fillna(0.0)
rating_matrix.head()

movieid,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def train_test_split(data, seed=None):
    """Hold out one top-rated item per user, modifying `data` in place.

    For every row (user) of `data`, one column (item) is drawn uniformly at
    random among the columns that hold the row's maximum value. The drawn
    entry is overwritten with 0.0 in `data` and its column label is recorded.

    Parameters
    ----------
    data : pandas.DataFrame
        Rating matrix with users as rows and items as columns.
        It is MODIFIED IN PLACE: every held-out entry is set to 0.0.
    seed : int or None, optional
        Seed for the random draws. ``None`` gives a non-reproducible split.

    Returns
    -------
    numpy.ndarray
        Column label of the held-out item of each row, in the order of
        ``data.index``.

    Notes
    -----
    A row whose maximum is 0.0 (no positive values) still yields a test item:
    one of its zero entries is drawn.
    """
    # A private generator produces the same draws as `np.random.seed(seed)`
    # followed by `np.random.choice`, but does not reset the global NumPy RNG
    # behind the caller's back.
    rng = np.random.RandomState(seed)
    test = []
    for user in data.index:
        user_ratings = data.loc[user, :]
        # candidates are all items tied for the user's highest value
        top_rated = user_ratings[user_ratings == user_ratings.max()]
        test_item = rng.choice(top_rated.index.values)
        test.append(test_item)
        data.loc[user, test_item] = 0.0  # remove the held-out entry from the training data
    return np.array(test)

In [5]:
# Hold out one item per user; note that rating_matrix itself is modified by this call
# (the held-out entries are zeroed in place).
test_items = train_test_split(data=rating_matrix, seed=0)

In [6]:
# Make the data implicit: every positive entry is overwritten with 1.0 (in place);
# entries equal to 0.0 are left as they are.
rating_matrix[rating_matrix > 0.0] = 1.0 # make data implicit

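Inefficient way of computing a truncated SVD (it computes the full decomposition of the dense matrix, from which only the leading components would then be kept):
```python
u, s, vh = np.linalg.svd(rating_matrix.values, full_matrices=True)
```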

More efficient SVD computation: only the top `rank` singular values/vectors of the sparse matrix are computed.

In [7]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [8]:
rank = 50  # number of singular values/vectors to compute
# Factorize the sparse form of the rating matrix; only the right singular
# vectors (vh) are requested, so the first returned element is discarded.
sparse_ratings = csr_matrix(rating_matrix.values)
_, s, vh = svds(sparse_ratings, k=rank, return_singular_vectors='vh')

In [9]:
# Sanity check of the factorization: `rank` singular values and `rank` rows in vh,
# with one vh column per column of rating_matrix.
s.shape, vh.shape

((50,), (50, 3706))

In [10]:
# Transpose vh so that each row holds the `rank` factors of one rating_matrix column (one movie).
# NOTE(review): "iten" looks like a typo for "item"; the name is kept because later cells reference it.
iten_factors = vh.T

In [11]:
# Multiply left to right: projecting the ratings onto the factors first keeps the
# intermediate result small (one row per user, `rank` columns).
predictions = rating_matrix.dot(iten_factors).dot(iten_factors.T)
# NOTE: the result is still a dense users-by-movies table, which is wasteful to keep in memory.

In [12]:
# Label the score columns with the movie ids of rating_matrix, so that scores
# can be selected by movieid (hit_rate below indexes them by label).
predictions.columns = rating_matrix.columns

In [13]:
def hit_rate(predicted_scores, rating_matrix, test_vec, topn=10, rand_size=999, seed=None):
    """Fraction of users whose held-out item is ranked in the top-n of a random candidate list.

    For every user, the held-out test item is scored together with `rand_size`
    randomly drawn items the user has not seen; the user counts as a hit when
    the test item is among the `topn` highest-scored candidates.

    Parameters
    ----------
    predicted_scores : pandas.DataFrame
        Predicted scores, indexed by the same user labels (rows) and item
        labels (columns) as `rating_matrix`.
    rating_matrix : pandas.DataFrame
        Training interactions; an entry below 1.0 marks an unseen item.
        The test item of each user must be unseen here.
    test_vec : array-like
        Held-out item label of each user, aligned by position with
        ``rating_matrix.index``.
    topn : int, optional
        Length of the recommendation list.
    rand_size : int, optional
        Number of random unseen items drawn per user.
    seed : int or None, optional
        Seed for the random draws; ``None`` gives non-reproducible sampling.

    Returns
    -------
    float
        Number of hits divided by the number of evaluated users
        (0.0 when no user is evaluated).
    """
    test_vec = np.asarray(test_vec)
    # A private generator gives the same draws as seeding the global NumPy RNG,
    # without resetting it as a side effect.
    rng = np.random.RandomState(seed)
    hits = 0
    n_users = 0
    # Pair every user label with its test item by position instead of assuming
    # that the labels are exactly 1..N.
    for user, test_item in zip(rating_matrix.index, test_vec):
        is_unseen = rating_matrix.loc[user, :] < 1.0
        unseen_scores = predicted_scores.loc[user, is_unseen]

        # the test item must not be drawn again as a random competitor
        unseen_clean = unseen_scores[unseen_scores.index != test_item]
        combined_items = np.r_[  # combine test item with random sample
            test_item,  # will therefore have position 0 among the candidates
            # NOTE: drawn with replacement, so the sample may contain duplicates
            rng.choice(unseen_clean.index.values, rand_size)
        ]

        candidates = unseen_scores.loc[combined_items]
        # positions of the topn largest scores (unordered)
        top_idx = np.argpartition(candidates.values, -topn)[-topn:]
        hits += (top_idx == 0).any()  # test item has position 0
        n_users += 1
    return hits / n_users if n_users else 0.0

In [14]:
# Hit rate of the PureSVD recommendations, measured over all users on their held-out test items
hr_puresvd = hit_rate(
    predicted_scores=predictions,
    rating_matrix=rating_matrix,
    test_vec=test_items,
    seed=42,
)
print(f"Hit Rate PureSVD({rank}): {hr_puresvd}")

Hit Rate PureSVD(50): 0.5408940397350993
