In [1]:
import time, sys
import warnings

import pandas as pd
import numpy as np
from numpy.linalg import svd

warnings.filterwarnings("ignore")

from dataprep import split_holdout, sample_unseen_interactions

In [2]:
def check_train_test(train, test, itemid='itemid'):
    """Move test rows whose items never occur in `train` into `train`.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Interaction tables; both must contain the `itemid` column.
    itemid : str, default 'itemid'
        Name of the item identifier column.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        `train` extended with the test rows for items it did not contain,
        and `test` without those rows. When every test item already occurs
        in `train`, the inputs are returned unchanged, so the result can
        always be unpacked into two frames. The input frames are never
        modified.
    """
    # Items present in test but absent from train (sorted, unique).
    unseen_items = np.setdiff1d(test[itemid], train[itemid]).tolist()
    print(unseen_items)
    if not unseen_items:
        return train, test

    is_unseen = test[itemid].isin(unseen_items)
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    train = pd.concat([train, test[is_unseen]])
    test = test[~is_unseen]
    return train, test

#GOOD ONE
def HR(rec_mtx, holdout_unseen, holdout, userid='userid', itemid='itemid', arhr=False, topN = 10):
    """Compute HR@topN (and optionally ARHR@topN) over sampled candidates.

    For every row of `holdout`, the held-out item is ranked by its score in
    `rec_mtx` against that user's candidate items from `holdout_unseen`.
    A hit is counted when the held-out item is among the `topN` best-scored
    candidates.

    Parameters
    ----------
    rec_mtx : pandas.DataFrame
        Predicted scores, rows labelled by user id and columns by item id.
    holdout_unseen : mapping or pandas.Series
        For each user id, a sequence of candidate item ids to rank against.
        It is only read; the caller's object is left untouched.
    holdout : pandas.DataFrame
        One row per evaluated user, with the `userid` and `itemid` columns.
    userid, itemid : str
        Column names in `holdout`.
    arhr : bool, default False
        Also accumulate the reciprocal rank of every hit.
    topN : int, default 10
        Length of the recommendation list.

    Returns
    -------
    (hr, arhr) : tuple of float
        Hit rate and average reciprocal hit rank, both averaged over all
        holdout rows. The second value is 0.0 when `arhr` is False. Both
        are 0.0 for an empty `holdout`.
    """
    n_rows = holdout.shape[0]
    if n_rows == 0:
        return 0.0, 0.0

    hits = 0.0
    reciprocal_ranks = 0.0
    # Pair every user with their held-out item directly from the columns, so
    # the result does not depend on how `holdout` happens to be indexed.
    for user, target in zip(holdout[userid].values, holdout[itemid].values):
        # Local candidate list: the held-out item joins the sampled items
        # without being written back into `holdout_unseen`.
        candidates = np.append(holdout_unseen[user], target)
        # Label slice keeps the lookup tolerant of users without a score row.
        scores = rec_mtx.loc[user:user, candidates]
        if scores.empty:
            continue  # no scores for this user: counted as a miss
        ranked = scores.iloc[0].sort_values(ascending=False).index[:topN]
        if target in ranked:
            hits += 1
            if arhr:
                # 1-based position of the held-out item; valid even when the
                # user has fewer than topN candidates.
                position = np.flatnonzero(ranked == target)[0] + 1
                reciprocal_ranks += 1 / position

    return hits / n_rows, reciprocal_ranks / n_rows

In [3]:
# randomization control
seed = 0  # seeds the RandomState handed to the train/holdout split
holdout_seed = 42 # to sample unseen items for holdout
# evaluation settings
target_metric = 'hr'

In [4]:
# Load the full interaction log from a gzip-compressed CSV.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file exists; consider a configurable data directory.
full_data = pd.read_csv('/home/albert/Recommendations/yahoo_data_full.gz')

In [5]:
full_data.head(5)

Unnamed: 0,userid,itemid,is_holdout
0,346,1,False
1,385,1,False
2,517,1,False
3,538,1,False
4,651,1,False


In [6]:
full_data.shape

(404745, 3)

In [7]:
# Dedicated generator seeded with `seed`, passed to the splitter as its random state.
rs = np.random.RandomState(seed)

# Split the log into a training part and a holdout part.
# NOTE(review): presumably one held-out interaction per user, chosen via the
# `is_holdout` column — confirm against dataprep.split_holdout.
train, holdout = split_holdout(  # test
    full_data,
    sample_max_rated = True,
    feedback = 'is_holdout',
    random_state = rs
)
# Label holdout rows by user id (the `userid` column itself is kept), so a
# user's held-out row can be looked up by that label.
holdout.index = holdout.userid

In [8]:
# Sanity check of the split: no training row carries the holdout flag,
# and every holdout row does.
assert not train.is_holdout.any()
assert holdout.is_holdout.all()

In [9]:
# Per-user candidate item ids, drawn from the items that occur in train using a
# fixed seed; the displayed result is a Series of item-id lists indexed by userid.
# NOTE(review): presumably these are items the user has not interacted with —
# confirm against dataprep.sample_unseen_interactions.
holdout_unseen = sample_unseen_interactions(
    full_data,
    train.itemid.unique(), 
    seed=holdout_seed)
holdout_unseen.head()

userid
346    [146, 895, 303, 1293, 624, 1287, 623, 436, 327...
385    [2180, 1352, 502, 2802, 978, 2530, 3149, 999, ...
517    [946, 416, 3254, 2866, 3030, 3256, 1902, 2226,...
538    [1305, 2289, 1916, 2807, 2576, 2778, 1977, 753...
651    [8, 2669, 2827, 2558, 3114, 2320, 428, 1155, 1...
Name: itemid, dtype: object

In [10]:
# Reuse the `is_holdout` column as the implicit-feedback value: every train
# interaction becomes 1.0.
# NOTE(review): this overwrites the flag in place, so the earlier
# `assert not train.is_holdout.any()` fails if it is re-run after this cell.
train.is_holdout = 1.0

In [11]:
# Dense user x item matrix of implicit feedback: 1.0 for every (user, item)
# pair present in train, 0.0 elsewhere.
# The value column is created here instead of being read from `is_holdout`,
# so the matrix is correct no matter what that flag column currently holds.
ratingMtx = (
    train
    .assign(interaction=1.0)
    .pivot(index='userid', columns='itemid', values='interaction')
    .fillna(0.0)
)

In [12]:
ratingMtx.head()

itemid,1,2,3,4,5,6,7,8,9,10,...,3303,3304,3305,3306,3307,3308,3309,3310,3311,3312
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
ratingMtx.shape

(7307, 3312)

In [14]:
# PureSVD: score items by projecting each user's interaction row onto the
# subspace spanned by the leading `rank` right singular vectors.
# full_matrices=False: only `vh` is used below, and the reduced decomposition
# avoids building the square (n_users x n_users) `u` that the full form
# allocates.
u, s, vh = np.linalg.svd(ratingMtx, full_matrices=False)
rank = 25  # number of latent factors kept
topN = 10  # length of the recommendation list used in the evaluation
v = vh.T[:,:rank]  # (n_items, rank) item factors
m = v@v.T  # item-to-item projector onto the rank-`rank` subspace
d = ratingMtx@m  # predicted scores, one row per user
# Relabel the score columns with the item ids of the rating matrix.
d.columns = ratingMtx.columns

### Results:

In [15]:
# Evaluate the PureSVD scores. topN is passed explicitly so the cutoff used by
# HR() is guaranteed to be the one printed in the labels below.
HRPureSVD, ARHRPureSVD = HR(d, holdout_unseen, holdout, arhr=True, topN=topN)
print(f"HR@{topN} PureSVD({rank}): {HRPureSVD};\n\nARHR@{topN} PureSVD({rank}): {ARHRPureSVD};\n" )

HR@10 PureSVD(25): 0.3874367045299028;

ARHR@10 PureSVD(25): 0.1745556554814798;

