In [1]:
import warnings

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix
from scipy.sparse.linalg import norm

from sklearn.linear_model import ElasticNet
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


from polara.datasets.movielens import get_movielens_data

from SLIM import SLIM, SLIMatrix

from dataprep import split_holdout, sample_unseen_interactions

# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides numerical warnings (e.g. numpy
# divide-by-zero RuntimeWarning); consider filtering specific categories instead.
warnings.filterwarnings("ignore")

In [2]:
# Build the RecWalk transition matrix P, which serves as our final recommendation model

def rec_walk_model(W, R, alp=0.005):
    """Build the RecWalk transition matrix ``P = alp * H + (1 - alp) * M``.

    Nodes are ordered users first, then items, so ``P`` is square with
    ``R.shape[0] + R.shape[1]`` rows and columns.

    * ``M`` is block diagonal: an identity block for the users and, for the
      items, ``W`` divided by its infinity norm with the missing row mass put
      on the diagonal, so that every item row sums to one.
    * ``H`` is the bipartite adjacency ``[[0, R], [R.T, 0]]`` with every
      non-empty row divided by its row sum. Rows without any interaction
      stay all-zero.

    Everything is assembled from sparse blocks, so no dense
    ``(users + items) x (users + items)`` array is ever allocated.

    Parameters
    ----------
    W : scipy sparse matrix, shape (n_items, n_items)
        Item-to-item model weights.
    R : scipy sparse matrix or array-like, shape (n_users, n_items)
        User-item interaction matrix.
    alp : float, default 0.005
        Weight of ``H`` in the mixture; ``M`` receives ``1 - alp``.

    Returns
    -------
    scipy.sparse.csr_matrix
        The transition matrix ``P``.

    Raises
    ------
    ValueError
        If ``W`` is not of shape ``(n_items, n_items)``.
    """
    # Local import keeps the cell self-contained (the notebook only imports
    # csr_matrix / csc_matrix at the top).
    from scipy.sparse import bmat, diags, identity

    W = csr_matrix(W, dtype='float64')
    R = csr_matrix(R, dtype='float64')
    n_users, n_items = R.shape
    if W.shape != (n_items, n_items):
        raise ValueError(
            "W must have shape ({0}, {0}) to match R, got {1}".format(n_items, W.shape)
        )

    # Scale W by its infinity norm (largest absolute row sum). An all-zero W
    # is left untouched instead of being divided by zero.
    w_norm = norm(W, np.inf)
    if w_norm > 0:
        W = W / w_norm
    # Put the remaining probability mass of each row on the diagonal.
    row_mass = np.asarray(W.sum(axis=1)).ravel()
    W = W + diags(1.0 - row_mass)

    # M: users keep their state (identity), items move according to W.
    M = bmat(
        [[identity(n_users, dtype='float64'), None],
         [None, W]],
        format='csr',
    )

    # H: user <-> item bipartite graph, row-normalised.
    H = bmat(
        [[None, R],
         [R.transpose(), None]],
        format='csr',
    )
    degree = np.asarray(H.sum(axis=1)).ravel()
    inv_degree = np.zeros_like(degree)
    connected = degree != 0
    # Only invert non-zero degrees; isolated nodes keep an all-zero row in H.
    inv_degree[connected] = 1.0 / degree[connected]
    H = diags(inv_degree).dot(H)

    return csr_matrix(alp * H + (1 - alp) * M)

### HR function is the same as in PureSVD

In [3]:
def HR(rec_mtx, holdout_unseen, holdout, userid='userid', itemid='itemid', topN = 10):
    """Hit rate at ``topN`` of the held-out items against sampled unseen items.

    For every row of ``holdout`` the held-out item counts as a hit when its
    score is strictly greater than the lowest score among the ``topN``
    best-scored items of ``holdout_unseen[user]``.

    Parameters
    ----------
    rec_mtx : pandas.DataFrame
        Scores, indexed by user id with item ids as columns.
    holdout_unseen : mapping / pandas.Series
        For each user id, the list of candidate item ids to rank against.
    holdout : pandas.DataFrame
        Held-out interactions; must contain the ``userid`` and ``itemid``
        columns. Its index is not used.
    userid, itemid : str
        Names of the user and item columns in ``holdout``.
    topN : int, default 10
        Length of the recommendation list.

    Returns
    -------
    float
        Number of hits divided by the number of rows of ``rec_mtx``
        (note: not by the number of rows of ``holdout``).
    """
    hits = 0.0
    # Walk over (user, held-out item) pairs directly: no per-user query on the
    # whole frame and no dependence on how `holdout` happens to be indexed.
    users = holdout[userid].to_numpy()
    items = holdout[itemid].to_numpy()
    for user, item in zip(users, items):
        unseen_scores = rec_mtx.loc[user, list(holdout_unseen[user])]
        # Lowest score that still makes it into the top-N of the unseen items.
        cutoff = (
            unseen_scores
            .sort_values(ascending=False)
            .iloc[:topN]
            .to_numpy()
            .min()
        )
        if rec_mtx.loc[user, item] > cutoff:
            hits += 1

    return hits / rec_mtx.shape[0]

In [4]:
# randomization control
# `seed` feeds the RandomState used for the train/holdout split.
seed = 0
holdout_seed = 42 # to sample unseen items for holdout
# evaluation settings
# NOTE(review): `target_metric` is not referenced in the visible cells.
target_metric = 'hr'

In [5]:
# Load the interaction log with columns userid, itemid and is_holdout.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot be
# re-run elsewhere without editing it. The 'Unnamed: 0' column in the output
# looks like an index that was written to the file — confirm before dropping it.
full_data = pd.read_csv('/home/albert/Recommendations/yahoo_data_full.gz')
full_data.head(5)

Unnamed: 0,userid,itemid,is_holdout
0,346,1,False
1,385,1,False
2,517,1,False
3,538,1,False
4,651,1,False


In [6]:
# Split the data into a training part and a holdout part; the RandomState is
# created from `seed` so the split can be repeated.
rs = np.random.RandomState(seed)
train, holdout = split_holdout(
    full_data,
    random_state=rs,
    feedback='is_holdout',
    sample_max_rated=True,
)
# Address holdout rows by user id from here on.
holdout.index = holdout['userid']

In [7]:
# The split must be clean: no training row carries the holdout flag,
# and every holdout row does.
assert not train['is_holdout'].any()
assert holdout['is_holdout'].all()

In [8]:
# For each user, a list of candidate item ids drawn from the training catalogue
# (presumably items the user has not interacted with — see dataprep).
holdout_unseen = sample_unseen_interactions(
    full_data, train['itemid'].unique(), seed=holdout_seed
)
holdout_unseen.head()

userid
346    [146, 895, 303, 1293, 624, 1287, 623, 436, 327...
385    [2180, 1352, 502, 2802, 978, 2530, 3149, 999, ...
517    [946, 416, 3254, 2866, 3030, 3256, 1902, 2226,...
538    [1305, 2289, 1916, 2807, 2576, 2778, 1977, 753...
651    [8, 2669, 2827, 2558, 3114, 2320, 428, 1155, 1...
Name: itemid, dtype: object

In [9]:
# Dense binary user x item matrix: 1.0 where a training interaction exists,
# 0.0 elsewhere. The value column is created on a temporary copy so that
# `train.is_holdout` keeps its meaning and the cell can be re-run safely
# (overwriting the flag in place would break the split assertions above).
ratingMtx = (
    train
    .assign(rating=1.0)
    .pivot(index='userid', columns='itemid', values='rating')
    .fillna(0.0)
)
ratingMtx.head()

itemid,1,2,3,4,5,6,7,8,9,10,...,3303,3304,3305,3306,3307,3308,3309,3310,3311,3312
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# (number of users, number of items) of the pivoted interaction matrix
ratingMtx.shape

(7307, 3312)

In [11]:
# Same interactions in CSR form, used by SLIM and by the RecWalk construction.
sparseRating = csr_matrix(ratingMtx.values)

In [12]:
# How dense is the initial rating matrix?
a, b = ratingMtx.shape  # users, items
c = a * b               # total number of cells

print(
    "Users: {};\nItems: {};\nEntries all: {};\nEntries non zero: {};\nDensity of A: {};\n"
    .format(a, b, c, sparseRating.size, sparseRating.size / c)
)

Users: 7307;
Items: 3312;
Entries all: 24200784;
Entries non zero: 397438;
Density of A: 0.016422525815692583;



In [13]:
# Nearest-neighbour settings for SLIM
num = 0.1                          # fraction of the item catalogue
C = int(num * ratingMtx.shape[1])  # passed to SLIM as 'nnbrs'
initHR = 0.0

In [14]:
 #Train Matrix W:
trainmat = SLIMatrix(sparseRating)
# 'nnbrs' is the neighbour count C computed above; l1r / l2r are presumably
# the L1 / L2 regularisation weights — confirm against the SLIM package docs.
params = dict(algo='cd', nthreads=4, l1r=10, l2r=10, nnbrs=C)
model = SLIM()
model.train(params, trainmat)

# Learned item-item weight matrix W
WSlim = model.to_csr()

Learning takes 5.171 secs.


In [15]:
# Density of the rating matrix and of W, plus two sanity checks on W
# (no negative weights, nothing on the diagonal).
# The rating density is computed from sparseRating itself rather than from a
# global defined in another cell, and negatives are counted on the stored
# values of W instead of on two dense copies of it.
print("Density of Rating matrix: {};\n".format(
    sparseRating.size / (sparseRating.shape[0] * sparseRating.shape[1])))
print("Density of W: {};\n".format(WSlim.size / WSlim.shape[0]**2))
print("Number of elements less than 0 in W matrix: {} \n".format(np.count_nonzero(WSlim.data < 0)))
print("Maximal diagonal element of the W matrix: {}\n".format(WSlim.diagonal().max()))

Density of Rating matrix: 0.016422525815692583;

Density of W: 0.004606109535811804;

Number of elements less than 0 in W matrix: 0 

Maximal diagonal element of the W matrix: 0.0



In [16]:
# RecWalk transition matrix P built from the SLIM-based W.
# NOTE: .toarray() materialises the full (users + items) x (users + items)
# matrix as a dense array; the k-step loop multiplies this dense array by itself.
PMtxSlim = rec_walk_model(WSlim, sparseRating).toarray()

### Best result: HR@10 = 0.5461; 
#### k = 5; y1 = 10; y2 = 10; C = 331; alp = 0.005;

In [17]:
# Take additional random-walk steps to capture higher-order interactions between items.
# kMtxSlim starts as P and is multiplied by P once per iteration, so after
# iteration k (0-based) it holds P^(k+2): the first hit rate printed is for 2 steps.
kMtxSlim = PMtxSlim
for k in range(20): 
    kMtxSlim = kMtxSlim@PMtxSlim
        
    # Keep only the user -> item block of the k-step matrix
    # (rows: the first n_users nodes, columns: the nodes after them):
    RecSlim = pd.DataFrame(kMtxSlim[: ratingMtx.shape[0], ratingMtx.shape[0]:], 
                                   index=ratingMtx.index,
                                   columns=ratingMtx.columns,
                                   dtype='float64')
        
    # Hit rate of these scores on the holdout set:
    HRSlim = HR(RecSlim, holdout_unseen, holdout)
    print("Hit Rate RecWalk[M] K-step with W based on SLIM: {};\n".format(HRSlim))
                

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.514027644724237;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5309976734638019;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5390721226221431;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.546188586287122;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5445463254413576;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5455043109347202;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.544683180511838;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5411249486793486;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5394826878335842;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5349664705077323;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5286711372656356;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5237443547283427;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.518543862050089;

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.5129328041603941;

Hit Rate RecWalk[M] K-st