In [1]:
import time, sys
import warnings

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix
from scipy.sparse.linalg import norm

from sklearn.linear_model import ElasticNet
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib as mpl
import matplotlib.pyplot as plt


from polara.datasets.movielens import get_movielens_data
from polara.tools.preprocessing import filter_sessions_by_length
from polara import RecommenderData

from SLIM import SLIM, SLIMatrix

warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
#Load row Data file

DATA_NAME = 'ml-1m'
DATA_FILE = '/home/albert/Recommendations/{}.zip'.format(DATA_NAME)

ml_data = get_movielens_data(local_file=DATA_FILE, get_genres=False)

In [3]:
#Make rating matrix from 3 columns of data

ratingMtx = ml_data.pivot(index='userid', columns='movieid', values='rating').fillna(0.0)

#Change non-zero values on 1.0 as a sign of interaction
ratingMtx[ratingMtx[:] > 0.0] = 1.0


In [4]:
ratingMtx.head()

movieid,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#Eliminate value from data and put the index of it into test

def train_test_split(data):
    test = []   
    for i in data.index:
        item = data.loc[i,:].sort_values(ascending=False)
        item = item[item > 0]
        c = np.random.choice(item.index)
        test.append(c)
        data.loc[i, c:c] = 0.0        
    test = np.array(test) 
    return test

In [6]:
#Make train, validation, test sets

#valid = train_test_split(ratingMtx)
test = train_test_split(ratingMtx)

In [7]:
#Make data sparse

sparseRating = csr_matrix(ratingMtx.to_numpy())
sparseRating.eliminate_zeros()

In [8]:
#What is density of initial rating?

a = ratingMtx.index.size
b = ratingMtx.columns.size
c = a * b

print("Users: {};\nItems: {};\nEntries all: {};\nEntries non zero: {};\nDensity of A: {};\n".\
      format(a, b, c, sparseRating.size, sparseRating.size / c))

Users: 6040;
Items: 3706;
Entries all: 22384240;
Entries non zero: 994169;
Density of A: 0.04441379291858915;



In [9]:
#Class for SLIM (Nearest Neighbour) for constructing W matrix of distance between items
#Realization is from 

class SlimElasticNetRecommender():

    def __init__(self, URM_train):
        
        self.URM_train = URM_train

    def fit(self, l1_ratio=0.1, alpha=1.0, positive_only=True, topK=100,
            verbose = True):

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK

        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=1000,
                                tol=1e-4)

        self.URM_train = csc_matrix(self.URM_train) 
        
        n_items = self.URM_train.shape[1]


        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0


        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = self.URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = self.URM_train.indptr[currentItem]
            end_pos = self.URM_train.indptr[currentItem + 1]

            current_item_data_backup = self.URM_train.data[start_pos: end_pos].copy()
            self.URM_train.data[start_pos: end_pos] = 0.0



            # fit one ElasticNet model per column        
                       
            self.model.fit(self.URM_train, y)
            

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data


            local_topK = min(len(nonzero_model_coef_value)-1, self.topK)

            relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]


            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))


                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1


            # finally, replace the original values of the j-th column
            self.URM_train.data[start_pos:end_pos] = current_item_data_backup


            if verbose and (time.time() - start_time_printBatch > 300 or currentItem == n_items - 1):
                print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Items per second: {:.0f}".format(
                    currentItem + 1,
                    100.0 * float(currentItem + 1)/n_items,
                    (time.time() - start_time)/60,
                    float(currentItem)/(time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()


        # generate the sparse weight matrix
        self.W_sparse = csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                       shape=(n_items, n_items), dtype=np.float32)
                                       
                                       

In [10]:
trainmat = SLIMatrix(sparseRating)
params = {'algo':'cd',
          'nthreads':4,
          'l1r':1.0,
          'l2r':1.0,
         }
model = SLIM()
model.train(params, trainmat)

Learning takes 40.782 secs.


In [12]:
#Make W matrix

R = SlimElasticNetRecommender(sparseRating)
R.fit()
WSlim = R.W_sparse

Processed 3706 ( 100.00% ) in 0.74 minutes. Items per second: 83


In [13]:
#Density of distance matrix W, diagonal and other elements

print("MaurizioFD SLIM Matrix:\n")
print("Density of Rating matrix: {};\n".format(sparseRating.size / c))
print("Density of W: {};\n".format(WSlim.size / WSlim.shape[0]**2))
print("Number of elements less than 0 in W matrix: {} \n".format(WSlim.toarray()[WSlim.toarray()<0].size))
print("Maximal diagonal element of the W matrix: {}\n".format(WSlim.diagonal().max()))

MaurizioFD SLIM Matrix:

Density of Rating matrix: 0.04441379291858915;

Density of W: 0.0002777689597155646;

Number of elements less than 0 in W matrix: 0 

Maximal diagonal element of the W matrix: 0.0



In [14]:
WSlim = model.to_csr()

#Density of distance matrix W, diagonal and other elements
print("Karypis SLIM Matrix:\n")
print("Density of Rating matrix: {};\n".format(sparseRating.size / c))
print("Density of W: {};\n".format(WSlim.size / WSlim.shape[0]**2))
print("Number of elements less than 0 in W matrix: {} \n".format(WSlim.toarray()[WSlim.toarray()<0].size))
print("Maximal diagonal element of the W matrix: {}\n".format(WSlim.diagonal().max()))

Karypis SLIM Matrix:

Density of Rating matrix: 0.04441379291858915;

Density of W: 0.022413224685746105;

Number of elements less than 0 in W matrix: 0 

Maximal diagonal element of the W matrix: 0.0



In [15]:
#Make P matrix that our final recommendation model

def rec_walk_model(W, R, alp=0.001):
    normW = norm(W, np.inf) # Count infinite matrix norm of our Model
    ones = np.ones(W.shape[0])
    W = W / normW
    diagVec = ones - W.dot(ones)
    row = range(0,diagVec.shape[0])
    col = range(0,diagVec.shape[0])
    Diag = csr_matrix((diagVec, (row, col)), shape=(diagVec.shape[0], diagVec.shape[0]))
    W = W + Diag

    M = csr_matrix(np.diag(np.ones(R.shape[0] + R.shape[1])), dtype='float64') # One of two Mtx in RecWalkModel
    M[M.shape[0]-W.shape[0]:, M.shape[0]-W.shape[0]:] = W

    H = csr_matrix(np.diag(np.zeros(R.shape[0] + R.shape[1])), dtype='float64')
    H[: R.shape[0], H.shape[1] - R.shape[1]:] = R
    H[H.shape[0] - R.shape[1]:, : R.shape[0]] = R.transpose()
    k = H.dot(np.ones(R.shape[0] + R.shape[1]))
    k = 1 / k
    H = csr_matrix((H.transpose().dot(np.diag(k).transpose())).transpose())

    P = alp*H + (1 - alp)*M
    return P

In [16]:
#RecWalk P matrix with W SLIM based

PMtxSlim = rec_walk_model(WSlim, sparseRating).toarray()

In [17]:
#RecWalk P matrix with W cosine_similarity based

WCosine = csr_matrix(cosine_similarity(sparseRating.transpose()))
PMtxCosine = rec_walk_model(WCosine, sparseRating).toarray()

In [18]:
#Make a few more steps to future to capture intersactions between items

k = 3
for i in range(k):
    np.matmul(PMtxSlim, PMtxSlim, out = PMtxSlim)
    np.matmul(PMtxCosine, PMtxCosine, out = PMtxCosine)

In [19]:
#Get essential information on scores from achieved Recommendation Matrix P:

RecSlim = pd.DataFrame(PMtxSlim[: 6040, 6040:], 
                       index=ratingMtx.index,
                       columns=ratingMtx.columns,
                       dtype='float64')

RecCosine = pd.DataFrame(PMtxCosine[: 6040, 6040:],
                         index=ratingMtx.index,
                         columns=ratingMtx.columns,
                         dtype='float64')

In [20]:
def hit_rate(recMtx, ratingMtx, testVec, topN = 10):
    sum = 0.0
    for i in range(1, testVec.size + 1):
        r = ratingMtx.loc[i: i, :].unstack().droplevel("userid") < 1.0
        if (testVec[i - 1] in 
            recMtx.loc[i:i,:].unstack().droplevel("userid")[r].sort_values(ascending=False)[:topN].index.values):
            sum += 1.0
    return sum / testVec.size        

In [21]:
#Count Hit Rate of Recommendation for every user using test set:

HRSlim = hit_rate(RecSlim,ratingMtx, test)
HRCosine = hit_rate(RecCosine,ratingMtx, test)
print("Hit Rate RecWalk[M] K-step with W based on SLIM: {};\n".format(HRSlim))
print("Hit Rate RecWalk[M] K-step with W based on cosine similarity: {};\n".format(HRCosine))

Hit Rate RecWalk[M] K-step with W based on SLIM: 0.24039735099337747;

Hit Rate RecWalk[M] K-step with W based on cosine similarity: 0.1802980132450331;

