In [1]:
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./
#%%
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

In [2]:
URM_all_dataframe = pd.read_csv('/kaggle/input/urm-true-binary/URM_True_Binary.csv')
URM_all_dataframe
URM_all = sps.coo_matrix((URM_all_dataframe["Data"].values, 
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values)))
URM_all = URM_all.tocsr() # to obtain fast access to rows (users)
URM_all

<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
	with 1554640 stored elements in Compressed Sparse Row format>

In [3]:
URM_train_dataframe = pd.read_csv('/kaggle/input/urm-split/Train_df.csv')
URM_train_dataframe
URM_train = sps.coo_matrix((URM_train_dataframe["Data"].values, 
                          (URM_train_dataframe["UserID"].values, URM_train_dataframe["ItemID"].values)))
URM_train = URM_train.tocsr() # to obtain fast access to rows (users)
URM_train

<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
	with 1243712 stored elements in Compressed Sparse Row format>

In [4]:
URM_val_dataframe = pd.read_csv('/kaggle/input/urm-split/Test_df.csv')
URM_val_dataframe
URM_valid = sps.coo_matrix((URM_val_dataframe["Data"].values, 
                          (URM_val_dataframe["UserID"].values, URM_val_dataframe["ItemID"].values)))
URM_valid = URM_all.tocsr() # to obtain fast access to rows (users)
URM_valid

<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
	with 1554640 stored elements in Compressed Sparse Row format>

In [5]:
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(URM_all_dataframe)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("Average interactions per user {:.2f}".format(n_interactions/n_users))
print ("Average interactions per item {:.2f}\n".format(n_interactions/n_items))

print ("Sparsity {:.2f} %".format((1-float(n_interactions)/(n_items*n_users))*100))

Number of items	 24507, Number of users	 41629
Max ID items	 24506, Max Id users	 41628

Average interactions per user 37.35
Average interactions per item 63.44

Sparsity 99.85 %


In [6]:
from Evaluation.Evaluator import EvaluatorHoldout

#create an evaluator object to evaluate validation set
#we will use it for hyperparameter tuning
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])
#%% md
# Split users in groups and train multiple models

In [7]:
import numpy as np 
import scipy.sparse as sps

# count how many interactions each user has
profile_length = np.ediff1d(sps.csr_matrix(URM_train).indptr)
profile_length, profile_length.shape

(array([49, 18, 89, ..., 18, 39, 22], dtype=int32), (41629,))

In [8]:
# select 5% of users with least interactions
block_size = int(len(profile_length)*0.05)
block_size

2081

In [9]:
sorted_users = np.argsort(profile_length)
sorted_users

array([36889, 36096, 25061, ..., 19407,  8693, 12454])

In [10]:
# do the same for all users to obtain a division in 20 different groups of users
for group_id in range(0, 20):
    start_pos = group_id * block_size
    end_pos = min((group_id+1) * block_size, len(profile_length))
    
    users_in_group = sorted_users[start_pos:end_pos]
    
    users_in_group_p_len = profile_length[users_in_group]
    
    print("Group {}, #users in group {}, average p.len {:.2f}, median {}, min {}, max {}".format(
        group_id, 
        users_in_group.shape[0],
        users_in_group_p_len.mean(),
        np.median(users_in_group_p_len),
        users_in_group_p_len.min(),
        users_in_group_p_len.max()))

Group 0, #users in group 2081, average p.len 9.79, median 10.0, min 3, max 12
Group 1, #users in group 2081, average p.len 12.54, median 13.0, min 12, max 13
Group 2, #users in group 2081, average p.len 14.08, median 14.0, min 13, max 15
Group 3, #users in group 2081, average p.len 15.37, median 15.0, min 15, max 16
Group 4, #users in group 2081, average p.len 16.44, median 16.0, min 16, max 17
Group 5, #users in group 2081, average p.len 17.53, median 18.0, min 17, max 18
Group 6, #users in group 2081, average p.len 18.62, median 19.0, min 18, max 19
Group 7, #users in group 2081, average p.len 19.75, median 20.0, min 19, max 20
Group 8, #users in group 2081, average p.len 20.98, median 21.0, min 20, max 22
Group 9, #users in group 2081, average p.len 22.30, median 22.0, min 22, max 23
Group 10, #users in group 2081, average p.len 23.63, median 24.0, min 23, max 24
Group 11, #users in group 2081, average p.len 25.40, median 25.0, min 24, max 26
Group 12, #users in group 2081, average 

In [11]:
import implicit
import numpy as np

from Recommenders.BaseMatrixFactorizationRecommender import BaseMatrixFactorizationRecommender


class IALSRecommender_implicit(BaseMatrixFactorizationRecommender):
    """
    ALS implemented with implicit following guideline of
    https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe
    IDEA:
    Recomputing x_{u} and y_i can be done with Stochastic Gradient Descent, but this is a non-convex optimization problem.
    We can convert it into a set of quadratic problems, by keeping either x_u or y_i fixed while optimizing the other.
    In that case, we can iteratively solve x and y by alternating between them until the algorithm converges.
    This is Alternating Least Squares.
    """

    RECOMMENDER_NAME = "IALSRecommender_implicit"

    def __init__(self, URM_train, verbose=True):
        super(IALSRecommender_implicit, self).__init__(URM_train, verbose=verbose)

    def fit(self, n_factors=50, regularization=0.001847510119137634, iterations=30, num_threads=2):
        self.n_factors = n_factors
        self.regularization = regularization
        self.iterations = iterations

        sparse_item_user = self.URM_train.T

        # Initialize the als model and fit it using the sparse item-user matrix
        model = implicit.als.AlternatingLeastSquares(factors=self.n_factors, regularization=self.regularization,
                                                     iterations=self.iterations, num_threads=num_threads)

        alpha_val = 2
        # Calculate the confidence by multiplying it by our alpha value.

        data_conf = (sparse_item_user * alpha_val).astype('double')

        # Fit the model
        model.fit(data_conf)

        # Get the user and item vectors from our trained model
        self.USER_factors = model.user_factors
        self.ITEM_factors = model.item_factors

    def _compute_item_score(self, user_id_array, items_to_compute=None):
        """
        USER_factors is n_users x n_factors
        ITEM_factors is n_items x n_factors

        The prediction for cold users will always be -inf for ALL items

        :param user_id_array:
        :param items_to_compute:
        :return:
        """

        assert self.USER_factors.shape[1] == self.ITEM_factors.shape[1], \
            "{}: User and Item factors have inconsistent shape".format(self.RECOMMENDER_NAME)

        assert self.USER_factors.shape[0] > np.max(user_id_array), \
            "{}: Cold users not allowed. Users in trained model are {}, requested prediction for users up to {}".format(
                self.RECOMMENDER_NAME, self.USER_factors.shape[0], np.max(user_id_array))

        if items_to_compute is not None:
            item_scores = - np.ones((len(user_id_array), self.ITEM_factors.shape[0]), dtype=np.float32) * np.inf
            item_scores[:, items_to_compute] = np.dot(self.USER_factors[user_id_array],
                                                      np.transpose(self.ITEM_factors[items_to_compute, :]))

        else:
            item_factors_T = np.transpose(self.ITEM_factors)
            user_factors = self.USER_factors[user_id_array]
            item_scores = np.dot(user_factors, item_factors_T)

        # No need to select only the specific negative items or warm users because the -inf score will not change
        if self.use_bias:
            item_scores += self.ITEM_bias + self.GLOBAL_bias
            item_scores = np.transpose(np.transpose(item_scores) + self.USER_bias[user_id_array])

        return item_scores

In [12]:
import numpy as np
import scipy.sparse as sps
from Recommenders.Recommender_utils import check_matrix
from sklearn.linear_model import ElasticNet
from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit
import time, sys
from tqdm import tqdm
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# os.environ["PYTHONWARNINGS"] = ('ignore::exceptions.ConvergenceWarning:sklearn.linear_model')
# os.environ["PYTHONWARNINGS"] = ('ignore:Objective did not converge:ConvergenceWarning:')

class SLIMElasticNetRecommender(BaseItemSimilarityMatrixRecommender):
    """
    Train a Sparse Linear Methods (SLIM) item similarity model.
    NOTE: ElasticNet solver is parallel, a single intance of SLIM_ElasticNet will
          make use of half the cores available
    See:
        Efficient Top-N Recommendation by Linear Regression,
        M. Levy and K. Jack, LSRS workshop at RecSys 2013.
        SLIM: Sparse linear methods for top-n recommender systems,
        X. Ning and G. Karypis, ICDM 2011.
        http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf
    """

    RECOMMENDER_NAME = "SLIMElasticNetRecommender"

    def __init__(self, URM_train, verbose = True):
        super(SLIMElasticNetRecommender, self).__init__(URM_train, verbose = verbose)

    @ignore_warnings(category=ConvergenceWarning)
    def fit(self, l1_ratio=0.1, alpha = 1.0, positive_only=True, topK = 100,**earlystopping_kwargs):

        assert l1_ratio>= 0 and l1_ratio<=1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK


        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
            URM_train.data[start_pos: end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            # self.model.coef_ contains the coefficient of the ElasticNet model
            # let's keep only the non-zero values

            # Select topK values
            # Sorting is done in three steps. Faster then plain np.argsort for higher number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value)-1, self.topK)

            relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]

            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))


                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)


            if time.time() - start_time_printBatch > 300 or currentItem == n_items-1:
                self._print("Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}".format(
                    currentItem+1,
                    100.0* float(currentItem+1)/n_items,
                    new_time_value,
                    new_time_unit,
                    float(currentItem)/elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                       shape=(n_items, n_items), dtype=np.float32)

In [13]:
import os

output_folder_path = "result_experiments/"

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)

In [14]:
import pyximport
pyximport.install()

(None, <pyximport.pyximport.PyxImporter at 0x7fed64396c10>)

In [15]:
#prepare the environment to run Cython code
!python run_compile_all_cython.py

run_compile_all_cython: Found 10 Cython files in 4 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/opt/conda/bin/python'
Compiling [1/10]: MatrixFactorizationImpressions_Cython_Epoch.pyx... 
In file included from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarraytypes.h:1969[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/arrayobject.h:4[m[K,
                 from [01m[KMatrixFactorizationImpressions_Cython_Epoch.c:746[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorizationImpressions_Cython_Epoch.c:[m[K In function ‘[01m[K__pyx_f_43MatrixFactorizationImpressions_Cython_Epoch_32MatrixFactorization_Cython_Epoch_sampleBPR_Cython[m[K’:
12758 |       [01;35m[K__pyx_t_4 = (__pyx_v_start_pos_impression_ite

In [16]:
#here we will save MAPs of different user groups
MAP_recommender_per_group = {}

#here we will save pairs label : recommnder_object
recommender_object_dict = {}

In [17]:
from Recommenders.MatrixFactorization.IALSRecommender import IALSRecommender

mf_ials = IALSRecommender(URM_train)
mf_ials.fit(num_factors = 31, confidence_scaling= 'log',alpha = 0.0024941846820976015, epsilon = 3.449297756742473, reg = 5.61162089901928e-05, epochs = 40)
recommender_object_dict["MF_IALS"] = mf_ials

IALSRecommender: Epoch 1 of 40. Elapsed time 13.87 sec
IALSRecommender: Epoch 2 of 40. Elapsed time 26.04 sec
IALSRecommender: Epoch 3 of 40. Elapsed time 38.25 sec
IALSRecommender: Epoch 4 of 40. Elapsed time 51.96 sec
IALSRecommender: Epoch 5 of 40. Elapsed time 1.07 min
IALSRecommender: Epoch 6 of 40. Elapsed time 1.30 min
IALSRecommender: Epoch 7 of 40. Elapsed time 1.50 min
IALSRecommender: Epoch 8 of 40. Elapsed time 1.70 min
IALSRecommender: Epoch 9 of 40. Elapsed time 1.93 min
IALSRecommender: Epoch 10 of 40. Elapsed time 2.13 min
IALSRecommender: Epoch 11 of 40. Elapsed time 2.36 min
IALSRecommender: Epoch 12 of 40. Elapsed time 2.57 min
IALSRecommender: Epoch 13 of 40. Elapsed time 2.78 min
IALSRecommender: Epoch 14 of 40. Elapsed time 3.01 min
IALSRecommender: Epoch 15 of 40. Elapsed time 3.22 min
IALSRecommender: Epoch 16 of 40. Elapsed time 3.45 min
IALSRecommender: Epoch 17 of 40. Elapsed time 3.65 min
IALSRecommender: Epoch 18 of 40. Elapsed time 3.86 min
IALSRecommender

In [18]:
impl_IALS =  IALSRecommender_implicit(URM_train)
impl_IALS.fit(n_factors= 382, regularization= 36.55096224415435, iterations=70, num_threads=2)
recommender_object_dict["IALS_implicit"] = mf_ials

  0%|          | 0/70 [00:00<?, ?it/s]

In [19]:
from Recommenders.MatrixFactorization.PureSVDRecommender import PureSVDRecommender

pure_svd = PureSVDRecommender(URM_train)
pure_svd.fit(num_factors=40)
recommender_object_dict["PURE_SVD"] = pure_svd

PureSVDRecommender: Computing SVD decomposition...
PureSVDRecommender: Computing SVD decomposition... done in 2.14 sec


In [20]:
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender

itemknn_cf = ItemKNNCFRecommender(URM_train)
itemknn_cf.fit(topK = 157, shrink = 463, similarity = 'cosine', normalize = True, feature_weighting = 'TF-IDF')
recommender_object_dict["ItemKNNCF"] = itemknn_cf

Similarity column 24507 (100.0%), 4149.15 column/sec. Elapsed time 5.91 sec


In [21]:
import numpy as np
import scipy.sparse as sps
from Recommenders.Recommender_utils import check_matrix
from sklearn.linear_model import ElasticNet
from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit
import time, sys
from tqdm import tqdm
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# os.environ["PYTHONWARNINGS"] = ('ignore::exceptions.ConvergenceWarning:sklearn.linear_model')
# os.environ["PYTHONWARNINGS"] = ('ignore:Objective did not converge:ConvergenceWarning:')

class SLIMElasticNetRecommender(BaseItemSimilarityMatrixRecommender):
    """
    Train a Sparse Linear Methods (SLIM) item similarity model.
    NOTE: ElasticNet solver is parallel, a single intance of SLIM_ElasticNet will
          make use of half the cores available
    See:
        Efficient Top-N Recommendation by Linear Regression,
        M. Levy and K. Jack, LSRS workshop at RecSys 2013.
        SLIM: Sparse linear methods for top-n recommender systems,
        X. Ning and G. Karypis, ICDM 2011.
        http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf
    """

    RECOMMENDER_NAME = "SLIMElasticNetRecommender"

    def __init__(self, URM_train, verbose = True):
        super(SLIMElasticNetRecommender, self).__init__(URM_train, verbose = verbose)

    @ignore_warnings(category=ConvergenceWarning)
    def fit(self, l1_ratio=0.1, alpha = 1.0, positive_only=True, topK = 100,**earlystopping_kwargs):

        assert l1_ratio>= 0 and l1_ratio<=1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK


        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
            URM_train.data[start_pos: end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            # self.model.coef_ contains the coefficient of the ElasticNet model
            # let's keep only the non-zero values

            # Select topK values
            # Sorting is done in three steps. Faster then plain np.argsort for higher number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value)-1, self.topK)

            relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]

            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))


                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)


            if time.time() - start_time_printBatch > 300 or currentItem == n_items-1:
                self._print("Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}".format(
                    currentItem+1,
                    100.0* float(currentItem+1)/n_items,
                    new_time_value,
                    new_time_unit,
                    float(currentItem)/elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                       shape=(n_items, n_items), dtype=np.float32)

In [22]:
slim_en = SLIMElasticNetRecommender(URM_train)
slim_en.fit(l1_ratio = 0.02, alpha = 0.0018503383172588782,  positive_only = True, topK = 600)
recommender_object_dict["SLIM_EN"] = slim_en

SLIMElasticNetRecommender: Processed 4230 (17.3%) in 5.00 min. Items per second: 14.09
SLIMElasticNetRecommender: Processed 8972 (36.6%) in 10.00 min. Items per second: 14.95
SLIMElasticNetRecommender: Processed 13727 (56.0%) in 15.00 min. Items per second: 15.25
SLIMElasticNetRecommender: Processed 18392 (75.0%) in 20.00 min. Items per second: 15.32
SLIMElasticNetRecommender: Processed 22750 (92.8%) in 25.00 min. Items per second: 15.16
SLIMElasticNetRecommender: Processed 24507 (100.0%) in 26.94 min. Items per second: 15.16


In [23]:
from Recommenders.MatrixFactorization.Cython.MatrixFactorization_Cython_Epoch import MatrixFactorization_Cython_Epoch
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython

slim_bpr = SLIM_BPR_Cython(URM_train)
slim_bpr.fit(epochs=650, sgd_mode = "sgd", topK = 483, lambda_i = 0.0006712905081189398, lambda_j = 0.06584150350451998, learning_rate = 0.0036482363905043207)
recommender_object_dict["SLIM_BPR"] = slim_bpr

SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 30585.00 MB (95.25%) of 32110.00 MB, required is 2402.37 MB. Using dense matrix.
Processed 41629 (100.0%) in 1.11 sec. BPR loss is 1.94E-03. Sample per second: 37650
SLIM_BPR_Recommender: Epoch 1 of 650. Elapsed time 0.22 sec
Processed 41629 (100.0%) in 0.33 sec. BPR loss is 1.28E-02. Sample per second: 124711
SLIM_BPR_Recommender: Epoch 2 of 650. Elapsed time 0.45 sec
Processed 41629 (100.0%) in 0.55 sec. BPR loss is 3.07E-02. Sample per second: 75293
SLIM_BPR_Recommender: Epoch 3 of 650. Elapsed time 0.67 sec
Processed 41629 (100.0%) in 0.77 sec. BPR loss is 5.39E-02. Sample per second: 53995
SLIM_BPR_Recommender: Epoch 4 of 650. Elapsed time 0.89 sec
Processed 41629 (100.0%) in 0.98 sec. BPR loss is 7.79E-02. Sample per second: 42416
SLIM_BPR_Recommender: Epoch 5 of 650. Elapsed time 1.10 sec
Processed 41629 (100.0%) in 1.20 sec. BPR loss is 1.10E-01. Sample per second: 34663
SLIM_BPR_Recommender: Epoc

In [24]:
from Recommenders.NonPersonalizedRecommender import TopPop

top_pop = TopPop(URM_train)
top_pop.fit()
recommender_object_dict["TOP_POP"] = top_pop

In [25]:
from Recommenders.EASE_R.EASE_R_Recommender import EASE_R_Recommender

EASE_R = EASE_R_Recommender(URM_train)
#%%
EASE_R.fit(topK = 416, l2_norm = 115.67139771839786, normalize_matrix = False)
recommender_object_dict["Ease_R"] = EASE_R

EASE_R_Recommender: Fitting model... 
EASE_R_Recommender: Fitting model... done in 10.50 min


In [26]:
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from Recommenders.GraphBased.P3alphaRecommender import P3alphaRecommender

RP3beta_all = RP3betaRecommender(URM_train)
RP3beta_all.fit(alpha= 1.0, beta= 0.28666076265452467, topK= 57, implicit= True)
recommender_object_dict["RP3"] = RP3beta_all

RP3betaRecommender: Similarity column 24507 (100.0%), 2305.96 column/sec. Elapsed time 10.63 sec


In [27]:
from numpy import linalg as LA
from Recommenders.BaseRecommender import BaseRecommender

class DifferentLossScoresHybridRecommender(BaseRecommender):
    """ ScoresHybridRecommender
    Hybrid of four predictions scores
    R = R1*alpha + R2*beta + R3*theta + R3*(1-alpha-beta-theta)
    
    Class from Dacrema exercise modified by Antonio Ercolani
    The original took as input 2 recommender

    """

    RECOMMENDER_NAME = "DifferentLossScoresHybridRecommender"


    def __init__(self, URM_train, recommender_1, recommender_2, recommender_3):
        super(DifferentLossScoresHybridRecommender, self).__init__(URM_train)

        self.URM_train = sps.csr_matrix(URM_train)
        self.recommender_1 = recommender_1
        self.recommender_2 = recommender_2
        self.recommender_3 = recommender_3
        
        
        
    def fit(self, norm, alpha = 0.5, beta = 0):

        self.alpha = alpha
        self.beta = beta
        self.norm = norm


    def _compute_item_score(self, user_id_array, items_to_compute):
        
        item_weights_1 = self.recommender_1._compute_item_score(user_id_array)
        item_weights_2 = self.recommender_2._compute_item_score(user_id_array)
        item_weights_3 = self.recommender_3._compute_item_score(user_id_array)

        norm_item_weights_1 = LA.norm(item_weights_1, self.norm)
        norm_item_weights_2 = LA.norm(item_weights_2, self.norm)
        norm_item_weights_3 = LA.norm(item_weights_3, self.norm)
        
        
        if norm_item_weights_1 == 0:
            raise ValueError("Norm {} of item weights for recommender 1 is zero. Avoiding division by zero".format(self.norm))
        
        if norm_item_weights_2 == 0:
            raise ValueError("Norm {} of item weights for recommender 2 is zero. Avoiding division by zero".format(self.norm))
            
        if norm_item_weights_3 == 0:
            raise ValueError("Norm {} of item weights for recommender 3 is zero. Avoiding division by zero".format(self.norm))
          
        item_weights = item_weights_1 / norm_item_weights_1 * self.alpha + item_weights_2 / norm_item_weights_2 * self.beta + item_weights_3 / norm_item_weights_3 * (1-(self.alpha+ self.beta))

        return item_weights

In [28]:
hybridDiffLoss = DifferentLossScoresHybridRecommender(URM_all, slim_en, RP3beta_all, EASE_R)
hybridDiffLoss.fit(1, 0.6, 0.35)
recommender_object_dict["HybridDiffLoss"] = hybridDiffLoss

In [29]:
slim_matrix = slim_en.W_sparse
rp3_matrix = RP3beta_all.W_sparse

alpha = 0.6
new_similarity = slim_matrix * alpha + rp3_matrix * (1-alpha)
Slim_rp3 = ItemKNNCustomSimilarityRecommender(URM_train)
Slim_rp3.fit(new_similarity)

recommender_object_dict["Slim_rp3"] = Slim_rp3

NameError: name 'ItemKNNCustomSimilarityRecommender' is not defined

In [None]:
# Plot results of models for different users groups
#%%
# here we perform validation over different user groups for each model
# then we plot a graph to compare them
cutoff = 10

for group_id in range(0, 20):
    
    start_pos = group_id*block_size
    end_pos = min((group_id+1)*block_size, len(profile_length))
    
    users_in_group = sorted_users[start_pos:end_pos]
    
    users_in_group_p_len = profile_length[users_in_group]
    
    print("Group {}, #users in group {}, average p.len {:.2f}, median {}, min {}, max {}".format(
        group_id, 
        users_in_group.shape[0],
        users_in_group_p_len.mean(),
        np.median(users_in_group_p_len),
        users_in_group_p_len.min(),
        users_in_group_p_len.max()))
    
    
    users_not_in_group_flag = np.isin(sorted_users, users_in_group, invert=True)
    users_not_in_group = sorted_users[users_not_in_group_flag]
    
    evaluator_validation = EvaluatorHoldout(URM_valid, cutoff_list=[cutoff], ignore_users=users_not_in_group)
    
    for label, recommender in recommender_object_dict.items():
        result_df, _ = evaluator_validation.evaluateRecommender(recommender)
        if label in MAP_recommender_per_group:
            MAP_recommender_per_group[label].append(result_df.loc[cutoff]["MAP"])
        else:
            MAP_recommender_per_group[label] = [result_df.loc[cutoff]["MAP"]]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline  

_ = plt.figure(figsize=(16, 9))
for label, recommender in recommender_object_dict.items():
    results = MAP_recommender_per_group[label]
    plt.scatter(x=np.arange(0,len(results)), y=results, label=label)
plt.ylabel('MAP')
plt.xlabel('User Group')
plt.legend()
plt.show()

In [None]:
#%% md
# Create final recommandations
#%%
#test_users = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_target_users_test.csv')
#test_users
#%%
#user_id = test_users['user_id']
#recommendations = []
#for user in user_id:
    #recommendations.append(recommender.recommend(user,cutoff = 10))
#%%
#for index in range(len(recommendations)):
    #recommendations[index]=np.array(recommendations[index])
    
#test_users['item_list']= recommendations
#test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])
#test_users.to_csv('submission.csv', index=False)