In [1]:
# Copy the contents of the attached recsys-repo dataset into the working directory.
# Presumably this is what makes the Evaluation / Recommenders / HyperparameterTuning /
# Utils packages imported by later cells available - TODO confirm.
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

In [2]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot

In [3]:
# Load the full interaction data; the CSV provides the UserID, ItemID and Data columns.
URM_all_dataframe = pd.read_csv('/kaggle/input/urm-true-binary/URM_True_Binary.csv')

# Build the sparse user-item matrix (rows = users, columns = items).
# No explicit shape is given, so it is inferred from the largest UserID / ItemID
# found in the file; this matrix is the reference shape for the data splits.
URM_all = sps.coo_matrix((URM_all_dataframe["Data"].values,
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values)))
URM_all = URM_all.tocsr() # to obtain fast access to rows (users)
URM_all

<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
	with 1554640 stored elements in Compressed Sparse Row format>

In [4]:
# Load the training split; the CSV provides the UserID, ItemID and Data columns.
URM_train_dataframe = pd.read_csv('/kaggle/input/urm-split/Train_df.csv')

# Build the sparse user-item training matrix.
# The shape is pinned to URM_all.shape: if it were inferred from the split alone,
# a split that happens to miss the highest UserID / ItemID would silently produce
# a smaller matrix, misaligned with URM_all and with the validation matrix.
# An index outside URM_all's shape now raises instead of enlarging the matrix.
URM_train = sps.coo_matrix((URM_train_dataframe["Data"].values,
                            (URM_train_dataframe["UserID"].values, URM_train_dataframe["ItemID"].values)),
                           shape=URM_all.shape)
URM_train = URM_train.tocsr() # to obtain fast access to rows (users)
URM_train

<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
	with 1243712 stored elements in Compressed Sparse Row format>

In [5]:
# Load the held-out split used for validation (the file is named Test_df.csv,
# but in this notebook it only feeds the validation evaluator).
URM_val_dataframe = pd.read_csv('/kaggle/input/urm-split/Test_df.csv')

# Build the sparse user-item validation matrix.
# The shape is pinned to URM_all.shape so that it always matches the training
# matrix, even when the split does not contain the highest UserID / ItemID.
# An index outside URM_all's shape now raises instead of enlarging the matrix.
URM_valid = sps.coo_matrix((URM_val_dataframe["Data"].values,
                            (URM_val_dataframe["UserID"].values, URM_val_dataframe["ItemID"].values)),
                           shape=URM_all.shape)
URM_valid = URM_valid.tocsr() # to obtain fast access to rows (users)
URM_valid

<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
	with 310928 stored elements in Compressed Sparse Row format>

In [6]:
from Evaluation.Evaluator import EvaluatorHoldout

# Evaluator built on the validation matrix, reporting metrics at cutoff 10.
# It is the evaluator handed to the hyperparameter search in later cells.
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 323 ( 0.8%) Users that have less than 1 test interactions


In [7]:
import scipy.sparse as sps
from Recommenders.Recommender_utils import check_matrix
from sklearn.linear_model import ElasticNet
from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from Utils.seconds_to_biggest_unit import seconds_to_biggest_unit
import time, sys
from tqdm import tqdm
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [8]:
class SLIMElasticNetRecommender(BaseItemSimilarityMatrixRecommender):
    """
    Train a Sparse Linear Methods (SLIM) item similarity model.

    One ElasticNet regression is fitted per item: the item's column of the URM is the
    target and the URM with that column zeroed out is the feature matrix. The largest
    learned coefficients (at most topK per item) become one column of the item-item
    matrix stored in self.W_sparse.

    NOTE: ElasticNet solver is parallel, a single instance of SLIM_ElasticNet will
          make use of half the cores available
    See:
        Efficient Top-N Recommendation by Linear Regression,
        M. Levy and K. Jack, LSRS workshop at RecSys 2013.
        SLIM: Sparse linear methods for top-n recommender systems,
        X. Ning and G. Karypis, ICDM 2011.
        http://glaros.dtc.umn.edu/gkhome/fetch/papers/SLIM2011icdm.pdf
    """

    RECOMMENDER_NAME = "SLIMElasticNetRecommender"

    def __init__(self, URM_train, verbose = True):
        super(SLIMElasticNetRecommender, self).__init__(URM_train, verbose = verbose)

    @staticmethod
    def _select_topK_positions(coef_values, topK):
        """
        Return the positions (indices into coef_values) of the topK largest values.

        When coef_values holds at most topK entries every position is returned, so no
        coefficient is discarded. The returned positions are not sorted by value.
        """
        n_coef = len(coef_values)
        n_keep = max(0, min(n_coef, topK))

        if n_keep == n_coef:
            # Nothing to prune (also covers the empty case)
            return np.arange(n_coef)

        if n_keep == 0:
            return np.arange(0)

        # Partial partition: cheaper than a full sort, only the membership in the top set matters
        return np.argpartition(-coef_values, n_keep - 1)[:n_keep]

    @ignore_warnings(category=ConvergenceWarning)
    def fit(self, l1_ratio=0.1, alpha = 1.0, positive_only=True, topK = 100,**earlystopping_kwargs):
        """
        Fit one ElasticNet model per item and assemble the item-item matrix self.W_sparse.

        :param l1_ratio: ElasticNet mixing parameter, must be in [0, 1]
        :param alpha: ElasticNet regularization strength
        :param positive_only: if True the coefficients are constrained to be non-negative
        :param topK: maximum number of coefficients kept for each item (the largest ones)
        :param earlystopping_kwargs: accepted for interface compatibility, not used by this method
        :raises AssertionError: if l1_ratio is outside [0, 1]
        """

        assert l1_ratio>= 0 and l1_ratio<=1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)

        self.l1_ratio = l1_ratio
        self.alpha = alpha
        self.positive_only = positive_only
        self.topK = topK


        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        # The arrays are pre-allocated in blocks and enlarged only when they fill up
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero, so the item cannot be used to predict itself
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
            URM_train.data[start_pos: end_pos] = 0.0

            try:
                # fit one ElasticNet model per column
                self.model.fit(URM_train, y)
            finally:
                # always put back the original values of the j-th column,
                # even if the solver raises or the run is interrupted
                URM_train.data[start_pos:end_pos] = current_item_data_backup

            # self.model.sparse_coef_ contains only the non-zero coefficients of the model
            sparse_coef = self.model.sparse_coef_
            nonzero_model_coef_index = sparse_coef.indices
            nonzero_model_coef_value = sparse_coef.data

            # Keep at most topK coefficients, the largest ones. If the model has
            # topK or fewer non-zero coefficients all of them are kept.
            # No sorting is needed: the order of the triples does not affect the final matrix.
            selected = self._select_topK_positions(nonzero_model_coef_value, self.topK)

            n_selected = len(selected)
            new_numCells = numCells + n_selected

            if new_numCells > len(rows):
                # Enlarge by at least one block, more if a single item needs it
                extra_cells = max(dataBlock, new_numCells - len(rows))
                rows = np.concatenate((rows, np.zeros(extra_cells, dtype=np.int32)))
                cols = np.concatenate((cols, np.zeros(extra_cells, dtype=np.int32)))
                values = np.concatenate((values, np.zeros(extra_cells, dtype=np.float32)))

            # Vectorized copy of the (row, col, value) triples of this item
            rows[numCells:new_numCells] = nonzero_model_coef_index[selected]
            cols[numCells:new_numCells] = currentItem
            values[numCells:new_numCells] = nonzero_model_coef_value[selected]

            numCells = new_numCells

            # Print the progress every 300 seconds and once more at the last item
            if time.time() - start_time_printBatch > 300 or currentItem == n_items-1:

                elapsed_time = time.time() - start_time
                new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)
                n_processed = currentItem + 1

                self._print("Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}".format(
                    n_processed,
                    100.0* float(n_processed)/n_items,
                    new_time_value,
                    new_time_unit,
                    float(n_processed)/max(elapsed_time, 1e-9)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                       shape=(n_items, n_items), dtype=np.float32)

In [9]:
# Class (not an instance) tuned by the search below; its RECOMMENDER_NAME is also
# used as the root of the result file names.
recommender_class = SLIMElasticNetRecommender

In [10]:
import os

output_folder_path = "result_experiments/"

# Create the results directory if it is missing. exist_ok=True replaces the
# check-then-create pattern: no race between the check and the creation,
# and the cell can be re-run safely.
os.makedirs(output_folder_path, exist_ok=True)

# Settings passed to the hyperparameter search
n_cases = 50                            # passed to the search as n_cases
n_random_starts = int(n_cases*0.3)      # 30% of n_cases, passed as n_random_starts
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [11]:
from skopt.space import Real, Integer, Categorical

# Search space for the tuning. The keys mirror the keyword arguments of
# SLIMElasticNetRecommender.fit, whose defaults are
# l1_ratio = 0.1, alpha = 1.0, positive_only = True, topK = 100.
# NOTE(review): the best configuration printed near the end of the notebook
# (alpha = 0.001, topK = 4000) sits on the lower bounds of these ranges;
# consider widening them - TODO confirm with a new run.
hyperparameters_range_dictionary = dict(
    l1_ratio = Real(low = 0.004, high = 0.8, prior = 'log-uniform'),
    alpha = Real(low = 0.001, high = 0.4, prior = 'log-uniform'),
    positive_only = Categorical([True, False]),
    topK = Integer(4000,20000),
)

In [12]:
# Early-stopping settings, later passed to fit() as FIT_KEYWORD_ARGS.
# NOTE(review): SLIMElasticNetRecommender.fit only collects them in
# **earlystopping_kwargs and never reads them, so for this model they
# appear to have no effect - TODO confirm whether they are needed at all.
earlystopping_keywargs = dict(
    validation_every_n = 5,
    stop_on_validation = True,
    evaluator_object = evaluator_valid,
    lower_validations_allowed = 5,
    validation_metric = metric_to_optimize,
)

In [13]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Create the Bayesian optimizer object: it receives the recommender class to tune
# and the evaluator used to score each configuration on the validation data.
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_valid)

In [14]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
# Arguments used to build and fit the model for each configuration tried by the
# search: the constructor receives URM_train only, fit() receives the
# early-stopping keyword arguments (plus, presumably, the sampled hyperparameters).
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)

In [15]:
# Same arguments as recommender_input_args, but the constructor receives URM_all
# (the full interaction matrix) instead of URM_train. It is passed to the search
# as recommender_input_args_last_test; presumably it is used for the final fit
# with the best configuration - TODO confirm against SearchBayesianSkopt.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)

In [16]:
# Run the Bayesian search: n_cases configurations in total, the first
# n_random_starts of which are presumably sampled at random, optimizing
# metric_to_optimize at cutoff_to_optimize on the validation evaluator.
# NOTE: this cell is expensive; the captured log shows about 7.5 minutes of
# training for a single configuration.
hyperparameterSearch.search(recommender_input_args = recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'l1_ratio': 0.005949732331469106, 'alpha': 0.04679384803262339, 'positive_only': False, 'topK': 13470}
SLIMElasticNetRecommender: Processed 14796 (60.4%) in 5.00 min. Items per second: 49.31
SLIMElasticNetRecommender: Processed 24507 (100.0%) in 7.53 min. Items per second: 54.22
EvaluatorHoldout: Processed 41306 (100.0%) in 22.80 sec. Users per second: 1812
SearchBayesianSkopt: New best config found. Config 0: {'l1_ratio': 0.005949732331469106, 'alpha': 0.04679384803262339, 'positive_only': False, 'topK': 13470} - results: PRECISION: 0.0592529, PRECISION_RECALL_MIN_DEN: 0.0819598, RECALL: 0.0715336, MAP: 0.0295761, MAP_MIN_DEN: 0.0402181, MRR: 0.1907001, NDCG: 0.0847283, F1: 0.0648167, HIT_RATE: 0.3780807, ARHR_ALL_HITS: 0.2352305, NOVELTY: 0.0039963, AVERAGE_POPULARITY: 0.2785873, DIVERSITY_MEAN_INTER_LIST: 0.8692003, DIVERSITY_HERFINDAHL: 0.9869179, COVERAGE_ITEM: 0.0754478, COVERAGE_I

In [17]:
from Recommenders.DataIO import DataIO

# Explore the results of the search: load the "<RECOMMENDER_NAME>_metadata.zip"
# file from the output folder (presumably written by the search above) and
# list the entries it contains.
data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

search_metadata.keys()

dict_keys(['algorithm_name_recommender', 'time_on_validation_total', 'time_on_train_total', 'result_on_validation_df', 'time_df', 'exception_list', 'hyperparameters_df', 'result_on_test_df', 'result_on_earlystopping_df', 'time_on_validation_avg', 'time_on_last_df', 'cutoff_to_optimize', 'result_on_last', 'time_on_train_avg', 'time_on_test_avg', 'result_on_test_best', 'algorithm_name_search', 'time_on_test_total', 'result_on_validation_best', 'hyperparameters_best', 'metric_to_optimize', 'hyperparameters_best_index'])

In [18]:
# Hyperparameter values of the configurations stored in the search metadata
# (one row per configuration, columns l1_ratio, alpha, positive_only, topK).
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,l1_ratio,alpha,positive_only,topK
0,0.00595,0.046794,False,13470
1,0.124648,0.005976,True,11538
2,0.00482,0.008092,False,8191
3,0.125938,0.16124,False,9443
4,0.453179,0.022135,True,13671
5,0.005345,0.033381,False,5772
6,0.246914,0.007375,False,11188
7,0.004184,0.005401,True,17121
8,0.048202,0.020253,True,13615
9,0.029829,0.01696,True,9844


In [19]:
# Validation metrics stored in the search metadata, indexed by configuration
# number and cutoff; the rows line up with those of hyperparameters_df.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.059253,0.08196,0.071534,0.029576,0.040218,0.1907,0.084728,0.064817,0.378081,0.235231,...,0.992241,0.375147,0.992241,0.007104,7.513599,0.987244,0.013987,0.556832,5.494586,0.242939
1,10,0.060275,0.082674,0.071903,0.030989,0.041793,0.197769,0.087009,0.065577,0.37796,0.245274,...,0.992241,0.375027,0.992241,0.007768,7.836037,0.991171,0.015294,0.580728,4.700563,0.253411
2,10,0.067888,0.093413,0.081336,0.034531,0.0466,0.212843,0.096572,0.074006,0.412168,0.268267,...,0.992241,0.40897,0.992241,0.01331,8.325136,0.991065,0.026207,0.616975,4.795888,0.251393
3,10,0.008461,0.011734,0.010296,0.006093,0.008327,0.055334,0.017023,0.009289,0.076502,0.058035,...,0.992241,0.075909,0.992241,0.000676,4.357296,0.940497,0.001331,0.322919,1.489568,0.362991
4,10,0.026948,0.035743,0.030391,0.013976,0.018539,0.101658,0.040065,0.028566,0.190699,0.118478,...,0.992241,0.189219,0.992241,0.001273,5.321722,0.969803,0.002507,0.394392,4.07151,0.304206
5,10,0.061504,0.084872,0.073991,0.030845,0.041836,0.196173,0.087735,0.067172,0.387111,0.243602,...,0.992241,0.384107,0.992241,0.008579,7.741713,0.988678,0.016892,0.573737,5.262983,0.244725
6,10,0.052765,0.072179,0.062672,0.027124,0.036666,0.179027,0.076947,0.057293,0.343219,0.218614,...,0.992241,0.340556,0.992241,0.00469,7.176693,0.988187,0.009233,0.531864,4.992976,0.256299
7,10,0.068922,0.094684,0.08242,0.035237,0.047455,0.216057,0.098091,0.075069,0.415291,0.273073,...,0.992241,0.412069,0.992241,0.014669,8.469961,0.991944,0.028882,0.627708,4.605218,0.253566
8,10,0.056641,0.07783,0.067753,0.028793,0.0389,0.186581,0.081686,0.0617,0.36278,0.229711,...,0.992241,0.359965,0.992241,0.005788,7.421604,0.988652,0.011396,0.550014,5.148034,0.249848
9,10,0.060664,0.083589,0.072827,0.030852,0.041749,0.196866,0.087218,0.066191,0.382099,0.244135,...,0.992241,0.379135,0.992241,0.007771,7.774909,0.990097,0.015301,0.576197,4.96706,0.249195


In [20]:
# Configuration stored under "hyperparameters_best" in the search metadata,
# presumably the best one for metric_to_optimize on the validation data.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'l1_ratio': 0.05652649721548329,
 'alpha': 0.001,
 'positive_only': True,
 'topK': 4000}

In [21]:
#recommender = SLIMElasticNetRecommender(URM_all)
#recommender.fit(l1_ratio = 0.001, alpha = 0.01, positive_only = True, topK = 200)

In [22]:
#recommender.save_model(output_folder_path, file_name = recommender.RECOMMENDER_NAME + "_my_own_save.zip" )


In [23]:
#test_users = pd.read_csv('../input/recsyschallenge/data_target_users_test.csv')

In [24]:
#user_id = test_users['user_id']
#recommendations = []
#for user in user_id:
 #   recommendations.append(recommender.recommend(user,cutoff = 10))
#for index in range(len(recommendations)):
 #   recommendations[index]=np.array(recommendations[index])
    
#test_users['item_list']= recommendations
#test_users['item_list'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in test_users['item_list']])
#test_users.to_csv('submission.csv', index=False)