In [54]:
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
import time
import RecSys2022.Utils.DataReader as dr


from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Utils.DataReader import load_urm#, load_icm, load_target
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import UserSlimBPRRecommender
from Recommenders.IR_feature_weighting import okapi_BM_25
#from Recommenders.FactorizationMachines.LightFMRecommender import LightFMItemHybridRecommender
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender
from Recommenders.GraphBased.RP3betaRecommender import UserRp3Recommender, RP3betaRecommender
from Recommenders.EASE_R.EASE_R_Recommender import EASE_R_Recommender
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
from Recommenders.MatrixFactorization.Cython.MatrixFactorizationImpressions_Cython import MatrixFactorization_FunkSVD_Cython

# User-rating matrices (URM) read from CSV through the project DataReader;
# the three files are different encodings of the ratings (see file names).
URM_all = dr.load_urm("../data/URMtries/RatingsBinary.csv")
URM_all_tanh = dr.load_urm("../data/URMtries/RatingsWithTanHCount.csv")
URM_all_tanh_type_len = dr.load_urm("../data/URMtries/RatingsWithTanHKnownTypeAndLength.csv")
# Item-content matrix (ICM) and a previously saved score matrix.
ICM_tr = dr.load_icm("../data/data_ICM_type_truncated.csv")
URM_all_scores = sps.load_npz("URM_all_scores.npz")

# Splits saved earlier as .npz files in the working directory
# (presumably one train split per URM encoding - TODO confirm how they were produced).
URM_train = sps.load_npz("trainDefault.npz")
URM_train_TanH = sps.load_npz("trainTanH.npz")
URM_train_real = sps.load_npz("trainReal.npz")
URM_valid = sps.load_npz("validDefault.npz")
# "Stacked" matrices: the transposed ICM is appended below the URM rows,
# so each item feature becomes an extra row of the matrix.
stacked_URM = sps.vstack([URM_all, ICM_tr.T])
stacked_URM_TanH = sps.vstack([URM_all_tanh, ICM_tr.T])
stacked_URM_train = sps.vstack([URM_train, ICM_tr.T])
# NOTE(review): stray IP address left by the author - remove if it is not needed.
#20.42.95.202
# Validation evaluator used by the hyperparameter search; metrics are computed at cutoff 10.
evaluator_validation = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 963 ( 2.3%) Users that have less than 1 test interactions
RP3betaRecommender: Similarity column 24507 (100.0%), 2437.15 column/sec. Elapsed time 10.06 sec
EvaluatorHoldout: Processed 40666 (100.0%) in 24.21 sec. Users per second: 1680


0.023799576339658426

# Okapi BM25 weighting of the URM (optional)

In [2]:
# BM25-weighted version of URM_all: work on a float32 copy so URM_all itself
# is left untouched, then convert the weighted result to CSR.
URM_all_bm25 = okapi_BM_25(URM_all.copy().astype(np.float32)).tocsr()

# Different-loss hybrid recommender classes

In [40]:
from Recommenders.BaseRecommender import BaseRecommender

class DifferentLossScoresHybridRecommender2(BaseRecommender):
    """Weighted-sum hybrid of the prediction scores of two recommenders.

        R = R1*alpha + R2*beta

    R1 and R2 are the item scores returned by ``recommender_1`` and
    ``recommender_2``. The wrapped recommenders are used as they are passed in:
    this class never trains them, ``fit`` only stores the mixing weights.

    Class from Dacrema exercise modified by Antonio Ercolani.
    """

    # NOTE(review): the 2-, 3- and 4-model hybrids in this notebook all share
    # this name, so result files named after it overwrite each other.
    RECOMMENDER_NAME = "DifferentLossScoresHybridRecommender"

    def __init__(self, URM_train, recommender_1, recommender_2):
        """Store the training matrix (as CSR) and the two recommenders to blend."""
        super(DifferentLossScoresHybridRecommender2, self).__init__(URM_train)

        self.URM_train = sps.csr_matrix(URM_train)
        self.recommender_1 = recommender_1
        self.recommender_2 = recommender_2

    def fit(self, alpha=0.5, beta=0.5):
        """Set the weights applied to recommender_1 (alpha) and recommender_2 (beta)."""
        self.alpha = alpha
        self.beta = beta

    def _compute_item_score(self, user_id_array, items_to_compute = None):
        """Return alpha * scores_1 + beta * scores_2 for the given users.

        ``items_to_compute`` is forwarded to both wrapped recommenders so that a
        candidate-item restriction requested by the caller is honoured instead
        of being silently dropped.
        """
        # NOTE(review): if the wrapped recommenders mark excluded items with
        # -inf, a weight of exactly 0 would turn those entries into NaN - confirm.
        item_weights_1 = self.recommender_1._compute_item_score(user_id_array, items_to_compute = items_to_compute)
        item_weights_2 = self.recommender_2._compute_item_score(user_id_array, items_to_compute = items_to_compute)

        item_weights = item_weights_1 * self.alpha + item_weights_2 * self.beta

        return item_weights

from Recommenders.BaseRecommender import BaseRecommender

class DifferentLossScoresHybridRecommender4(BaseRecommender):
    """Weighted-sum hybrid of the prediction scores of four recommenders.

        R = R1*alpha + R2*beta + R3*gamma + R4*delta

    R1..R4 are the item scores returned by ``recommender_1`` .. ``recommender_4``.
    The wrapped recommenders are used as they are passed in: this class never
    trains them, ``fit`` only stores the mixing weights.

    Class from Dacrema exercise modified by Antonio Ercolani.
    The original took as input 2 recommender
    """

    # NOTE(review): the 2-, 3- and 4-model hybrids in this notebook all share
    # this name, so result files named after it overwrite each other.
    RECOMMENDER_NAME = "DifferentLossScoresHybridRecommender"

    def __init__(self, URM_train, recommender_1, recommender_2, recommender_3, recommender_4):
        """Store the training matrix (as CSR) and the four recommenders to blend."""
        super(DifferentLossScoresHybridRecommender4, self).__init__(URM_train)

        self.URM_train = sps.csr_matrix(URM_train)
        self.recommender_1 = recommender_1
        self.recommender_2 = recommender_2
        self.recommender_3 = recommender_3
        self.recommender_4 = recommender_4

    def fit(self, alpha=0.5, beta=0.5, gamma=0.5, delta = 0.5):
        """Set the weights of recommender_1..4 (alpha, beta, gamma, delta in that order)."""
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.delta = delta

    def _compute_item_score(self, user_id_array, items_to_compute = None):
        """Return the weighted sum of the four recommenders' scores for the given users.

        ``items_to_compute`` is forwarded to every wrapped recommender so that a
        candidate-item restriction requested by the caller is honoured instead
        of being silently dropped.
        """
        # NOTE(review): if the wrapped recommenders mark excluded items with
        # -inf, a weight of exactly 0 would turn those entries into NaN - confirm.
        item_weights_1 = self.recommender_1._compute_item_score(user_id_array, items_to_compute = items_to_compute)
        item_weights_2 = self.recommender_2._compute_item_score(user_id_array, items_to_compute = items_to_compute)
        item_weights_3 = self.recommender_3._compute_item_score(user_id_array, items_to_compute = items_to_compute)
        item_weights_4 = self.recommender_4._compute_item_score(user_id_array, items_to_compute = items_to_compute)

        item_weights = item_weights_1 * self.alpha + item_weights_2 * self.beta + item_weights_3 * self.gamma + item_weights_4 * self.delta

        return item_weights

class DifferentLossScoresHybridRecommender3(BaseRecommender):
    """Weighted-sum hybrid of the prediction scores of three recommenders.

        R = R1*alpha + R2*beta + R3*gamma

    R1..R3 are the item scores returned by ``recommender_1`` .. ``recommender_3``.
    The wrapped recommenders are used as they are passed in: this class never
    trains them, ``fit`` only stores the mixing weights.

    Class from Dacrema exercise modified by Antonio Ercolani.
    The original took as input 2 recommender
    """

    # NOTE(review): the 2-, 3- and 4-model hybrids in this notebook all share
    # this name, so result files named after it overwrite each other.
    RECOMMENDER_NAME = "DifferentLossScoresHybridRecommender"

    def __init__(self, URM_train, recommender_1, recommender_2, recommender_3):
        """Store the training matrix (as CSR) and the three recommenders to blend."""
        super(DifferentLossScoresHybridRecommender3, self).__init__(URM_train)

        self.URM_train = sps.csr_matrix(URM_train)
        self.recommender_1 = recommender_1
        self.recommender_2 = recommender_2
        self.recommender_3 = recommender_3

    def fit(self, alpha=0.5, beta=0.5, gamma=0.5):
        """Set the weights of recommender_1..3 (alpha, beta, gamma in that order)."""
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma

    def _compute_item_score(self, user_id_array, items_to_compute = None):
        """Return the weighted sum of the three recommenders' scores for the given users.

        ``items_to_compute`` is forwarded to every wrapped recommender so that a
        candidate-item restriction requested by the caller is honoured instead
        of being silently dropped.
        """
        # NOTE(review): if the wrapped recommenders mark excluded items with
        # -inf, a weight of exactly 0 would turn those entries into NaN - confirm.
        item_weights_1 = self.recommender_1._compute_item_score(user_id_array, items_to_compute = items_to_compute)
        item_weights_2 = self.recommender_2._compute_item_score(user_id_array, items_to_compute = items_to_compute)
        item_weights_3 = self.recommender_3._compute_item_score(user_id_array, items_to_compute = items_to_compute)

        item_weights = item_weights_1 * self.alpha + item_weights_2 * self.beta + item_weights_3 * self.gamma

        return item_weights

    def save_model(self, folder_path, file_name = None):
        """Deliberate no-op: nothing is written to ``folder_path``.

        The hybrid keeps only the mixing weights and references to the wrapped
        recommenders, and this override makes a save request do nothing.
        """
        pass

In [41]:
import os

# Folder that receives the files written by the hyperparameter search.
output_folder_path = "result_experiments/"

# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget and optimisation target, passed to the search call further down.
n_cases = 500
n_random_starts = int(n_cases*0.3)  # 30% of the cases
metric_to_optimize = "MAP"
cutoff_to_optimize = 10  # matches the cutoff_list=[10] given to the validation evaluator

In [42]:
from skopt.space import Real, Integer, Categorical

# ---------------------------------------------------------------------------
# Search spaces for the Bayesian optimisation, one dictionary per model family.
# Keys are the hyperparameter names, values the skopt dimension to sample from.
# A single-element Categorical pins that hyperparameter to a fixed value.
# ---------------------------------------------------------------------------

# SSLIM: the three SLIM parameters are pinned; only the ICM-related ones vary.
hyperparameters_range_dictionary_SSLIM = {
    "topK": Categorical([950]),
    "l1_ratio": Categorical([0.004999999999999999]),
    "alpha": Categorical([0.0015746723778813712]),
    "nof_icms": Integer(low=1, high=4),
    "icm_coeff": Real(low=0.25, high=4),
}

# SLIM, narrow ranges.
hyperparameters_range_dictionary_SLIM_focus = {
    "topK": Integer(low=450, high=1000),
    "l1_ratio": Real(low=0.0001, high=0.001, prior="log-uniform"),
    "alpha": Real(low=0.01, high=0.1, prior="uniform"),
}

# SLIM, wide ranges.
hyperparameters_range_dictionary_SLIM = {
    "topK": Integer(low=5, high=1000),
    "l1_ratio": Real(low=1e-5, high=1.0, prior="log-uniform"),
    "alpha": Real(low=1e-3, high=1.0, prior="uniform"),
}

# EASE_R.
hyperparameters_range_dictionary_EASE = {
    "topK": Integer(low=10, high=2000, prior="uniform"),
    "normalize_matrix": Categorical([False]),
    "l2_norm": Real(low=1e0, high=1e7, prior="log-uniform"),
}

# RP3beta with the stacked matrix option pinned to True.
hyperparameters_range_dictionary_RP3 = {
    "topK": Integer(low=175, high=230),
    "alpha": Real(low=0.2, high=0.5, prior="uniform"),
    "beta": Real(low=0.3, high=0.6, prior="uniform"),
    "normalize_similarity": Categorical([True]),
    "stacked": Categorical([True]),
    "icm_coeff": Real(low=0.4, high=0.6, prior="uniform"),
}

# "Superior" RP3: two free weights.
hyperparameters_range_dictionary_SuperiorRP3 = {
    "alpha": Real(low=0, high=100, prior="uniform"),
    "beta": Real(low=0, high=100, prior="uniform"),
}

# SLIM BPR.
hyperparameters_range_dictionary_SLIMBPR = {
    "topK": Integer(low=20, high=550),
    "epochs": Categorical([1500]),
    "symmetric": Categorical([True]),
    "sgd_mode": Categorical(["sgd", "adagrad"]),
    "lambda_i": Real(low=1e-5, high=1e-4, prior="log-uniform"),
    "lambda_j": Real(low=1e-6, high=1e-4, prior="log-uniform"),
    "learning_rate": Real(low=0.05, high=0.15, prior="log-uniform"),
    "random_seed": Categorical([42]),
    "stack": Categorical([True]),
    "icm_coeff": Real(low=0.5, high=2, prior="uniform"),
    "normalize_similarity": Categorical([True, False]),
}

# Factorization machine (LightFM-style parameter names).
hyperparameters_range_dictionary_FM = {
    "epochs": Categorical([300]),
    "n_components": Integer(low=1, high=200),
    "loss": Categorical(["bpr", "warp", "warp-kos"]),
    "sgd_mode": Categorical(["adagrad", "adadelta"]),
    "learning_rate": Real(low=1e-6, high=1e-1, prior="log-uniform"),
    "item_alpha": Real(low=1e-5, high=1e-2, prior="log-uniform"),
    "user_alpha": Real(low=1e-5, high=1e-2, prior="log-uniform"),
}

# Mixing weights of the score hybrids. Only alpha and beta are active;
# the commented entries are kept for the hybrids that take more weights.
hyperparameters_range_dictionary_hybrid = {
    "alpha": Real(low=0, high=100, prior="uniform"),
    "beta": Real(low=0, high=100, prior="uniform"),
    # "gamma": Real(low=0, high=100, prior="uniform"),
    # "delta": Real(low=0, high=30, prior="uniform"),
    # "topK": Integer(low=400, high=700, prior="uniform"),
}

# KNN, full search.
hyperparameters_range_dictionary_KNN = {
    "similarity": Categorical(["cosine", "jaccard", "asymmetric", "dice", "tversky"]),
    "topK": Integer(low=10, high=500),
    "shrink": Integer(low=1, high=200),
    "normalize": Categorical([True, False]),
    "feature_weighting": Categorical(["none", "BM25", "TF-IDF"]),
}

# KNN, restricted search.
hyperparameters_range_dictionary_KNN_subset = {
    "similarity": Categorical(["cosine", "jaccard"]),
    "topK": Integer(low=35, high=70),
    "shrink": Integer(low=140, high=180),
    "normalize": Categorical([True]),
    "feature_weighting": Categorical(["TF-IDF"]),
}

# NMF.
hyperparameters_range_dictionary_NMF = {
    "num_factors": Integer(low=1, high=350),
    "solver": Categorical(["coordinate_descent", "multiplicative_update"]),
    "init_type": Categorical(["random", "nndsvda"]),
    "beta_loss": Categorical(["frobenius"]),
    "random_seed": Categorical([42]),
}

# SLIM RMSE. Reference values noted by the author:
# learning_rate=1e-5, beta=1, epochs=80, l1_reg=3e-5
hyperparameters_range_dictionary_SLIMRMSE = {
    "learning_rate": Real(low=1e-5, high=1e-3, prior="uniform"),
    "beta": Categorical([1]),
    "epochs": Integer(low=1, high=2, prior="uniform"),
    "l1_reg": Real(low=1e-5, high=1e-3, prior="uniform"),
}

# Matrix factorization with impressions.
hyperparameters_range_dictionary_MF_IMPR = {
    "sgd_mode": Categorical(["sgd", "adagrad", "adam"]),
    "epochs": Categorical([150]),
    "use_bias": Categorical([True, False]),
    "batch_size": Categorical([1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]),
    "num_factors": Integer(low=1, high=200),
    "item_reg": Real(low=1e-8, high=1e-4, prior="log-uniform"),
    "user_reg": Real(low=1e-8, high=1e-4, prior="log-uniform"),
    "positive_reg": Real(low=1e-8, high=1e-4, prior="log-uniform"),
    "negative_reg": Real(low=1e-8, high=1e-4, prior="log-uniform"),
    "learning_rate": Real(low=1e-4, high=1e-1, prior="log-uniform"),
    "negative_interactions_quota": Real(low=0.0, high=0.5, prior="uniform"),
    "random_seed": Categorical([42]),
}

In [43]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt
from RecSys2022.Recommenders.EASE_R import EASE_R_Recommender
from Recommenders.KNN.UserKNNCFRecommender import UserKNNCFRecommender
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from Recommenders.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from Recommenders.MatrixFactorization.NMFRecommender import NMFRecommender
from Recommenders.SLIM.SSLIMMSE import SSLIMRMSERecommender
# Model class tuned in this run: the two-recommender score hybrid defined above.
# Its RECOMMENDER_NAME is also used later to name and reload the result files.
recommender_class = DifferentLossScoresHybridRecommender2
# Create the Bayesian optimizer; it receives the recommender class and the
# evaluator built on the validation split.
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                           evaluator_validation=evaluator_validation)

In [47]:
import os

# Folder holding the pre-trained SLIM BPR models. The default is the original
# author's local path; set the RECSYS_SAVED_MODELS_DIR environment variable to
# load the models on another machine without editing this cell.
# NOTE(review): keep the trailing path separator, as in the default -
# presumably load_model joins folder and file name by concatenation; confirm.
SAVED_MODELS_DIR = os.environ.get(
    "RECSYS_SAVED_MODELS_DIR",
    "C:\\Users\\Andrea\\AppData\\Roaming\\JetBrains\\DataSpell2022.2\\projects\\RecSys2022\\data\\saved_models\\",
)

# First base model: SLIM BPR built on the plain training URM.
slimbpr_bin = SLIM_BPR_Cython(URM_train)
slimbpr_bin.load_model(SAVED_MODELS_DIR, "SLIMBPR_BIN_TRAIN")
# After loading, W_sparse is apparently a dict-like wrapper; unwrap the matrix
# stored under the 'W_sparse' key.
slimbpr_bin.W_sparse = slimbpr_bin.W_sparse.get('W_sparse')

# TanH training URM with the transposed ICM appended as extra rows.
stacked_URM_TanH_train = sps.vstack([URM_train_TanH, ICM_tr.T])

# Second base model: SLIM BPR built on the stacked TanH matrix.
slimbpr_tanh = SLIM_BPR_Cython(stacked_URM_TanH_train)
slimbpr_tanh.load_model(SAVED_MODELS_DIR, "SLIMBPR_TANH_TRAIN_STACKED")
slimbpr_tanh.W_sparse = slimbpr_tanh.W_sparse.get('W_sparse')

SLIM_BPR_Recommender: Loading model from file 'C:\Users\Andrea\AppData\Roaming\JetBrains\DataSpell2022.2\projects\RecSys2022\data\saved_models\SLIMBPR_BIN_TRAIN'
  (0, 18)	0.2626935
  (0, 320)	0.2029069
  (0, 329)	0.24602552
  (0, 822)	0.30130786
  (0, 1256)	0.24716282
  (0, 1451)	0.22538476
  (0, 1507)	0.239857
  (0, 1837)	0.20446162
  (0, 2185)	0.21781924
  (0, 2207)	0.24833192
  (0, 2213)	0.23224708
  (0, 2882)	0.21117224
  (0, 2883)	0.22454911
  (0, 3606)	0.25452262
  (0, 4228)	0.27893725
  (0, 4553)	0.2630109
  (0, 4564)	0.2305793
  (0, 4644)	0.22434404
  (0, 4794)	0.27656624
  (0, 4969)	0.23842192
  (0, 5257)	0.2579844
  (0, 5414)	0.250959
  (0, 5523)	0.282262
  (0, 5639)	0.21033141
  (0, 5723)	0.21987584
  :	:
  (24506, 17636)	0.22373872
  (24506, 17644)	0.28662798
  (24506, 17928)	0.28674176
  (24506, 18082)	0.27270553
  (24506, 18159)	0.21880895
  (24506, 19225)	0.35958183
  (24506, 19638)	0.31348163
  (24506, 19967)	0.4079425
  (24506, 20034)	0.27222744
  (24506, 20111)	0.268

In [48]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

# Arguments the search uses to build and fit the hybrid for every configuration.
# The positional constructor arguments map to (URM_train, recommender_1,
# recommender_2) of the hybrid: the stacked training matrix and the two
# pre-loaded SLIM BPR models. No fixed fit arguments are given here.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [stacked_URM_train, slimbpr_bin, slimbpr_tanh],
    # Alternative for a single (non-hybrid) recommender:
    #CONSTRUCTOR_POSITIONAL_ARGS = [stacked_URM_train],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {},
    # Enable only for recommenders that support early stopping:
    #EARLYSTOPPING_KEYWORD_ARGS = earlystopping_keywargs,
)

In [49]:
# Run the Bayesian search over the hybrid weights (alpha, beta).
# Result files go to output_folder_path, named after the recommender's
# RECOMMENDER_NAME; save_model="no" is passed so no model is requested to be saved.
hyperparameterSearch.search(recommender_input_args = recommender_input_args,
                            hyperparameter_search_space = hyperparameters_range_dictionary_hybrid,
                            n_cases = n_cases,
                            n_random_starts = n_random_starts,
                            save_model = "no",
                            output_folder_path = output_folder_path, # Where to save the results
                            output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                            metric_to_optimize = metric_to_optimize,
                            cutoff_to_optimize = cutoff_to_optimize,
                            )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'alpha': 23.438254877308523, 'beta': 91.96624444591818}
DifferentLossScoresHybridRecommender: URM Detected 3 ( 0.0%) users with no interactions.
EvaluatorHoldout: Processed 40666 (100.0%) in 28.50 sec. Users per second: 1427
SearchBayesianSkopt: New best config found. Config 0: {'alpha': 23.438254877308523, 'beta': 91.96624444591818} - results: PRECISION: 0.0490852, PRECISION_RECALL_MIN_DEN: 0.0827817, RECALL: 0.0777111, MAP: 0.0238072, MAP_MIN_DEN: 0.0394610, MRR: 0.1657171, NDCG: 0.0794636, F1: 0.0601669, HIT_RATE: 0.3368416, ARHR_ALL_HITS: 0.1972251, NOVELTY: 0.0042560, AVERAGE_POPULARITY: 0.2222107, DIVERSITY_MEAN_INTER_LIST: 0.9242714, DIVERSITY_HERFINDAHL: 0.9924249, COVERAGE_ITEM: 0.4848411, COVERAGE_ITEM_HIT: 0.0445587, ITEMS_IN_GT: 0.9929816, COVERAGE_USER: 0.9768671, COVERAGE_USER_HIT: 0.3290495, USERS_IN_GT: 0.9768671, DIVERSITY_GINI: 0.0316541, SHANNON_ENTROPY: 8.8290805, RAT

KeyboardInterrupt: 

In [50]:
from Recommenders.DataIO import DataIO

# Explore the results of the search: reload the metadata archive written to
# output_folder_path (file name = RECOMMENDER_NAME + "_metadata.zip").
data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

# Show which entries the metadata contains.
search_metadata.keys()

{'algorithm_name_recommender': 'DifferentLossScoresHybridRecommender'}
{'algorithm_name_recommender': 'DifferentLossScoresHybridRecommender', 'algorithm_name_search': 'SearchBayesianSkopt'}
{'algorithm_name_recommender': 'DifferentLossScoresHybridRecommender', 'algorithm_name_search': 'SearchBayesianSkopt', 'cutoff_to_optimize': 10}
{'algorithm_name_recommender': 'DifferentLossScoresHybridRecommender', 'algorithm_name_search': 'SearchBayesianSkopt', 'cutoff_to_optimize': 10, 'exception_list': [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, Non

dict_keys(['algorithm_name_recommender', 'algorithm_name_search', 'cutoff_to_optimize', 'exception_list', 'hyperparameters_best', 'hyperparameters_best_index', 'hyperparameters_df', 'metric_to_optimize', 'result_on_earlystopping_df', 'result_on_last', 'result_on_test_best', 'result_on_test_df', 'result_on_validation_best', 'result_on_validation_df', 'time_df', 'time_on_last_df', 'time_on_test_avg', 'time_on_test_total', 'time_on_train_avg', 'time_on_train_total', 'time_on_validation_avg', 'time_on_validation_total'])

In [51]:
# Validation metrics of every evaluated configuration (one row per case);
# all-NaN rows are presumably cases the interrupted search never reached.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.049085,0.082782,0.077711,0.023807,0.039461,0.165717,0.079464,0.060167,0.336842,0.197225,...,0.976867,0.329049,0.976867,0.031654,8.82908,0.992744,0.061407,0.653104,4.354175,0.239545
1,10,0.049567,0.083104,0.077896,0.023973,0.039537,0.165936,0.079725,0.060584,0.338268,0.198048,...,0.976867,0.330443,0.976867,0.03901,9.249291,0.994828,0.075677,0.684187,3.808614,0.245522
2,10,0.04957,0.083132,0.077923,0.023981,0.039547,0.166004,0.079746,0.060593,0.338489,0.198115,...,0.976867,0.330659,0.976867,0.038738,9.235183,0.994769,0.075148,0.683144,3.826384,0.245312
3,10,0.049567,0.083084,0.077878,0.023971,0.039531,0.165899,0.079714,0.060578,0.338243,0.198015,...,0.976867,0.330419,0.976867,0.039047,9.251114,0.994836,0.075748,0.684322,3.806262,0.24555
4,10,0.049575,0.083112,0.077903,0.023974,0.039537,0.165933,0.079729,0.060591,0.338292,0.198052,...,0.976867,0.330467,0.976867,0.039015,9.249518,0.994829,0.075686,0.684204,3.808252,0.245525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,10,,,,,,,,,,,...,,,,,,,,,,
496,10,,,,,,,,,,,...,,,,,,,,,,
497,10,,,,,,,,,,,...,,,,,,,,,,
498,10,,,,,,,,,,,...,,,,,,,,,,


In [52]:
# Hyperparameters of the best configuration stored in the search metadata.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'alpha': 79.6131479555512, 'beta': 79.16320148236152}

In [53]:
# Validation result of the best configuration; display its MAP value
# (MAP is the metric the search was asked to optimise).
best_result = search_metadata["result_on_validation_best"]
best_result["MAP"]

0.023993314878697575