In [47]:
# Copy everything under the RecSys_Course_AT_PoliMi-master input directory into
# the current working directory.
# NOTE(review): presumably this is what makes the Data_manager / Evaluation /
# Recommenders / HyperparameterTuning packages importable in the cells below —
# confirm against the dataset contents.
!cp -r ../input/recsysgit/RecSys_Course_AT_PoliMi-master/* ./

In [48]:
%config Completer.use_jedi = False
%load_ext Cython
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Evaluation.Evaluator import EvaluatorHoldout

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [49]:
def get_URM(csv_path='../input/binaryurmrecsys/URM_updated.csv'):
    """Load the user-item interaction table from a CSV file.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the CSV file to read. Defaults to the dataset path that
        was previously hard-coded, so a bare ``get_URM()`` call behaves
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file, as returned by ``pandas.read_csv`` with default
        options.
    """
    return pd.read_csv(csv_path)

In [50]:
# Load the interaction table once; the sparse-matrix cell below reads its
# "UserID", "ItemID" and "Data" columns.
URM_all_dataframe = get_URM()

In [51]:
# Display the loaded interactions (the rendered table is truncated for long frames).
URM_all_dataframe

Unnamed: 0,UserID,ItemID,Data
0,0,21,1
1,0,124,1
2,0,808,1
3,0,1326,1
4,0,1995,1
...,...,...,...
1051823,41628,15971,1
1051824,41628,19992,1
1051825,41628,20448,1
1051826,41628,22882,1


In [52]:
# Pull the three columns out as plain arrays: stored values plus their
# (row, column) coordinates.
urm_values = URM_all_dataframe["Data"].values
urm_rows = URM_all_dataframe["UserID"].values
urm_cols = URM_all_dataframe["ItemID"].values

# Assemble in COO form, then convert to CSR to obtain fast access to rows (users).
URM_all = sps.coo_matrix((urm_values, (urm_rows, urm_cols))).tocsr()
URM_all

<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
	with 1051828 stored elements in Compressed Sparse Row format>

In [53]:

from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Seed the RNG before the random hold-out so that the split — and therefore
# every validation score computed from it — can be reproduced on a re-run.
# NOTE(review): assumes the split helper draws from NumPy's global RNG —
# confirm against its source.
np.random.seed(42)

# Split the interactions into training and validation data (80/20).
URM_train, URM_valid = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)



In [54]:
from Evaluation.Evaluator import EvaluatorHoldout

# Create an evaluator object over the validation split (URM_valid).
# It is passed to the hyperparameter search below; cutoff_list=[10] requests
# a single cutoff of 10.
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 1509 ( 3.6%) Users that have less than 1 test interactions


In [55]:
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender

# Model class to tune: RP3beta (not SLIM BPR). The search cells below
# instantiate this class and use its RECOMMENDER_NAME for output file names.
recommender_class = RP3betaRecommender

In [56]:
import os

# Folder that receives the files written by the hyperparameter search.
output_folder_path = "result_experiments/"

# Create the folder if it is missing. exist_ok=True makes this a single call
# with no gap between "check" and "create".
os.makedirs(output_folder_path, exist_ok=True)

# Search budget, forwarded to hyperparameterSearch.search(...) below.
n_cases = 50
# 30% of the budget, truncated to an integer (15 for n_cases = 50).
n_random_starts = int(n_cases*0.3)

# Metric and cutoff the search is asked to optimise.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [57]:
from skopt.space import Real, Integer, Categorical

# Search space for RP3beta: one skopt dimension per hyperparameter that the
# search is allowed to vary. Keys are the names later reported in each tested
# configuration.
hyperparameters_range_dictionary = dict(
    alpha=Real(low=0, high=1, prior='uniform'),
    beta=Real(low=0, high=1, prior='uniform'),
    topK=Integer(1, 800),
    implicit=Categorical([True, False]),
)

In [58]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Create the Bayesian search object. It receives the recommender class to tune
# and the evaluator built on the validation split.
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_valid)

In [59]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
# Bundle the arguments needed to construct and fit the model.
# Two bundles are built: one whose constructor receives URM_train and one
# whose constructor receives URM_all. Both are handed to search(...) below,
# the second as `recommender_input_args_last_test`.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)
# Same shape of bundle, but the model is constructed on the full URM_all.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [60]:
# Run the Bayesian search. The call receives both argument bundles, the search
# space, the budget (n_cases / n_random_starts), the folder and file-name root
# for the result files, and the metric/cutoff pair to optimise.
# Results are read back from output_folder_path in the next cell.
hyperparameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'alpha': 0.47705262727224373, 'beta': 0.8428463052949936, 'topK': 654, 'implicit': False}
RP3betaRecommender: URM Detected 1 ( 0.0%) users with no interactions.
RP3betaRecommender: Similarity column 24507 (100.0%), 1069.59 column/sec. Elapsed time 22.91 sec
EvaluatorHoldout: Processed 40120 (100.0%) in 41.38 sec. Users per second: 970
SearchBayesianSkopt: New best config found. Config 0: {'alpha': 0.47705262727224373, 'beta': 0.8428463052949936, 'topK': 654, 'implicit': False} - results: PRECISION: 0.0197034, PRECISION_RECALL_MIN_DEN: 0.0273031, RECALL: 0.0240617, MAP: 0.0086584, MAP_MIN_DEN: 0.0116910, MRR: 0.0533846, NDCG: 0.0257957, F1: 0.0216655, HIT_RATE: 0.1252742, ARHR_ALL_HITS: 0.0674954, NOVELTY: 0.0063744, AVERAGE_POPULARITY: 0.0115232, DIVERSITY_MEAN_INTER_LIST: 0.9974360, DIVERSITY_HERFINDAHL: 0.9997411, COVERAGE_ITEM: 0.7896519, COVERAGE_ITEM_HIT: 0.0554535, ITEMS_IN_GT: 0.9

In [61]:
from Recommenders.DataIO import DataIO

# Explore the results of the search: load the "<RECOMMENDER_NAME>_metadata.zip"
# file from the output folder used by the search above.
data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

# List the entries available in the loaded metadata.
search_metadata.keys()

dict_keys(['time_on_train_avg', 'exception_list', 'result_on_validation_best', 'time_on_train_total', 'hyperparameters_df', 'time_on_validation_avg', 'metric_to_optimize', 'hyperparameters_best_index', 'time_on_test_avg', 'result_on_test_best', 'time_on_last_df', 'cutoff_to_optimize', 'result_on_last', 'result_on_validation_df', 'result_on_earlystopping_df', 'time_df', 'hyperparameters_best', 'time_on_test_total', 'algorithm_name_search', 'algorithm_name_recommender', 'time_on_validation_total', 'result_on_test_df'])

In [62]:
# Table of the hyperparameter configurations recorded by the search
# (one row per configuration, one column per hyperparameter).
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,alpha,beta,topK,implicit
0,0.477053,0.842846,654,False
1,0.400483,0.703917,530,False
2,0.649219,0.041219,463,True
3,0.793184,0.815937,651,False
4,0.445133,0.445443,249,True
5,0.793053,0.863134,386,False
6,0.324356,0.861594,523,False
7,0.52864,0.454117,114,False
8,0.508898,0.298038,552,False
9,0.805798,0.96058,63,False


In [63]:

# Validation metrics recorded by the search, indexed by configuration and cutoff.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.019703,0.027303,0.024062,0.008658,0.011691,0.053385,0.025796,0.021666,0.125274,0.067495,...,0.963751,0.120733,0.963751,0.278358,13.042244,1.000206,0.632186,0.989221,0.190673,0.586186
1,10,0.040252,0.061995,0.057118,0.019464,0.029492,0.120461,0.059561,0.047224,0.251097,0.151763,...,0.963751,0.241995,0.963751,0.255887,12.494844,0.999586,0.581153,0.947703,0.976793,0.528115
2,10,0.052866,0.093373,0.088191,0.025437,0.043922,0.166521,0.085977,0.066106,0.344716,0.204273,...,0.963751,0.33222,0.963751,0.031498,8.065012,0.986868,0.071537,0.611711,4.776695,0.367923
3,10,0.016605,0.022305,0.019309,0.007045,0.00916,0.04137,0.020616,0.017855,0.102642,0.053494,...,0.963751,0.098921,0.963751,0.317048,13.270938,1.000282,0.720057,1.006567,0.156023,0.591118
4,10,0.055334,0.095869,0.090364,0.028025,0.047846,0.179143,0.091357,0.068638,0.350573,0.222332,...,0.963751,0.337865,0.963751,0.113483,9.842293,0.99374,0.257735,0.746513,3.437879,0.41399
5,10,0.016478,0.022787,0.02004,0.007051,0.00955,0.043502,0.021264,0.018085,0.105708,0.054877,...,0.963751,0.101876,0.963751,0.293495,13.146411,1.000251,0.666566,0.997122,0.151449,0.590977
6,10,0.021194,0.030413,0.027206,0.009352,0.013184,0.06012,0.02878,0.023827,0.139581,0.074433,...,0.963751,0.134522,0.963751,0.251063,12.853716,1.000127,0.570197,0.974922,0.218541,0.580412
7,10,0.056291,0.097474,0.091833,0.028664,0.048727,0.181462,0.092829,0.069798,0.353415,0.226149,...,0.963751,0.340604,0.963751,0.100496,10.044786,0.995944,0.228238,0.761872,2.96963,0.417277
8,10,0.052572,0.091869,0.086687,0.025831,0.044453,0.168776,0.086177,0.065451,0.340503,0.207272,...,0.963751,0.328161,0.963751,0.122535,9.328786,0.988991,0.278292,0.707565,4.252629,0.402145
9,10,0.012026,0.018549,0.016926,0.005576,0.00849,0.039343,0.018005,0.014062,0.086092,0.046509,...,0.963751,0.082971,0.963751,0.166579,12.194948,0.999946,0.378323,0.924956,0.088432,0.60369


In [64]:
# Best configuration found by the search, as a dict of hyperparameter name -> value.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'alpha': 0.5286401694524384,
 'beta': 0.4541169423300475,
 'topK': 114,
 'implicit': False}

In [65]:
# Train the final model on all available interactions (URM_all), using the
# best hyperparameters found by the search above.
# The dict must be unpacked into fit(): calling fit() with no arguments would
# train with the recommender's default settings and discard the tuning result.
# NOTE(review): assumes fit() accepts the searched names (alpha, beta, topK,
# implicit) as keyword arguments, as the search itself passes them — confirm.
recommender = RP3betaRecommender(URM_all)
recommender.fit(**best_hyperparameters)

RP3betaRecommender: Similarity column 24507 (100.0%), 2200.86 column/sec. Elapsed time 11.14 sec


In [66]:
# Load the target users for the submission; the "user_id" column is read by
# the recommendation cell below.
test_users = pd.read_csv('../input/recommender-system-2022-challenge-polimi/data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
41111,41624
41112,41625
41113,41626
41114,41627


In [67]:
# Ask the fitted model for 10 items per target user, keeping the order of the
# users in the target file.
user_id = test_users['user_id']
recommendations = [
    recommender.recommend(target_user, cutoff = 10)
    for target_user in user_id
]

In [68]:
# Keep every recommendation list as a NumPy array.
for index, items in enumerate(recommendations):
    recommendations[index] = np.array(items)

# Serialise each list as item ids separated by exactly one space.
# Joining the ids explicitly avoids depending on NumPy's array-to-string
# formatting, which pads entries with extra spaces and would wrap or elide
# longer arrays.
test_users['item_list'] = [
    " ".join(str(item) for item in items)
    for items in recommendations
]

# Write the submission file without the DataFrame index column.
test_users.to_csv('submission.csv', index=False)