In [139]:
# Copy the course framework sources from the Kaggle input dataset into the working
# directory so that its packages (Data_manager, Evaluation, Recommenders, ...) are importable.
!cp -r ../input/recsysgit/RecSys_Course_AT_PoliMi-master/* ./

In [140]:
# Notebook setup: turn off the jedi completer and load the Cython IPython extension.
%config Completer.use_jedi = False
%load_ext Cython
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
# Helpers from the course framework copied into the working directory above.
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Evaluation.Evaluator import EvaluatorHoldout

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [141]:
def get_URM(path='../input/urm-recsys/URM.csv'):
    """Load the user-item interactions from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the interactions CSV. Defaults to the Kaggle input path
        used by this notebook, so calling ``get_URM()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file. In this notebook it holds the columns
        ``UserID``, ``ItemID`` and ``Data``.
    """
    return pd.read_csv(path)

In [145]:
# Load the interactions as a DataFrame (columns UserID, ItemID, Data) and display it.
URM_all = get_URM()
URM_all

Unnamed: 0,UserID,ItemID,Data
0,0,11,1
1,0,21,1
2,0,22,1
3,0,24,1
4,0,44,1
...,...,...,...
663365,41628,11228,1
663366,41628,15033,1
663367,41628,15181,1
663368,41628,20896,1


In [None]:
# Run the framework's Cython compilation script.
# NOTE(review): this cell has no execution count, so it was not run in the saved
# session; presumably it must run before the Cython recommenders are imported - confirm.
!python run_compile_all_cython.py

In [146]:
# Build the sparse user-item matrix from the interactions table.
# The table is read into its own name instead of consuming `URM_all` in place:
# the previous version rebuilt `URM_all` from itself, so re-running the cell failed
# because `URM_all` was by then a sparse matrix rather than a DataFrame.
interactions_df = get_URM()
URM_all = sps.coo_matrix((interactions_df["Data"].values,
                          (interactions_df["UserID"].values, interactions_df["ItemID"].values)))
URM_all = URM_all.tocsr() # to obtain fast access to rows (users)
URM_all

<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
	with 663370 stored elements in Compressed Sparse Row format>

In [148]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Split the interactions into train and validation data, 80/20 (train_percentage = 0.80).
# NOTE(review): no random seed is set in this notebook, so the split presumably
# differs between runs - confirm.
URM_train, URM_valid = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)



In [149]:
from Evaluation.Evaluator import EvaluatorHoldout

# Evaluator built on the validation split, with a single cutoff of 10.
# It is used later both for early stopping and by the hyperparameter search.
evaluator_valid = EvaluatorHoldout(URM_valid, cutoff_list=[10])

EvaluatorHoldout: Ignoring 6852 (16.5%) Users that have less than 1 test interactions


In [150]:
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython

# Recommender class to tune: the Cython implementation of SLIM BPR.
recommender_class = SLIM_BPR_Cython

In [151]:
import os

# Folder where the hyperparameter search stores its results.
output_folder_path = "result_experiments/"

# Create the folder if it is missing; exist_ok avoids a separate existence check.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget: total number of configurations to evaluate, and how many of
# them are drawn at random (30%).
n_cases = 10
n_random_starts = int(n_cases*0.3)

# Metric and cutoff that the search optimises.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [152]:
from skopt.space import Real, Integer, Categorical
# Search space for the SLIM BPR fit arguments explored by the Bayesian search.
# "epochs" is a single-value Categorical, so it is not searched. With the early-stopping
# settings used in this notebook, the epoch count recorded per configuration can be below 700.

hyperparameters_range_dictionary = {
    "epochs": Categorical([700]),
    "sgd_mode": Categorical(["sgd", "adagrad", "adam"]),
    "topK": Integer(5, 700),
    "lambda_i": Real(low = 1e-4, high = 1e-1, prior = 'log-uniform'),
    "lambda_j": Real(low = 1e-4, high = 1e-1, prior = 'log-uniform'),
    "learning_rate": Real(low = 1e-4, high = 1e-1, prior = 'log-uniform')
}

In [153]:
#We also setup the early stopping 
earlystopping_keywargs = {"validation_every_n": 15,
                          "stop_on_validation": True,
                          "evaluator_object": evaluator_valid,
                          "lower_validations_allowed": 5,
                          "validation_metric": metric_to_optimize,
                          }

In [154]:

from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

# Create the Bayesian search object for the chosen recommender class;
# candidate configurations are scored with the validation evaluator.
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_valid)

In [155]:
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
# Constructor and fit arguments used to instantiate the model during the search.
# The first set builds the model on URM_train, the second ("last") on URM_all.
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)
# NOTE(review): the final fit on URM_all receives the same early-stopping settings,
# whose evaluator is built on URM_valid, a subset of URM_all. Confirm how the
# framework handles early stopping for the last fit.
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_all],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)

In [156]:
# Install the pyximport import hook.
# NOTE(review): presumably this lets .pyx modules be compiled on import, as an
# alternative to the compile script - confirm it is still needed.
import pyximport
pyximport.install()

(None, <pyximport.pyximport.PyxImporter at 0x7fd31465c890>)

In [157]:
# NOTE(review): MatrixFactorization_Cython_Epoch is not referenced anywhere in this
# cell; presumably it is imported only to check/trigger compilation - confirm or remove.
from Recommenders.MatrixFactorization.Cython.MatrixFactorization_Cython_Epoch import MatrixFactorization_Cython_Epoch
# Run the Bayesian search: n_cases configurations, the first n_random_starts of them random.
hyperparameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'epochs': 700, 'sgd_mode': 'adam', 'topK': 76, 'lambda_i': 0.00012813490553592323, 'lambda_j': 0.003951141901377312, 'learning_rate': 0.014787390357590134}
SLIM_BPR_Recommender: URM Detected 99 ( 0.2%) users with no interactions.
SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 23678.00 MB (73.74%) of 32110.00 MB, required is 2402.37 MB. Using dense matrix.
Processed 41629 (100.0%) in 0.79 sec. BPR loss is 3.96E-01. Sample per second: 52458
SLIM_BPR_Recommender: Epoch 1 of 700. Elapsed time 0.13 sec
Processed 41629 (100.0%) in 0.92 sec. BPR loss is 1.74E+00. Sample per second: 45044
SLIM_BPR_Recommender: Epoch 2 of 700. Elapsed time 0.26 sec
Processed 41629 (100.0%) in 1.05 sec. BPR loss is 3.32E+00. Sample per second: 39470
SLIM_BPR_Recommender: Epoch 3 of 700. Elapsed time 0.39 sec
Processed 41629 (100.0%) in 0.18 sec. BPR loss is 5.13E+00. Sample per s

In [158]:
from Recommenders.DataIO import DataIO

# Load the metadata archive written by the search into the output folder
# and list the entries it contains.
data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

search_metadata.keys()

dict_keys(['time_on_train_avg', 'result_on_validation_best', 'result_on_earlystopping_df', 'result_on_test_best', 'exception_list', 'result_on_validation_df', 'time_on_last_df', 'hyperparameters_best', 'hyperparameters_df', 'algorithm_name_recommender', 'time_on_test_avg', 'time_on_train_total', 'metric_to_optimize', 'time_on_validation_avg', 'cutoff_to_optimize', 'hyperparameters_best_index', 'result_on_test_df', 'time_df', 'time_on_test_total', 'algorithm_name_search', 'result_on_last', 'time_on_validation_total'])

In [159]:
# Hyperparameters of each evaluated configuration, one row per search iteration.
hyperparameters_df = search_metadata["hyperparameters_df"]
hyperparameters_df

Unnamed: 0,epochs,sgd_mode,topK,lambda_i,lambda_j,learning_rate
0,15,adam,76,0.000128,0.003951,0.014787
1,675,adagrad,630,0.017002,0.098362,0.001084
2,15,adam,183,0.00069,0.005216,0.000137
3,690,sgd,638,0.062634,0.029654,0.001159
4,15,adam,617,0.003188,0.080786,0.010919
5,540,adagrad,539,0.08791,0.068885,0.001383
6,405,sgd,17,0.0001,0.001053,0.000896
7,210,adagrad,494,0.0001,0.005359,0.016164
8,15,adam,424,0.000363,0.00026,0.000154
9,15,adagrad,11,0.007213,0.00161,0.097715


In [160]:
# Validation metrics of each evaluated configuration, indexed by iteration and cutoff.
result_on_validation_df = search_metadata["result_on_validation_df"]
result_on_validation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
Unnamed: 0_level_1,cutoff,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,10,0.02349,0.049095,0.046264,0.010498,0.02207,0.077612,0.042095,0.031159,0.171378,0.089799,...,0.835403,0.143169,0.835403,0.247138,10.901305,0.993387,0.517578,0.811349,4.726453,0.69793
1,10,0.023932,0.050863,0.048034,0.010679,0.022566,0.079329,0.043176,0.031947,0.17549,0.091575,...,0.835403,0.146605,0.835403,0.239965,10.5812,0.991356,0.502557,0.787525,5.334283,0.686072
2,10,0.022601,0.048372,0.045756,0.009962,0.021421,0.075168,0.040952,0.030257,0.168473,0.086141,...,0.835403,0.140743,0.835403,0.23314,10.487526,0.990753,0.488262,0.780553,5.364607,0.683062
3,10,0.022926,0.049309,0.046706,0.010084,0.021673,0.076875,0.041681,0.030755,0.172442,0.087706,...,0.835403,0.144058,0.835403,0.206499,10.410476,0.990938,0.432469,0.774818,5.497528,0.676331
4,10,0.02257,0.048515,0.045898,0.009768,0.02098,0.073882,0.040536,0.03026,0.169279,0.084591,...,0.835403,0.141416,0.835403,0.235387,10.43018,0.989457,0.492969,0.776285,5.640189,0.682654
5,10,0.024671,0.051397,0.048398,0.011102,0.022919,0.081149,0.043952,0.032682,0.17756,0.094379,...,0.835403,0.148334,0.835403,0.243399,10.861509,0.993899,0.509747,0.808387,4.78406,0.693896
6,10,0.022777,0.047486,0.044774,0.010494,0.021822,0.078503,0.041487,0.030194,0.167812,0.090338,...,0.835403,0.140191,0.835403,0.227501,11.141554,0.995889,0.476452,0.82923,3.996637,0.702904
7,10,0.024289,0.051592,0.048739,0.010785,0.022862,0.079951,0.043738,0.032421,0.177962,0.092428,...,0.835403,0.14867,0.835403,0.253359,10.834211,0.992408,0.530607,0.806356,5.053791,0.695538
8,10,0.022072,0.047397,0.04489,0.009628,0.02072,0.073194,0.039892,0.029593,0.165857,0.083591,...,0.835403,0.138557,0.835403,0.226796,10.280434,0.989002,0.474977,0.76514,5.753734,0.676358
9,10,0.01844,0.035241,0.032533,0.008482,0.016131,0.062801,0.031703,0.023539,0.136498,0.072607,...,0.835403,0.114031,0.835403,0.321205,12.605439,0.99952,0.672697,0.938183,1.424423,0.788976


In [161]:
# Best configuration found by the search, as a dict of fit keyword arguments.
best_hyperparameters = search_metadata["hyperparameters_best"]
best_hyperparameters

{'epochs': 540,
 'sgd_mode': 'adagrad',
 'topK': 539,
 'lambda_i': 0.08791041776372564,
 'lambda_j': 0.0688853074825555,
 'learning_rate': 0.001382728691456265}

In [163]:
# Retrain on the full interaction matrix with the best configuration found by the search.
# Unpacking `best_hyperparameters` (epochs, sgd_mode, topK, lambda_i, lambda_j,
# learning_rate) keeps this cell in sync with the search results instead of
# duplicating the values by hand.
recommender = SLIM_BPR_Cython(URM_all)
recommender.fit(**best_hyperparameters)

SLIM_BPR_Recommender: URM Detected 22 ( 0.1%) users with no interactions.
SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 21002.00 MB (65.41%) of 32110.00 MB, required is 2402.37 MB. Using dense matrix.
Processed 41629 (100.0%) in 1.07 sec. BPR loss is 5.02E-05. Sample per second: 38801
SLIM_BPR_Recommender: Epoch 1 of 540. Elapsed time 0.13 sec
Processed 41629 (100.0%) in 0.22 sec. BPR loss is 1.55E-04. Sample per second: 187992
SLIM_BPR_Recommender: Epoch 2 of 540. Elapsed time 0.28 sec
Processed 41629 (100.0%) in 0.36 sec. BPR loss is 2.63E-04. Sample per second: 116691
SLIM_BPR_Recommender: Epoch 3 of 540. Elapsed time 0.42 sec
Processed 41629 (100.0%) in 0.49 sec. BPR loss is 3.39E-04. Sample per second: 84886
SLIM_BPR_Recommender: Epoch 4 of 540. Elapsed time 0.55 sec
Processed 41629 (100.0%) in 0.63 sec. BPR loss is 4.91E-04. Sample per second: 66567
SLIM_BPR_Recommender: Epoch 5 of 540. Elapsed time 0.69 sec
Processed 41629 (100.0%) in 0.76 sec

(       PRECISION PRECISION_RECALL_MIN_DEN RECALL  MAP MAP_MIN_DEN  MRR NDCG  \
 cutoff                                                                        
 10           0.0                      0.0    0.0  0.0         0.0  0.0  0.0   
 
          F1 HIT_RATE ARHR_ALL_HITS  ... COVERAGE_USER COVERAGE_USER_HIT  \
 cutoff                              ...                                   
 10      0.0      0.0           0.0  ...      0.835403               0.0   
 
        USERS_IN_GT DIVERSITY_GINI SHANNON_ENTROPY RATIO_DIVERSITY_HERFINDAHL  \
 cutoff                                                                         
 10        0.835403       0.187826       10.415943                    0.99285   
 
        RATIO_DIVERSITY_GINI RATIO_SHANNON_ENTROPY RATIO_AVERAGE_POPULARITY  \
 cutoff                                                                       
 10                 0.388434              0.774713                 5.161547   
 
        RATIO_NOVELTY  
 cutoff             

In [165]:
# Load the users for which recommendations must be submitted (column user_id) and display them.
test_users = pd.read_csv('../input/recsyschallenge/data_target_users_test.csv')
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
41111,41624
41112,41625
41113,41626
41114,41627


In [167]:
# Ask the trained recommender for the top-10 items of every target user,
# keeping the results in the same order as the user_id column.
user_id = test_users['user_id']
recommendations = [
    recommender.recommend(target_user, cutoff = 10)
    for target_user in user_id
]

In [169]:
# Keep each user's recommendations as a numpy array, as before.
recommendations = [np.asarray(user_items) for user_items in recommendations]

# Serialise each list as item ids separated by exactly one space.
# Joining the ids explicitly avoids relying on the text form of a numpy array,
# which pads numbers with extra spaces and wraps long arrays over several lines.
test_users['item_list'] = [
    " ".join(str(item_id) for item_id in user_items)
    for user_items in recommendations
]

# Write the submission file with columns user_id,item_list and no index column.
test_users.to_csv('submission.csv', index=False)