# Hyperparameter Optimization

### Cloning the GitHub repository

In [1]:
# Fetch the GitHub access token from Kaggle's secret store instead of
# hard-coding it in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("token")

# IPython interpolates {token} into the shell command before running it.
# NOTE(review): the token becomes part of the clone URL — presumably it is then
# kept in the cloned repo's remote configuration; confirm before sharing the
# working directory.
!git clone https://{token}@github.com/Benedart/RecSys-2022-Challenge-Polimi.git

Cloning into 'RecSys-2022-Challenge-Polimi'...
remote: Enumerating objects: 216, done.[K
remote: Counting objects: 100% (216/216), done.[K
remote: Compressing objects: 100% (175/175), done.[K
remote: Total 216 (delta 42), reused 212 (delta 38), pack-reused 0[K
Receiving objects: 100% (216/216), 23.36 MiB | 24.82 MiB/s, done.
Resolving deltas: 100% (42/42), done.


#### Compiling Cython files

In [2]:
import os

os.chdir( "./RecSys-2022-Challenge-Polimi")
!python run_compile_all_cython.py

run_compile_all_cython: Found 10 Cython files in 4 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/opt/conda/bin/python'
Compiling [1/10]: MatrixFactorizationImpressions_Cython_Epoch.pyx... 
In file included from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarraytypes.h:1969[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/arrayobject.h:4[m[K,
                 from [01m[KMatrixFactorizationImpressions_Cython_Epoch.c:746[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorizationImpressions_Cython_Epoch.c:[m[K In function ‘[01m[K__pyx_f_43MatrixFactorizationImpressions_Cython_Epoch_32MatrixFactorization_Cython_Epoch_sampleBPR_Cython[m[K’:
12758 |       [01;35m[K__pyx_t_4 = (__pyx_v_start_pos_impression_ite

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.sparse as sps
import csv

from Recommenders.Recommender_import_list import *

#### Importing URM & ICM

In [4]:
# Kaggle input folder that holds the challenge data files.
DATA_DIR = "/kaggle/input/recommender-system-2022-challenge-polimi"

# Interactions table: one row per (UserID, ItemID) event.
# "Impressions" mixes text with missing values; declaring it as a string column
# up front avoids pandas' chunk-wise type guessing and the mixed-type warning.
# NOTE(review): assumes the warning printed under this cell came from the
# "Impressions" column — confirm on the next run.
URM_all = pd.read_csv(DATA_DIR + "/interactions_and_impressions.csv",
                      dtype={"Impressions": str})

# Item-content table: one row per (item_id, feature_id) pair.
ICM_all = pd.read_csv(DATA_DIR + "/data_ICM_type.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Preview the raw interactions table (the notebook renders a truncated view).
URM_all

Unnamed: 0,UserID,ItemID,Impressions,Data
0,0,11,012345678910111213141516171819,1
1,0,21,,0
2,0,21,,0
3,0,21,20212223242526272829,0
4,0,21,,1
...,...,...,...,...
5826501,41628,20448,,0
5826502,41628,20896,,1
5826503,41628,21506,,1
5826504,41628,22882,,0


In [6]:
# Preview the raw item-feature table (the notebook renders a truncated view).
ICM_all

Unnamed: 0,item_id,feature_id,data
0,0,1,1
1,1,3,1
2,2,4,1
3,3,1,1
4,4,3,1
...,...,...,...
23086,27963,1,1
23087,27964,2,1
23088,27965,1,1
23089,27966,1,1


#### Getting rid of duplicates and empty indices
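Let's start with the indices: user, item and feature IDs are remapped to contiguous, zero-based indices, and items use a single mapping shared by the URM and the ICM so that both stay consistent.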

In [7]:
# Users: map every original UserID found in the URM to a contiguous index.
mapped_id, original_id = pd.factorize(URM_all["UserID"].unique())
user_original_Id_to_index = pd.Series(mapped_id, index=original_id)

# This count refers to users, so the message says UserID (it said ItemID before).
print("Unique UserID in the URM are {}".format(len(original_id)))

# Items: build ONE mapping over the ids seen in either the URM or the ICM, so
# the same item gets the same index in both tables.
all_item_indices = pd.concat([URM_all["ItemID"], ICM_all["item_id"]], ignore_index=True)
mapped_id, original_id = pd.factorize(all_item_indices.unique())
item_original_ID_to_index = pd.Series(mapped_id, index=original_id)

print("Unique ItemID in the URM and ICM are {}".format(len(original_id)))

# Features: feature ids only exist in the ICM.
mapped_id, original_id = pd.factorize(ICM_all["feature_id"].unique())
feature_original_ID_to_index = pd.Series(mapped_id, index=original_id)

# Features are read from the ICM, so the message names the ICM (it said URM before).
print("Unique FeatureID in the ICM are {}".format(len(feature_original_ID_to_index)))

# Replace the original ids with the mapped indices in both tables.
URM_all["UserID"] = URM_all["UserID"].map(user_original_Id_to_index)
URM_all["ItemID"] = URM_all["ItemID"].map(item_original_ID_to_index)
ICM_all["item_id"] = ICM_all["item_id"].map(item_original_ID_to_index)
ICM_all["feature_id"] = ICM_all["feature_id"].map(feature_original_ID_to_index)

Unique ItemID in the URM are 41629
Unique ItemID in the URM and ICM are 27968
Unique FeatureID in the URM are 5


And then we remove duplicate values

In [8]:
# Discard the impression lists, then keep one row per distinct
# (UserID, ItemID, Data) combination.
URM_all = URM_all.drop(columns="Impressions").drop_duplicates()

# Discard the "data" column, then keep one row per distinct
# (item_id, feature_id) pair.
ICM_all = ICM_all.drop(columns="data").drop_duplicates()

In [9]:
# Preview the interactions after id remapping and duplicate removal.
URM_all

Unnamed: 0,UserID,ItemID,Data
0,0,0,1
1,0,1,0
4,0,1,1
13,0,2,1
28,0,3,1
...,...,...,...
5826501,41628,3700,0
5826502,41628,12694,1
5826503,41628,15132,1
5826504,41628,18128,0


In [10]:
# Preview the item-feature pairs after id remapping and duplicate removal.
ICM_all

Unnamed: 0,item_id,feature_id
0,3668,0
1,1945,1
2,18373,2
3,7153,0
4,3423,1
...,...,...
23086,27963,0
23087,27964,3
23088,27965,0
23089,27966,0


#### Creating the matrices

In [11]:
# Matrix dimensions come from the sizes of the id -> index mappings.
n_users = len(user_original_Id_to_index)
n_items = len(item_original_ID_to_index)
n_features = len(feature_original_ID_to_index)

# URM: users on rows, items on columns, the "Data" column as stored value.
# NOTE(review): (UserID, ItemID) pairs that still appear more than once are
# summed by the sparse constructor, and rows with Data == 0 presumably end up
# as explicitly stored zeros — confirm this is the intended encoding.
urm_rows = URM_all["UserID"].values
urm_cols = URM_all["ItemID"].values
URM_all = sps.csr_matrix((URM_all["Data"].values, (urm_rows, urm_cols)),
                         shape=(n_users, n_items))

# ICM: items on rows, features on columns, every listed pair stored as 1.
icm_rows = ICM_all["item_id"].values
icm_cols = ICM_all["feature_id"].values
ICM_all = sps.csr_matrix((np.ones(len(icm_rows)), (icm_rows, icm_cols)),
                         shape=(n_items, n_features))

# Reset every stored value to 1 so the ICM is strictly binary.
ICM_all.data = np.ones_like(ICM_all.data)

# Row-wise layout: consecutive indptr differences = stored features per item.
ICM_all = ICM_all.tocsr()
features_per_item = np.ediff1d(ICM_all.indptr)

# Column-wise layout: consecutive indptr differences = stored items per feature.
ICM_all = ICM_all.tocsc()
items_per_feature = np.ediff1d(ICM_all.indptr)

# Back to CSR for the rest of the notebook.
ICM_all = ICM_all.tocsr()

#### Splitting data into train/validation/test

In [12]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Evaluation.Evaluator import EvaluatorHoldout

# Fix the seed so the random hold-out split (and every metric computed on it)
# is the same after a kernel restart.
# NOTE(review): assumes the split helper samples through NumPy's global RNG —
# confirm against Data_manager.split_functions.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# First split: 80% train+validation, 20% test.
URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)
# Second split: 80% of the remaining interactions for training, 20% for validation.
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train, train_percentage = 0.80)

# Hold-out evaluators for the validation and test matrices, at cutoff 10.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])

EvaluatorHoldout: Ignoring 9386 (22.5%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 6888 (16.5%) Users that have less than 1 test interactions


#### Creating SLIM BPR model

In [13]:
# Baseline model: SLIM BPR trained on the training split, using fit()'s
# default arguments (no hyperparameters are passed here).
SLIM_BPR = SLIM_BPR_Cython(URM_train)
SLIM_BPR.fit()

SLIM_BPR_Recommender: URM Detected 319 ( 0.8%) users with no interactions.
SLIM_BPR_Recommender: URM Detected 3467 (12.4%) items with no interactions.
SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 30460.00 MB (94.86%) of 32110.00 MB, required is 3128.84 MB. Using dense matrix.
Processed 41629 (100.0%) in 0.23 sec. BPR loss is 6.99E-08. Sample per second: 180679
SLIM_BPR_Recommender: Epoch 1 of 300. Elapsed time 0.08 sec
Processed 41629 (100.0%) in 0.32 sec. BPR loss is 2.19E-07. Sample per second: 130469
SLIM_BPR_Recommender: Epoch 2 of 300. Elapsed time 0.17 sec
Processed 41629 (100.0%) in 0.42 sec. BPR loss is 3.55E-07. Sample per second: 99831
SLIM_BPR_Recommender: Epoch 3 of 300. Elapsed time 0.27 sec
Processed 41629 (100.0%) in 0.53 sec. BPR loss is 4.89E-07. Sample per second: 79178
SLIM_BPR_Recommender: Epoch 4 of 300. Elapsed time 0.38 sec
Processed 41629 (100.0%) in 0.63 sec. BPR loss is 5.99E-07. Sample per second: 65924
SLIM_BPR_Recommende

Setting up evaluators

In [14]:
from Evaluation.Evaluator import EvaluatorHoldout

# Both evaluators report metrics at the same recommendation-list cutoffs.
# Each evaluator receives its own list built from this tuple.
cutoffs = (5, 10, 20)

evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=list(cutoffs))
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=list(cutoffs))

# Score the baseline SLIM BPR model on the test split; the returned value is
# displayed as the cell output.
evaluator_test.evaluateRecommender(SLIM_BPR)

EvaluatorHoldout: Ignoring 9386 (22.5%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 6888 (16.5%) Users that have less than 1 test interactions
EvaluatorHoldout: Processed 34741 (100.0%) in 34.79 sec. Users per second: 999


(       PRECISION PRECISION_RECALL_MIN_DEN    RECALL       MAP MAP_MIN_DEN  \
 cutoff                                                                      
 5       0.028025                  0.03662  0.028369  0.016283    0.021385   
 10      0.022276                 0.046055  0.043339  0.009856    0.020501   
 20      0.016908                 0.063476  0.062884  0.005726    0.021654   
 
              MRR      NDCG        F1  HIT_RATE ARHR_ALL_HITS  ...  \
 cutoff                                                        ...   
 5       0.067933  0.036079  0.028196  0.118333      0.074387  ...   
 10      0.074325  0.039663  0.029427  0.166201      0.085243  ...   
 20      0.077656  0.046304   0.02665  0.214185      0.093129  ...   
 
        COVERAGE_USER COVERAGE_USER_HIT USERS_IN_GT DIVERSITY_GINI  \
 cutoff                                                              
 5           0.834538          0.098753    0.834538       0.207566   
 10          0.834538          0.138701    0.8

#### Hyperparameter tuning

In [15]:
from HyperparameterTuning.run_hyperparameter_search import runHyperparameterSearch_Collaborative
from functools import partial

# Folder passed to the search as its output location.
output_folder_path = "result_experiments/SKOPT_test/"

# Settings shared by every search launched through the partial below.
# The target metric is MAP at cutoff 10, and the same validation evaluator is
# used both for early stopping and for scoring each configuration.
# NOTE(review): n_cases / n_random_starts presumably mean "10 configurations,
# the first 3 sampled at random" — confirm against run_hyperparameter_search.
search_settings = {
    "URM_train": URM_train,
    "metric_to_optimize": "MAP",
    "cutoff_to_optimize": 10,
    "n_cases": 10,
    "n_random_starts": 3,
    "evaluator_validation_earlystopping": evaluator_validation,
    "evaluator_validation": evaluator_validation,
    "evaluator_test": evaluator_test,
    "output_folder_path": output_folder_path,
}

# Pre-bind the shared settings so that only the recommender class has to be
# supplied at call time.
runHyperparameterSearch_Collaborative_partial = partial(
    runHyperparameterSearch_Collaborative,
    **search_settings,
)

# Run the search for SLIM BPR.
runHyperparameterSearch_Collaborative_partial(SLIM_BPR_Cython)

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 186, 'epochs': 1500, 'symmetric': False, 'sgd_mode': 'adagrad', 'lambda_i': 0.004531243906793731, 'lambda_j': 0.0014629518890092457, 'learning_rate': 0.0020723218233632402}
SLIM_BPR_Recommender: URM Detected 319 ( 0.8%) users with no interactions.
SLIM_BPR_Recommender: URM Detected 3467 (12.4%) items with no interactions.
Processed 41629 (100.0%) in 1.45 sec. BPR loss is 1.42E-05. Sample per second: 28649
SLIM_BPR_Recommender: Epoch 1 of 1500. Elapsed time 0.48 sec
Processed 41629 (100.0%) in 0.78 sec. BPR loss is 3.67E-05. Sample per second: 53523
SLIM_BPR_Recommender: Epoch 2 of 1500. Elapsed time 0.80 sec
Processed 41629 (100.0%) in 1.02 sec. BPR loss is 5.10E-05. Sample per second: 40954
SLIM_BPR_Recommender: Epoch 3 of 1500. Elapsed time 1.04 sec
Processed 41629 (100.0%) in 0.20 sec. BPR loss is 6.67E-05. Sample per second: 207274
SLIM_BPR_Recommender: Epoch 4 of 1500. Elaps