# Hyperparameter Optimization

### Importing the GitHub repository

In [1]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("token")

!git clone https://{token}@github.com/Benedart/RecSys-2022-Challenge-Polimi.git

Cloning into 'RecSys-2022-Challenge-Polimi'...
remote: Enumerating objects: 298, done.[K
remote: Counting objects: 100% (95/95), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 298 (delta 33), reused 91 (delta 29), pack-reused 203[K
Receiving objects: 100% (298/298), 331.29 MiB | 30.51 MiB/s, done.
Resolving deltas: 100% (68/68), done.
Updating files: 100% (225/225), done.


#### Compiling Cython files

In [2]:
import os

os.chdir( "./RecSys-2022-Challenge-Polimi")
!python run_compile_all_cython.py

run_compile_all_cython: Found 10 Cython files in 4 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/opt/conda/bin/python'
Compiling [1/10]: MatrixFactorizationImpressions_Cython_Epoch.pyx... 
In file included from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarraytypes.h:1969[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/opt/conda/lib/python3.7/site-packages/numpy/core/include/numpy/arrayobject.h:4[m[K,
                 from [01m[KMatrixFactorizationImpressions_Cython_Epoch.c:746[m[K:
      |  [01;35m[K^~~~~~~[m[K
[01m[KMatrixFactorizationImpressions_Cython_Epoch.c:[m[K In function ‘[01m[K__pyx_f_43MatrixFactorizationImpressions_Cython_Epoch_32MatrixFactorization_Cython_Epoch_sampleBPR_Cython[m[K’:
12758 |       [01;35m[K__pyx_t_4 = (__pyx_v_start_pos_impression_ite

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.sparse as sps
import csv

from Recommenders.Recommender_import_list import *

#### Importing URM & ICM

In [4]:
# Interactions table (one row per logged user/item event).
# NOTE(review): the recorded output of this cell ends with the context line of
# a warning; presumably a pandas DtypeWarning on a mixed-type column. Confirm.
URM_all = pd.read_csv(
    "/kaggle/input/recommender-system-2022-challenge-polimi/interactions_and_impressions.csv"
)

# The two item-feature tables shipped with the dataset.
ICM_type = pd.read_csv(
    "/kaggle/input/recommender-system-2022-challenge-polimi/data_ICM_type.csv"
)
ICM_length = pd.read_csv(
    "/kaggle/input/recommender-system-2022-challenge-polimi/data_ICM_length.csv"
)

# Stack both feature tables into a single ICM and order its rows by item.
ICM_stacked = pd.concat([ICM_type, ICM_length])
ICM_all = ICM_stacked.sort_values("item_id")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Preview the interactions table exactly as loaded from the CSV.
URM_all

Unnamed: 0,UserID,ItemID,Impressions,Data
0,0,11,012345678910111213141516171819,1
1,0,21,,0
2,0,21,,0
3,0,21,20212223242526272829,0
4,0,21,,1
...,...,...,...,...
5826501,41628,20448,,0
5826502,41628,20896,,1
5826503,41628,21506,,1
5826504,41628,22882,,0


In [6]:
# Preview the concatenated item-feature table, sorted by item_id.
ICM_all

Unnamed: 0,item_id,feature_id,data
0,0,1,1
0,0,0,1
1,1,0,1
1,1,3,1
2,2,4,1
...,...,...,...
23088,27965,1,1
23089,27966,0,1
23089,27966,1,1
23090,27967,4,1


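#### Getting rid of duplicates and empty indices
Let's start with the indices: the original user, item and feature IDs are remapped to contiguous 0-based indices. The item mapping is built from the IDs of both the URM and the ICM, so the two matrices index items consistently.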

In [7]:
# Each mapping is a Series indexed by the original ID whose value is the new
# 0-based index, assigned in order of first appearance.

# --- Users: taken from the URM ---
mapped_id, original_id = pd.factorize(URM_all["UserID"].unique())
user_original_Id_to_index = pd.Series(mapped_id, index=original_id)

print("Unique UserID in the URM are {}".format(len(original_id)))

# --- Items ---
# Only the count is needed here, so no throw-away factorization is performed.
print("Unique ItemID in the URM are {}".format(len(URM_all["ItemID"].unique())))

# The item mapping is built from the IDs found in the URM and in the ICM, so
# that both tables end up sharing the same item index space.
all_item_indices = pd.concat([URM_all["ItemID"], ICM_all["item_id"]], ignore_index=True)
mapped_id, original_id = pd.factorize(all_item_indices.unique())

print("Unique ItemID in the URM and ICM are {}".format(len(original_id)))

item_original_ID_to_index = pd.Series(mapped_id, index=original_id)

# --- Features: taken from the ICM ---
mapped_id, original_id = pd.factorize(ICM_all["feature_id"].unique())
feature_original_ID_to_index = pd.Series(mapped_id, index=original_id)

# The features come from the ICM, not from the URM: the message now says so.
print("Unique FeatureID in the ICM are {}".format(len(feature_original_ID_to_index)))

# Replace the original IDs with their new indices in both tables.
URM_all["UserID"] = URM_all["UserID"].map(user_original_Id_to_index)
URM_all["ItemID"] = URM_all["ItemID"].map(item_original_ID_to_index)
ICM_all["item_id"] = ICM_all["item_id"].map(item_original_ID_to_index)
ICM_all["feature_id"] = ICM_all["feature_id"].map(feature_original_ID_to_index)

Unique UserID in the URM are 41629
Unique ItemID in the URM are 24507
Unique ItemID in the URM and ICM are 27968
Unique FeatureID in the URM are 6


Then we remove the duplicate rows from both tables

In [8]:
# Keep one row per remaining column combination and give each of them
# Rating 1. `errors="ignore"` makes the cell safe to re-run: on a second
# execution the two columns are already gone and a plain drop would raise
# KeyError.
URM_all = (
    URM_all
    .drop(columns=["Impressions", "Data"], errors="ignore")
    .drop_duplicates()
    .assign(Rating=1)
)

# Remove repeated rows from the item-feature table as well.
ICM_all = ICM_all.drop_duplicates()

In [9]:
# Preview the interactions after ID remapping and de-duplication.
URM_all

Unnamed: 0,UserID,ItemID,Rating
0,0,0,1
1,0,1,1
13,0,2,1
28,0,3,1
29,0,4,1
...,...,...,...
5826501,41628,3700,1
5826502,41628,12694,1
5826503,41628,15132,1
5826504,41628,18128,1


In [10]:
# Preview the item-feature table after ID remapping and de-duplication.
ICM_all

Unnamed: 0,item_id,feature_id,data
0,3668,0,1
0,3668,1,1
1,1945,1,1
1,1945,2,1
2,18373,3,1
...,...,...,...
23088,27965,0,1
23089,27966,1,1
23089,27966,0,1
23090,27967,3,1


#### Creating the matrices

In [11]:
from sklearn.model_selection import train_test_split

seed = 42

# Matrix dimensions come straight from the sizes of the ID mappings.
n_users, n_items, n_features = (
    len(user_original_Id_to_index),
    len(item_original_ID_to_index),
    len(feature_original_ID_to_index),
)


def _interactions_to_csr(ratings, user_ids, item_ids):
    """Build an (n_users x n_items) CSR matrix from parallel rating/user/item sequences."""
    return sps.csr_matrix((ratings, (user_ids, item_ids)), shape=(n_users, n_items))


# Both splits hold out 20% of their input rows, shuffled with the same seed.
_split_options = dict(test_size=0.20, shuffle=True, random_state=seed)

# First split: (training + validation) versus test.
(user_ids_training_validation, user_ids_test,
 item_ids_training_validation, item_ids_test,
 ratings_training_validation, ratings_test) = train_test_split(
    URM_all.UserID, URM_all.ItemID, URM_all.Rating, **_split_options)

# Second split: training versus validation, taken from the first split's larger part.
(user_ids_training, user_ids_validation,
 item_ids_training, item_ids_validation,
 ratings_training, ratings_validation) = train_test_split(
    user_ids_training_validation,
    item_ids_training_validation,
    ratings_training_validation,
    **_split_options)

# From here on URM_all is the sparse matrix of all interactions, no longer a DataFrame.
URM_all = _interactions_to_csr(URM_all.Rating, URM_all.UserID, URM_all.ItemID)

URM_train = _interactions_to_csr(ratings_training, user_ids_training, item_ids_training)
URM_validation = _interactions_to_csr(ratings_validation, user_ids_validation, item_ids_validation)
URM_train_validation = _interactions_to_csr(ratings_training_validation,
                                            user_ids_training_validation,
                                            item_ids_training_validation)
URM_test = _interactions_to_csr(ratings_test, user_ids_test, item_ids_test)


# Item-feature matrix: one stored entry per (item, feature) row of the table.
_icm_item_rows = ICM_all["item_id"].values
_icm_feature_cols = ICM_all["feature_id"].values
ICM_all = sps.csr_matrix((np.ones(len(_icm_item_rows)), (_icm_item_rows, _icm_feature_cols)),
                         shape=(n_items, n_features))

# Force every stored entry to 1, whatever value the constructor produced.
ICM_all.data = np.ones_like(ICM_all.data)

# Row pointer differences in CSR give the number of stored features per item.
features_per_item = np.ediff1d(ICM_all.indptr)

# Column pointer differences in CSC give the number of stored items per feature.
_ICM_csc = ICM_all.tocsc()
items_per_feature = np.ediff1d(_ICM_csc.indptr)

# The final ICM is handed over in CSR format.
ICM_all = _ICM_csc.tocsr()

#### Setting up evaluators

In [12]:
from Evaluation.Evaluator import EvaluatorHoldout

# One hold-out evaluator per held-out matrix, both with a single cutoff of 10.
# They are built in the same order as before (validation first, then test) and
# each one receives its own cutoff list.
evaluator_validation, evaluator_test = (
    EvaluatorHoldout(held_out_URM, cutoff_list=[10])
    for held_out_URM in (URM_validation, URM_test)
)

EvaluatorHoldout: Ignoring 753 ( 1.8%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 323 ( 0.8%) Users that have less than 1 test interactions


#### Hyperparameter tuning

In [13]:
from HyperparameterTuning.run_hyperparameter_search import runHyperparameterSearch_Collaborative
from functools import partial

# Folder that receives the search results. The model-loading cell reads the
# best model back from here.
output_folder_path = "result_experiments/SKOPT_test/"

# Search budget: 10 cases in total, 30% of them passed as random starts.
n_cases = 10
n_random_starts = int(n_cases * 0.3)

# Every keyword below is forwarded unchanged to runHyperparameterSearch_Collaborative.
# Only the recommender class is left to be supplied at call time.
_search_settings = dict(
    URM_train=URM_train,
    URM_train_last_test=URM_train_validation,
    metric_to_optimize="MAP",
    cutoff_to_optimize=10,
    n_cases=n_cases,
    n_random_starts=n_random_starts,
    evaluator_validation_earlystopping=evaluator_validation,
    evaluator_validation=evaluator_validation,
    evaluator_test=evaluator_test,
    output_folder_path=output_folder_path,
    resume_from_saved=False,
)

runHyperparameterSearch_Collaborative_partial = partial(
    runHyperparameterSearch_Collaborative, **_search_settings
)

# Launch the search for the SLIM ElasticNet recommender.
runHyperparameterSearch_Collaborative_partial(SLIMElasticNetRecommender)

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 7691, 'l1_ratio': 0.014988465635340734, 'alpha': 0.8473177739464293}
SLIMElasticNetRecommender: URM Detected 3461 (12.4%) items with no interactions.
SLIMElasticNetRecommender: Processed 24583 (87.9%) in 5.00 min. Items per second: 81.90
SLIMElasticNetRecommender: Processed 25500 (91.2%) in 10.00 min. Items per second: 42.49
SLIMElasticNetRecommender: Processed 26420 (94.5%) in 15.01 min. Items per second: 29.34
SLIMElasticNetRecommender: Processed 27339 (97.8%) in 20.01 min. Items per second: 22.77
SLIMElasticNetRecommender: Processed 27968 (100.0%) in 23.45 min. Items per second: 19.88
EvaluatorHoldout: Processed 40876 (100.0%) in 16.89 sec. Users per second: 2420
SearchBayesianSkopt: New best config found. Config 0: {'topK': 7691, 'l1_ratio': 0.014988465635340734, 'alpha': 0.8473177739464293} - results: PRECISION: 0.0099912, PRECISION_RECALL_MIN_DEN: 0.0160444, RECALL: 0.01494

## Testing the optimized model
#### Importing the model

In [14]:
# Instantiate the recommender on the training + validation interactions. Its
# state is then loaded from the file written by the search, so no fit is run here.
recommender = SLIMElasticNetRecommender(URM_train_validation)

# The saved file is named "<RECOMMENDER_NAME>_best_model.zip" and lives in output_folder_path.
best_model_file_name = recommender.RECOMMENDER_NAME + "_best_model.zip"
recommender.load_model(output_folder_path, file_name=best_model_file_name)

SLIMElasticNetRecommender: URM Detected 3461 (12.4%) items with no interactions.
SLIMElasticNetRecommender: Loading model from file 'result_experiments/SKOPT_test/SLIMElasticNetRecommender_best_model.zip'
SLIMElasticNetRecommender: Loading complete


#### Evaluating the optimized model

In [15]:
# Score the reloaded model with the evaluator built on URM_test (cutoff 10).
# The bare call leaves the returned value as the cell output.
evaluator_test.evaluateRecommender(recommender)

EvaluatorHoldout: Processed 41306 (100.0%) in 44.19 sec. Users per second: 935


(       PRECISION PRECISION_RECALL_MIN_DEN    RECALL      MAP MAP_MIN_DEN  \
 cutoff                                                                     
 10      0.065894                 0.089312  0.077296  0.03389    0.045042   
 
              MRR      NDCG        F1  HIT_RATE ARHR_ALL_HITS  ...  \
 cutoff                                                        ...   
 10      0.208425  0.093417  0.071141  0.399264      0.263004  ...   
 
        COVERAGE_USER COVERAGE_USER_HIT USERS_IN_GT DIVERSITY_GINI  \
 cutoff                                                              
 10          0.992241          0.396166    0.992241       0.011478   
 
        SHANNON_ENTROPY RATIO_DIVERSITY_HERFINDAHL RATIO_DIVERSITY_GINI  \
 cutoff                                                                   
 10            8.244524                   0.990287              0.02579   
 
        RATIO_SHANNON_ENTROPY RATIO_AVERAGE_POPULARITY RATIO_NOVELTY  
 cutoff                                      