# Hybrid Score combination: RP3beta + UserKNN

## Import

In [3]:
## Allow more than one output for a single code cell.
## With 'all', every expression statement in a cell is displayed, not only the
## last one. Later cells in this notebook rely on this to render DataFrames
## written as bare expressions inside a for loop.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import pandas as pd
import scipy.sparse as sps
import numpy as np
import os

from skopt.space import Real, Integer, Categorical

## Set the numpy random seed
## (np.random.seed seeds NumPy's global generator only; other libraries are
## not seeded here)
SEED = 42
np.random.seed(SEED)

## Show the working directory: the relative paths used later in this notebook
## (e.g. "result_experiments/...") are resolved against it.
os.getcwd()

'/home/jupyter/RecSysChallenge2021'

In [1]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

from Evaluation.Evaluator import EvaluatorHoldout

from Recommenders.Recommender_import_list import *

from Recommenders.DataIO import DataIO

In [4]:
## Utility Functions
from Dataset.load_data import load_data
from Dataset.write_submission import write_submission
from Dataset.load_test_user_array import load_test_user_array

## Data Loading and Split

In [5]:
# Load the interaction matrix (URM_all) and a dictionary of item-content
# matrices (ICM_dict); the latter is indexed by name further down
# (e.g. 'ICM_subgenre', 'ICM_channel').
# NOTE(review): presumably both are scipy sparse matrices with users/items on
# the rows -- confirm against Dataset.load_data.
URM_all, ICM_dict = load_data()

In [6]:
# Split the interactions: train_percentage = 0.80 goes to URM_train, the
# remainder is returned as URM_validation.
URM_train, URM_validation = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage = 0.80,
)

# Augmented training matrix: the training interactions with the transposed
# sub-genre and channel content matrices stacked underneath as extra rows.
# The genre and event matrices are currently left out (kept below, commented).
URM_aug_train = sps.vstack(
    blocks=[
        URM_train.copy().tocoo(),
        #ICM_dict['ICM_genre'].T.tocoo(),
        ICM_dict['ICM_subgenre'].T.tocoo(),
        #ICM_dict['ICM_event'].T.tocoo(),
        ICM_dict['ICM_channel'].T.tocoo(),
    ],
    format='csr',
)

# Validation evaluator at cutoff 10, the same cutoff the hyperparameter search
# optimizes (cutoff_to_optimize).
evaluator_validation = EvaluatorHoldout(
    URM_validation,
    cutoff_list=[10],
    exclude_seen = True,
)

EvaluatorHoldout: Ignoring 13646 ( 0.0%) Users that have less than 1 test interactions


In [7]:
# Array of user IDs returned by the project's load_test_user_array helper.
# NOTE(review): presumably the target users for the final submission --
# confirm against Dataset.load_test_user_array.
test_UserID_array = load_test_user_array()

## Optimization

In [16]:
# Inspect the saved hyperparameter-search results of UserKNNCF for every
# similarity and keep the configuration with the highest validation MAP.
#
# NOTE: the DataFrames written as bare expressions inside the loop are rendered
# only because InteractiveShell.ast_node_interactivity is set to 'all' at the
# top of the notebook.
output_folder_path = "result_experiments/UserKNN_CF_NO_AUG/"

# Configurations whose validation MAP exceeds this value are listed, and the
# topK range they span is reported.
threshold = 0.23

# Best configuration seen so far across all similarities. The MAP is tracked as
# a scalar so the comparison below is a real boolean test (the previous
# `if [Series < value]:` tested a one-element list, which is always truthy,
# so the last similarity always won regardless of its score).
best_map = float('-inf')
best_result = pd.DataFrame(columns=['MAP'])

for similarity in ['cosine', 'asymmetric', 'dice', 'jaccard', 'tversky']:

    print('Results for ' + similarity + ' similarity:')

    data_loader = DataIO(folder_path = output_folder_path)
    search_metadata = data_loader.load_data(UserKNNCFRecommender.RECOMMENDER_NAME + "_" + similarity + "_metadata.zip")

    hyperparameters_df = search_metadata["hyperparameters_df"]
    result_on_validation_df = search_metadata["result_on_validation_df"]

    # Attach the validation MAP of each search case to its hyperparameters
    # (positional match: the values are copied as a plain list).
    hyperparameters_df['MAP'] = result_on_validation_df['MAP'].values.tolist()

    # Configurations above the threshold and the topK range they cover.
    above_threshold_df = hyperparameters_df[hyperparameters_df['MAP'] > threshold]
    above_threshold_df
    if above_threshold_df.empty:
        # Nothing to take a max/min of; report it instead of raising.
        print('No configuration with MAP above', threshold)
    else:
        print('Max value for the range: ', above_threshold_df.topK.max())
        print('Min value for the range: ', above_threshold_df.topK.min())

    # Best configuration(s) for this similarity.
    similarity_best_map = hyperparameters_df['MAP'].max()
    print('Best MAP: ', similarity_best_map)
    similarity_best_df = hyperparameters_df[hyperparameters_df['MAP'] == similarity_best_map]
    similarity_best_df

    # Keep the overall winner; on ties the earlier similarity is retained.
    # A NaN MAP (no valid case) never compares greater, so it is skipped.
    if similarity_best_map > best_map:
        best_map = similarity_best_map
        best_result = similarity_best_df

print('Best result:')
best_result

Results for cosine similarity:


Unnamed: 0,topK,shrink,similarity,normalize,feature_weighting,URM_bias,MAP
5,806,729,cosine,True,TF-IDF,33.436988,0.230919
10,564,10,cosine,True,none,54.303728,0.231717
12,729,257,cosine,True,TF-IDF,16.311847,0.231319
14,689,712,cosine,True,TF-IDF,44.114179,0.231542
22,671,990,cosine,True,none,1000.0,0.231251
26,370,823,cosine,True,none,243.012356,0.232201
29,461,1000,cosine,True,none,1000.0,0.231888
34,387,0,cosine,True,BM25,1000.0,0.232887
35,229,1000,cosine,True,BM25,143.228916,0.231135
36,812,0,cosine,True,none,0.927486,0.230498


Max value for the range:  879
Min value for the range:  229
Best MAP:  0.23288738894355918


Unnamed: 0,topK,shrink,similarity,normalize,feature_weighting,URM_bias,MAP
34,387,0,cosine,True,BM25,1000.0,0.232887


Results for asymmetric similarity:


Unnamed: 0,topK,shrink,similarity,normalize,asymmetric_alpha,feature_weighting,URM_bias,MAP
16,532,949,asymmetric,True,0.566227,TF-IDF,1000.0,0.232479
17,664,1000,asymmetric,True,0.456479,TF-IDF,1000.0,0.231299
18,418,1000,asymmetric,True,0.410963,TF-IDF,1000.0,0.231781
25,581,840,asymmetric,True,0.501278,TF-IDF,429.912491,0.231939
28,572,1000,asymmetric,True,0.500272,TF-IDF,1000.0,0.232163
29,590,1000,asymmetric,True,0.495382,TF-IDF,1000.0,0.231749
30,591,1000,asymmetric,True,0.495866,TF-IDF,1000.0,0.231884
33,1000,0,asymmetric,True,0.617255,TF-IDF,1000.0,0.230136
34,1000,0,asymmetric,True,0.610541,TF-IDF,1000.0,0.230067
35,1000,0,asymmetric,True,0.617007,TF-IDF,1000.0,0.230093


Max value for the range:  1000
Min value for the range:  418
Best MAP:  0.23247857968728577


Unnamed: 0,topK,shrink,similarity,normalize,asymmetric_alpha,feature_weighting,URM_bias,MAP
16,532,949,asymmetric,True,0.566227,TF-IDF,1000.0,0.232479


Results for dice similarity:


Unnamed: 0,topK,shrink,similarity,normalize,MAP
3,628,47,dice,True,0.230398
4,662,134,dice,False,0.230748
12,603,24,dice,True,0.2304
20,337,0,dice,False,0.230939
21,416,0,dice,True,0.230827
24,229,0,dice,False,0.230313
30,504,9,dice,False,0.230316
37,283,0,dice,True,0.230919
39,375,0,dice,True,0.231209
45,403,122,dice,False,0.231


Max value for the range:  662
Min value for the range:  229
Best MAP:  0.23120927058106353


Unnamed: 0,topK,shrink,similarity,normalize,MAP
39,375,0,dice,True,0.231209


Results for jaccard similarity:


Unnamed: 0,topK,shrink,similarity,normalize,MAP
1,560,178,jaccard,True,0.23075
7,624,86,jaccard,False,0.231409
15,366,0,jaccard,True,0.231986
19,465,0,jaccard,True,0.231891
21,534,0,jaccard,False,0.230697
23,275,0,jaccard,True,0.231995
27,198,0,jaccard,True,0.230365
29,417,0,jaccard,False,0.231918
30,596,0,jaccard,True,0.230387
31,321,0,jaccard,False,0.231891


Max value for the range:  624
Min value for the range:  198
Best MAP:  0.23254708339440167


Unnamed: 0,topK,shrink,similarity,normalize,MAP
38,426,78,jaccard,True,0.232547


Results for tversky similarity:


Unnamed: 0,topK,shrink,similarity,normalize,tversky_alpha,tversky_beta,MAP
0,646,135,tversky,True,1.380433,1.408946,0.231402
15,399,0,tversky,True,2.0,1.145593,0.233824
17,543,21,tversky,True,2.0,2.0,0.231281
21,691,115,tversky,True,1.742549,2.0,0.230556
23,304,11,tversky,True,0.982927,1.127706,0.231501
26,408,216,tversky,True,1.983017,1.384008,0.232193
31,298,0,tversky,True,1.506569,2.0,0.231088
33,572,0,tversky,True,2.0,1.185875,0.232619
35,218,0,tversky,True,1.507152,1.342878,0.231706
36,382,0,tversky,True,1.468715,1.265568,0.233109


Max value for the range:  691
Min value for the range:  218
Best MAP:  0.233823581071492


Unnamed: 0,topK,shrink,similarity,normalize,tversky_alpha,tversky_beta,MAP
15,399,0,tversky,True,2.0,1.145593,0.233824


Best result:


Unnamed: 0,topK,shrink,similarity,normalize,tversky_alpha,tversky_beta,MAP
15,399,0,tversky,True,2.0,1.145593,0.233824


In [8]:
# Folder where the hyperparameter search stores its results.
output_folder_path = "result_experiments/UserKNN_CF_NO_AUG/"

# Create the directory (and any missing parents) if it does not exist yet.
# exist_ok=True makes this a single idempotent call, with no gap between
# checking for the directory and creating it.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget: total number of cases, of which the first 30% are random.
n_cases = 50  # 50 with 30% random is a good number
n_random_starts = int(n_cases*0.3)

# Metric and cutoff the search optimizes on the validation split.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [None]:
from functools import partial
import os, multiprocessing

from HyperparameterTuning.run_hyperparameter_search import runHyperparameterSearch_Collaborative

# Run the hyperparameter search for UserKNNCF on the non-augmented training
# matrix, writing its results into output_folder_path.
runHyperparameterSearch_Collaborative(UserKNNCFRecommender,
                                      # Training data; no separate "last test" matrix is supplied.
                                      URM_train = URM_train,
                                      URM_train_last_test = None,
                                      # Objective: MAP at cutoff 10 (set in the configuration cell).
                                      metric_to_optimize = metric_to_optimize,
                                      cutoff_to_optimize = cutoff_to_optimize,
                                      # Budget: n_cases in total, n_random_starts of them random.
                                      n_cases = n_cases,
                                      n_random_starts = n_random_starts,
                                      # The same holdout evaluator is used for early stopping and
                                      # for validation; no test evaluator is provided.
                                      evaluator_validation_earlystopping = evaluator_validation,
                                      evaluator_validation = evaluator_validation,
                                      evaluator_test = None,
                                      output_folder_path = output_folder_path,
                                      # NOTE(review): presumably continues from results already
                                      # saved in output_folder_path -- confirm.
                                      resume_from_saved = True,
                                      # NOTE(review): presumably None selects the search's default
                                      # similarity list; the analysis cell expects metadata files
                                      # for cosine, asymmetric, dice, jaccard and tversky -- confirm.
                                      similarity_type_list = None,
                                      parallelizeKNN = True,
                                      # NOTE(review): the saved cosine/asymmetric results contain
                                      # feature_weighting and URM_bias columns, presumably enabled
                                      # by these two flags -- TODO confirm.
                                      allow_weighting = True,
                                      allow_bias_URM=True)