## Import

In [1]:
## Allow more than one output for a single code cell: with 'all', IPython
## displays the value of every top-level expression, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import pandas as pd
import scipy.sparse as sps
import numpy as np
import os

from skopt.space import Real, Integer, Categorical

## Fix numpy's global random seed so numpy-based randomness is repeatable across runs
SEED = 42
np.random.seed(SEED)

## Show the working directory; relative paths such as "./Dataset" resolve against it
os.getcwd()

'/home/jupyter/RecSysChallenge2021'

In [3]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

from Evaluation.Evaluator import EvaluatorHoldout

from Recommenders.Recommender_import_list import *

Tensorflow is not available


In [4]:
## Utility Functions
from Dataset.load_data import load_data
from Dataset.write_submission import write_submission
from Dataset.load_test_user_array import load_test_user_array

## Data Loading and Split

In [5]:
DATA_FILE_PATH = "./Dataset"

# ---------------------------------------------------------------------------
# Interactions: one (UserID, ItemID, Interaction) triple per CSV row
# ---------------------------------------------------------------------------
URM_PATH = os.path.join(DATA_FILE_PATH, "data_train.csv")

URM_all_dataframe = pd.read_csv(
    filepath_or_buffer=URM_PATH,
    sep=",",
    dtype={0: int, 1: int, 2: int},
    engine='python',
)
URM_all_dataframe.columns = ["UserID", "ItemID", "Interaction"]

# Distinct identifiers seen in the interactions and their counts.
userID_unique = URM_all_dataframe["UserID"].unique()
itemID_unique = URM_all_dataframe["ItemID"].unique()
n_users, n_items = len(userID_unique), len(itemID_unique)

# ---------------------------------------------------------------------------
# Item content: one (ItemID, EventID, Feature) triple per CSV row
# ---------------------------------------------------------------------------
ICM_event_PATH = os.path.join(DATA_FILE_PATH, "data_ICM_event.csv")

ICM_event_dataframe = pd.read_csv(
    filepath_or_buffer=ICM_event_PATH,
    sep=",",
    dtype={0: int, 1: int, 2: int},
    engine='python',
)
ICM_event_dataframe.columns = ["ItemID", "EventID", "Feature"]

# The column count is the largest EventID + 1 (not the number of distinct
# IDs) because the file is an already-processed matrix.
n_event_features = max(ICM_event_dataframe["EventID"]) + 1

# Binary item-by-event matrix: the "Feature" column is not read, every
# (ItemID, EventID) pair simply gets a 1.
# NOTE(review): the row count is the number of distinct ItemIDs found in the
# interactions, which assumes item IDs run from 0 to n_items - 1 -- confirm.
event_item_ids = ICM_event_dataframe["ItemID"].values
event_ids = ICM_event_dataframe["EventID"].values
ICM_event = sps.csr_matrix(
    (np.ones(len(event_item_ids)), (event_item_ids, event_ids)),
    shape=(n_items, n_event_features),
)

# csr_matrix sums repeated (row, col) pairs when built from coordinates, so
# force every stored value back to 1 to keep the matrix binary.
ICM_event.data = np.ones_like(ICM_event.data)

In [6]:
# Load the interaction matrix and the collection of item-content matrices through the project helper
# (presumably a dict keyed by ICM name -- verify against Dataset/load_data)
URM_all, ICM_dict = load_data()
# Store the ICM_event matrix built in the previous cell under the 'ICM_event' key
ICM_dict['ICM_event'] = ICM_event

In [7]:
# Hold out part of the interactions for validation
# (train_percentage=0.80, presumably 80% kept for training -- confirm in the split helper).
URM_train, URM_validation = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage=0.80,
)

# Augmented training matrix: the transposed subgenre and channel ICMs are
# stacked underneath URM_train, so each feature becomes one extra row.
# The genre and event ICMs are currently left out.
stacked_blocks = [
    URM_train.copy().tocoo(),
    # ICM_dict['ICM_genre'].T.tocoo(),
    ICM_dict['ICM_subgenre'].T.tocoo(),
    # ICM_dict['ICM_event'].T.tocoo(),
    ICM_dict['ICM_channel'].T.tocoo(),
]
URM_aug_train = sps.vstack(stacked_blocks, format='csr')

# Evaluator over the held-out interactions, configured with a single cutoff of 10.
evaluator_validation = EvaluatorHoldout(
    URM_validation,
    cutoff_list=[10],
    exclude_seen=True,
)

EvaluatorHoldout: Ignoring 13646 ( 0.0%) Users that have less than 1 test interactions


In [8]:
# Load the array of target user IDs through the project helper
# (presumably the users a submission must cover -- verify against Dataset/load_test_user_array)
test_UserID_array = load_test_user_array()

## Optimization

In [9]:
# Folder handed to the hyperparameter search as its output location and
# read back later when the search results are inspected.
output_folder_path = "result_experiments/ItemCBF_event_noPreprocessed/"

# Create the folder (and any missing parents). exist_ok=True makes this a
# single call that is safe to re-run, instead of a check-then-create pair
# that can fail if the folder appears between the check and the creation.
os.makedirs(output_folder_path, exist_ok=True)

# Search budget: total number of configurations, 30% of which are random starts.
n_cases = 50  # 50 with 30% random is a good number
n_random_starts = int(n_cases*0.3)

# Objective of the search: MAP at cutoff 10.
metric_to_optimize = "MAP"
cutoff_to_optimize = 10

In [11]:
from functools import partial
import os, multiprocessing

from HyperparameterTuning.run_hyperparameter_search import runHyperparameterSearch_Collaborative, runHyperparameterSearch_Content

# Content-based KNN search over the event ICM.
#
# This cell previously called runHyperparameterSearch_Collaborative with
# MultVAERecommender while still passing the content-based arguments
# (ICM_name, ICM_object, allow_bias_ICM). The recorded output of this cell and
# the result-loading cell that follows both refer to
# 'ItemKNNCBFRecommender_ICM_event_<similarity>' files, so the content-based
# entry point and recommender are restored here to match them.
# NOTE(review): confirm the keyword names against runHyperparameterSearch_Content
# in HyperparameterTuning/run_hyperparameter_search.py.
runHyperparameterSearch_Content(ItemKNNCBFRecommender,
                                URM_train = URM_train,
                                URM_train_last_test = None,      # no final refit on train+validation
                                metric_to_optimize = metric_to_optimize,
                                cutoff_to_optimize = cutoff_to_optimize,
                                evaluator_validation = evaluator_validation,
                                evaluator_test = None,           # no separate test evaluation
                                output_folder_path = output_folder_path,
                                parallelizeKNN = True,
                                allow_weighting = True,
                                allow_bias_ICM = True,
                                resume_from_saved = True,        # continue from files already in output_folder_path
                                similarity_type_list = None,     # None: leave the choice of similarities to the search function
                                ICM_name = 'ICM_event',          # used in the names of the result files
                                ICM_object = ICM_event.copy(),   # pass a copy so the search cannot alter ICM_event
                                n_cases = n_cases,
                                n_random_starts = n_random_starts
                               )

SearchBayesianSkopt: Resuming 'ItemKNNCBFRecommender_ICM_event_cosine' Failed, no such file exists.

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 6338, 'shrink': 24, 'similarity': 'cosine', 'normalize': True, 'feature_weighting': 'none', 'ICM_bias': 48.04236752064873}
SearchBayesianSkopt: Resuming 'ItemKNNCBFRecommender_ICM_event_jaccard' Failed, no such file exists.

ItemKNNCBFRecommender: ICM Detected 92 ( 0.5%) items with no features.
Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'topK': 37, 'shrink': 1060, 'similarity': 'jaccard', 'normalize': False}
SearchBayesianSkopt: Resuming 'ItemKNNCBFRecommender_ICM_event_asymmetric' Failed, no such file exists.

ItemKNNCBFRecommender: ICM Detected 92 ( 0.5%) items with no features.Iteration No: 1 started. Evaluating function at random point.

SearchBayesianSkopt: Testing config: {'topK': 5117, 'shrink': 1242, 'similarity': 'a

In [12]:
from Recommenders.DataIO import DataIO

recommender_class = ItemKNNCBFRecommender

# Reader bound to the folder the hyperparameter search wrote its results to.
data_loader = DataIO(folder_path = output_folder_path)

# Report the best validation MAP reached for each similarity. One metadata
# archive is loaded per similarity, in the same order as the original report.
for similarity_name in ("asymmetric", "cosine", "dice", "jaccard", "tversky"):
    search_metadata = data_loader.load_data(
        recommender_class.RECOMMENDER_NAME + "_ICM_event_" + similarity_name + "_metadata.zip"
    )
    result_on_validation_df = search_metadata["result_on_validation_df"]
    # Series.max() ignores NaN entries; the builtin max() over a Series gives
    # an order-dependent answer as soon as a NaN is present.
    print('Best ' + similarity_name + ' MAP: ', result_on_validation_df.MAP.max())

Best asymmetric MAP:  0.01728293831089527
Best cosine MAP:  0.014957613022247365
Best dice MAP:  0.00992955375027038
Best jaccard MAP:  0.009929196066525621
Best tversky MAP:  0.01718384246561002
