In [1]:
import numpy as np
import matplotlib.pyplot as pyplot
import pandas as pd
import scipy.sparse as sps
%matplotlib inline
%load_ext Cython

from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Evaluation.Evaluator import EvaluatorHoldout
from Recommenders.MatrixFactorization.NMFRecommender import NMFRecommender
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Both CSVs are read from the working directory. When running on Kaggle the same
# files live under /kaggle/input/recommender-system-2023-challenge-polimi/ —
# point the two paths there instead.
data_train_path, data_target_user_path = "data_train.csv", "data_target_users_test.csv"

# Load the two files in order: training data first, then the target-user list.
data_train, data_target = (
    pd.read_csv(csv_path) for csv_path in (data_train_path, data_target_user_path)
)

In [3]:
# Number of user rows the final matrix must contain. The padding step below
# treats user ids as running from 1 to N_USERS inclusive.
N_USERS = 13024

# Dense table with one row per 'row' value and one column per 'col' value found in
# data_train; (row, col) pairs that never occur are filled with 0.
URM_frame = data_train.pivot(index='row', columns='col', values='data').fillna(0)

# Lookups between positional indices and the original identifiers.
# NOTE(review): user_map follows the order of data_target["user_id"], not the row
# order of the matrix built below — confirm this is the mapping callers expect.
item_map = dict(enumerate(URM_frame.columns))
user_map = dict(enumerate(data_target["user_id"]))
item_map_inv = {item: i for i, item in item_map.items()}
user_map_inv = {user: i for i, user in user_map.items()}

# Add an all-zero row for every id in 1..N_USERS that is absent from data_train.
# A single index union + reindex replaces the per-id scan of a freshly built list
# (quadratic) and keeps the frame float64 instead of going through an object-dtype
# filler frame. Ids already present are kept, and the rows end up sorted by id.
all_user_ids = URM_frame.index.union(pd.RangeIndex(1, N_USERS + 1))
URM_frame = URM_frame.reindex(all_user_ids, fill_value=0)

# Only non-zero entries are stored once the dense table is converted to CSR.
URM_all = sps.csr_matrix(URM_frame.to_numpy())

# Release the dense intermediate; only the sparse matrix is kept.
del URM_frame, all_user_ids
URM_all

<13024x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [4]:
# Number of independent train/test splits to create.
test_folds = 3

# One training matrix and one evaluator per split, stored at matching positions.
URM_trains = []
evaluator_tests = []
for i in range(test_folds):
    # Each call produces a fresh split of URM_all with train_percentage = 0.80.
    URM_train_fold, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)
    URM_trains.append(URM_train_fold)
    # The evaluator is built on this split's test part, with a single cutoff of 10.
    evaluator_tests.append(EvaluatorHoldout(URM_test, cutoff_list=[10]))

EvaluatorHoldout: Ignoring 2562 (19.7%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 2571 (19.7%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 2608 (20.0%) Users that have less than 1 test interactions


In [5]:
def train_evaluate(optuna_trial):
    """Optuna objective: average MAP of an NMFRecommender over all folds.

    Samples "factors" (integer in [1, 1000]) and "l1_ratio" (float in [0.0, 1.0])
    from the trial. For each fold index below ``test_folds`` it fits a new
    NMFRecommender on ``URM_trains[fold]`` and evaluates it with
    ``evaluator_tests[fold]``.

    Parameters
    ----------
    optuna_trial
        Trial object used to sample the two hyperparameters.

    Returns
    -------
    The sum of the per-fold MAP values divided by ``test_folds``.
    """
    num_factors = optuna_trial.suggest_int("factors", 1, 1000)
    l1 = optuna_trial.suggest_float("l1_ratio", 0.0, 1.0)

    fold_scores = []
    for fold in range(test_folds):
        recommender = NMFRecommender(URM_trains[fold], verbose=False)
        recommender.fit(num_factors=num_factors, l1_ratio=l1)
        metrics_df, _ = evaluator_tests[fold].evaluateRecommender(recommender)
        # Only the first entry of the "MAP" column is used.
        fold_scores.append(metrics_df["MAP"].values[0])

    return sum(fold_scores) / test_folds

In [6]:
# Search for the hyperparameters that maximize the value returned by train_evaluate.
n_trials = 500
study = optuna.create_study(direction="maximize")
study.optimize(func=train_evaluate, n_trials=n_trials)

[I 2023-12-11 21:56:00,187] A new study created in memory with name: no-name-411eace3-ed4a-42ec-b0be-1eb43fbfe7e8
  l1_ratio = optuna_trial.suggest_uniform("l1_ratio", 0.0, 1.0)


EvaluatorHoldout: Processed 10462 (100.0%) in 15.04 sec. Users per second: 696
EvaluatorHoldout: Processed 10453 (100.0%) in 17.44 sec. Users per second: 599


  numerator /= denominator
  W *= delta_W


EvaluatorHoldout: Processed 10416 (100.0%) in 11.12 sec. Users per second: 937


[I 2023-12-11 22:25:09,001] Trial 0 finished with value: 0.01581476115756993 and parameters: {'factors': 830, 'l1_ratio': 0.5642138470338769}. Best is trial 0 with value: 0.01581476115756993.
  l1_ratio = optuna_trial.suggest_uniform("l1_ratio", 0.0, 1.0)
