## XGBoot


In [1]:
import os 
import sys
while os.path.split(os.getcwd())[1] != 'RecSysChallenge2023-Team':
    os.chdir('..')
sys.path.insert(1, os.getcwd())

In [2]:
path_save= "Daniele/Recommenders/XGBoot/saved_models"
if not os.path.exists(path_save):
    os.makedirs(path_save)

In [3]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Evaluation.Evaluator import EvaluatorHoldout

import Daniele.Utils.MyDataManager as dm 
import Daniele.Utils.MatrixManipulation as mm
import Daniele.Utils.SaveSparceMatrix as ssm
import numpy as np 

URMv = dm.getURMviews()
URMo = dm.getURMopen()
ICMt=dm.getICMt()
ICMl=dm.getICMl()

name="urm_def.csv"
dir = os.path.join(path_save,name)
if not os.path.exists(dir):
    
    urm_def = mm.defaultExplicitURM(urmv=URMv,urmo=URMo,icml=ICMl,icmt=ICMt, normalize=True, add_aug=True,appendICM=True)
    name="urm_def.csv"
    dir = os.path.join(path_save,name)
    ssm.saveMatrix(dir,urm_def)

    urm_bin = mm.defaultExplicitURM(urmv=URMv,urmo=URMo, normalize=False, add_aug=True)
    urm_bin.data = np.ones(len(urm_bin.data))
    name="urm_bin.csv"
    dir = os.path.join(path_save,name)
    ssm.saveMatrix(dir,urm_bin)

else:
    name="urm_def.csv"
    dir = os.path.join(path_save,name)
    urm_def = ssm.readMatrix(dir)

    name="urm_bin.csv"
    dir = os.path.join(path_save,name)
    urm_bin = ssm.readMatrix(dir)

ICM =  mm.augmentedICM(dm.getICMt(),dm.getICMl())

/Users/daniele/Desktop/RecSys/RecSysChallenge2023-Team


In [4]:
#Cambio di nomenclatura

"""
URM_train = URMv_train
URM_test = URMv_test
URM_validation = URMv_validation
"""

URM_train, URM_validation = split_train_in_two_percentage_global_sample(urm_def, train_percentage = 0.8)

ICM_genres  = ICM



### Recommendations to use to train XGBoost


### RP3Beta

In [5]:
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
name="rp3beta"
dir = os.path.join(path_save,name)


rp3beta_recommender = RP3betaRecommender(urm_bin)
if not os.path.exists(dir+".zip"):
    rp3beta_recommender.fit(topK= 89, alpha= 0.6361002951626124, beta= 0.27432996564004203, normalize_similarity= True)
    rp3beta_recommender.save_model(path_save,name)
else:
    rp3beta_recommender.load_model(path_save,name)

RP3betaRecommender: URM Detected 3461 (12.4%) items with no interactions.
RP3betaRecommender: Loading model from file 'Daniele/Recommenders/XGBoot/saved_modelsrp3beta'
RP3betaRecommender: Loading complete


## Build the dataframe with the predictions

Creating dataframes can be quite computationally expensive. Never use "append" or iteratively concatenate new elements when you have to run a significant number of iterations, it will take forever. Initialize instead the dimension you want.

In [8]:
import pandas as pd
from tqdm import tqdm
import scipy.sparse as sps
import numpy as np
from xgboost import XGBRanker

n_users = dm.n_users
n_items = dm.n_items

training_dataframe = pd.DataFrame(index=range(0,n_users), columns = ["ItemID"])
training_dataframe.index.name='UserID'

In [9]:
#Hyper parameter !!!
cutoff = 40

for user_id in tqdm(range(n_users)):    
    recommendations = rp3beta_recommender.recommend(user_id, cutoff = cutoff)
    training_dataframe.loc[user_id, "ItemID"] = recommendations

training_dataframe = training_dataframe.explode("ItemID")

100%|██████████| 41629/41629 [00:11<00:00, 3549.87it/s]


### We should add the target column, which is the correct recommendation as contained in the validation data

In [10]:
URM_validation_coo = sps.coo_matrix(URM_validation)

correct_recommendations = pd.DataFrame({"UserID": URM_validation_coo.row,
                                        "ItemID": URM_validation_coo.col})
correct_recommendations

Unnamed: 0,UserID,ItemID
0,0,808
1,0,987
2,0,2218
3,0,2292
4,0,7301
...,...,...
483597,74391,26503
483598,74391,26984
483599,74391,27013
483600,74391,27452


In [11]:
training_dataframe = pd.merge(training_dataframe, correct_recommendations, on=['UserID','ItemID'], how='left', indicator='Exist')
training_dataframe

Unnamed: 0,UserID,ItemID,Exist
0,0,29,left_only
1,0,25,left_only
2,0,20,left_only
3,0,391,left_only
4,0,23,left_only
...,...,...,...
1665155,41628,1738,left_only
1665156,41628,14709,left_only
1665157,41628,23397,left_only
1665158,41628,10045,left_only


In [12]:
training_dataframe["Label"] = training_dataframe["Exist"] == "both"
training_dataframe.drop(columns = ['Exist'], inplace=True)
training_dataframe

Unnamed: 0,UserID,ItemID,Label
0,0,29,False
1,0,25,False
2,0,20,False
3,0,391,False
4,0,23,False
...,...,...,...
1665155,41628,1738,False
1665156,41628,14709,False
1665157,41628,23397,False
1665158,41628,10045,False


## Now let's add some features. For example, the prediction of other algorithms

This may take some time so it is a good idea to save this data and load it instead of calculating the scores every time.

###  KNN_CFCBF

In [13]:
from Daniele.Recommenders.KNN_CFCBF.ItemKNN_CFCBF_Hybrid_Recommender import KNN_CFCBF_custom
name="knn_cfcbf"
dir = os.path.join(path_save,name)

KNN_recommender = KNN_CFCBF_custom(URMv,URMo,ICM_train=mm.augmentedICM(ICMt,ICMl))
if not os.path.exists(dir+".zip"):
    KNN_recommender.fit(topK= 744, shrink= 457, similarity= 'cosine', normalize= True, feature_weighting='TF-IDF')
    KNN_recommender.save_model(path_save,name)
else:
    KNN_recommender.load_model(path_save,name)

KNN_CFCBF_custom: URM Detected 3461 (12.4%) items with no interactions.
KNN_CFCBF_custom: Loading model from file 'Daniele/Recommenders/XGBoot/saved_modelsknn_cfcbf'
KNN_CFCBF_custom: Loading complete


### RP3Beta - KNNCFCBF

In [14]:
from Recommenders.KNN.ItemKNNCustomSimilarityRecommender import ItemKNNCustomSimilarityRecommender
alpha_knn_rp3 = 0.7
name="rp3beta-knn"
dir = os.path.join(path_save,name)


knn_rp3_recommender = ItemKNNCustomSimilarityRecommender(urm_def)
if not os.path.exists(dir+".zip"):
    knn_rp3_recommender.fit((1 - alpha_knn_rp3) * KNN_recommender.W_sparse + alpha_knn_rp3* rp3beta_recommender.W_sparse)
    knn_rp3_recommender.save_model(path_save,name)
else:
    knn_rp3_recommender.load_model(path_save,name)

ItemKNNCustomSimilarityRecommender: URM Detected 2 ( 0.0%) users with no interactions.
ItemKNNCustomSimilarityRecommender: Loading model from file 'Daniele/Recommenders/XGBoot/saved_modelsrp3beta-knn'
ItemKNNCustomSimilarityRecommender: Loading complete


### SSLIM- BPR
Ottimizzato per utenti con poche interazioni


In [15]:
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
name="sslim01"
dir = os.path.join(path_save,name)

sslim_recommender = SLIM_BPR_Cython(URM_train=urm_def)
if not os.path.exists(dir+".zip"):
    #{'topK': 51, 'epochs': 15, 'symmetric': True, 'sgd_mode': 'adam', 'lambda_i': 1e-05, 'lambda_j': 0.003215687724797301, 'learning_rate': 0.007114410195895492}
    sslim_recommender.fit(topK= 51, epochs=15,symmetric=True, sgd_mode = 'adam', lambda_i = 1e-05, lambda_j=0.003215687724797301, learning_rate = 0.007114410195895492)
    sslim_recommender.save_model(path_save,name)
else:
    sslim_recommender.load_model(path_save,name)

SLIM_BPR_Recommender: URM Detected 2 ( 0.0%) users with no interactions.
SLIM_BPR_Recommender: Loading model from file 'Daniele/Recommenders/XGBoot/saved_modelssslim01'
SLIM_BPR_Recommender: Loading complete


### TopPop

In [16]:
from Recommenders.NonPersonalizedRecommender import TopPop
name="TopPop"
dir = os.path.join(path_save,name)

most_viewed = TopPop(urm_bin)
if not os.path.exists(dir+".zip"):
    most_viewed.fit()
    most_viewed.save_model(path_save,name)
else:
    most_viewed.load_model(path_save,name)

TopPopRecommender: URM Detected 3461 (12.4%) items with no interactions.
TopPopRecommender: Loading model from file 'Daniele/Recommenders/XGBoot/saved_modelsTopPop'
TopPopRecommender: Loading complete


In [17]:

other_algorithms = {
    "KNN_recommender": KNN_recommender,
    "knn_rp3_recommender": knn_rp3_recommender,
    "SLIM_BPR": sslim_recommender,
    "SLIM_BPR": sslim_recommender,
    "TopPop": most_viewed,
}


In [18]:
training_dataframe = training_dataframe.set_index('UserID')

for user_id in tqdm(range(1000)):       
    for rec_label, rec_instance in other_algorithms.items():
        
        item_list = training_dataframe.loc[user_id, "ItemID"].values.tolist()
        
        all_item_scores = rec_instance._compute_item_score([user_id], items_to_compute = item_list)

        training_dataframe.loc[user_id, rec_label] = all_item_scores[0, item_list] 

training_dataframe = training_dataframe.reset_index()
training_dataframe = training_dataframe.rename(columns = {"index": "UserID"})
training_dataframe

100%|██████████| 1000/1000 [00:01<00:00, 623.18it/s]


Unnamed: 0,UserID,ItemID,Label,KNN_recommender,knn_rp3_recommender,SLIM_BPR,TopPop
0,0,29,False,14.357360,2.473777,67.270805,5907.0
1,0,25,False,13.556995,2.533153,57.762074,6540.0
2,0,20,False,15.192987,2.902956,73.059799,7887.0
3,0,391,False,2.542434,4.446660,28.556477,552.0
4,0,23,False,14.649439,2.611018,53.351173,6743.0
...,...,...,...,...,...,...,...
1665155,41628,1738,False,,,,
1665156,41628,14709,False,,,,
1665157,41628,23397,False,,,,
1665158,41628,10045,False,,,,


## Or the profile length URMv and URMo

In [19]:
n_views = np.ediff1d(sps.csr_matrix(URMv).indptr)
n_opens = np.ediff1d(sps.csr_matrix(URMo).indptr)

training_dataframe['n_views'] = n_views[training_dataframe["UserID"].values.astype(int)]
training_dataframe['n_opens'] = n_opens[training_dataframe["UserID"].values.astype(int)]
training_dataframe

Unnamed: 0,UserID,ItemID,Label,KNN_recommender,knn_rp3_recommender,SLIM_BPR,TopPop,n_views,n_opens
0,0,29,False,14.357360,2.473777,67.270805,5907.0,12,58
1,0,25,False,13.556995,2.533153,57.762074,6540.0,12,58
2,0,20,False,15.192987,2.902956,73.059799,7887.0,12,58
3,0,391,False,2.542434,4.446660,28.556477,552.0,12,58
4,0,23,False,14.649439,2.611018,53.351173,6743.0,12,58
...,...,...,...,...,...,...,...,...,...
1665155,41628,1738,False,,,,,15,8
1665156,41628,14709,False,,,,,15,8
1665157,41628,23397,False,,,,,15,8
1665158,41628,10045,False,,,,,15,8


## The same can be done with item features

It is better in this case to first create a sparse matrix replicating the rows needed and ten transform it into a sparse dataframe

**WARNING** dataframes are not sparse structures and this may cause the memory requirements to explode

In [20]:
features_df = pd.DataFrame.sparse.from_spmatrix(ICM_genres)
features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0,1,0,0,0,1,0
3,1,0,0,0,0,0,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27963,1,0,0,0,0,0,0,0,1,0,0,0,0
27964,0,0,0,1,0,0,0,0,1,0,0,0,0
27965,1,0,0,0,0,0,0,0,1,0,0,0,0
27966,1,0,0,0,0,0,0,0,1,0,0,0,0


In [21]:
training_dataframe = training_dataframe.set_index('ItemID').join(features_df, how='inner')
training_dataframe = training_dataframe.reset_index()
training_dataframe = training_dataframe.rename(columns = {"index": "ItemID"})
training_dataframe

  return Index(sequences[0], name=names)


Unnamed: 0,ItemID,UserID,Label,KNN_recommender,knn_rp3_recommender,SLIM_BPR,TopPop,n_views,n_opens,0,...,3,4,5,6,7,8,9,10,11,12
0,0,1691,False,,,,,10,7,1,...,0,0,0,0,0,1,0,0,0,0
1,0,10360,False,,,,,14,6,1,...,0,0,0,0,0,1,0,0,0,0
2,0,10879,False,,,,,13,4,1,...,0,0,0,0,0,1,0,0,0,0
3,0,25372,False,,,,,12,3,1,...,0,0,0,0,0,1,0,0,0,0
4,0,25850,False,,,,,7,10,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1665155,24506,35355,False,,,,,14,15,1,...,0,0,0,0,0,1,0,0,0,0
1665156,24506,36088,False,,,,,6,7,1,...,0,0,0,0,0,1,0,0,0,0
1665157,24506,36097,False,,,,,11,5,1,...,0,0,0,0,0,1,0,0,0,0
1665158,24506,39542,False,,,,,12,4,1,...,0,0,0,0,0,1,0,0,0,0


## Then you can train XGBoost to rerank those prediction using as lable whether they should be recommended or not

In [22]:
training_dataframe = training_dataframe.sort_values("UserID").reset_index()
training_dataframe.drop(columns = ['index'], inplace=True)
training_dataframe

Unnamed: 0,ItemID,UserID,Label,KNN_recommender,knn_rp3_recommender,SLIM_BPR,TopPop,n_views,n_opens,0,...,3,4,5,6,7,8,9,10,11,12
0,4987,0,False,1.832254,2.780736,3.386995,268.0,12,58,0,...,0,0,0,0,1,0,1,0,0,0
1,18484,0,False,4.805912,2.343379,16.118126,2079.0,12,58,0,...,0,0,0,0,0,1,0,0,0,0
2,353,0,False,12.909456,2.078973,50.712906,2913.0,12,58,1,...,0,0,0,0,0,1,0,0,0,0
3,29,0,False,14.357360,2.473777,67.270805,5907.0,12,58,0,...,0,0,0,0,0,1,0,0,0,0
4,16887,0,False,2.512191,2.736353,10.388037,1494.0,12,58,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1665155,22547,41628,False,,,,,15,8,1,...,0,0,0,0,0,1,0,0,0,0
1665156,2904,41628,False,,,,,15,8,1,...,0,0,0,0,0,1,0,0,0,0
1665157,4464,41628,False,,,,,15,8,0,...,0,0,0,0,0,1,0,0,0,0
1665158,11920,41628,False,,,,,15,8,0,...,0,0,0,0,1,0,0,1,0,0


### To use the ranker one first needs to specify the size of the groups, a group is the dimension you rank on, in this case each group corresponds to a user. Since we have generated a fixed number of candidates for each user (30) all groups have the same length.

In [23]:
from xgboost import XGBRanker

In [24]:
n_estimators = 50
learning_rate = 1e-1
reg_alpha = 1e-1
reg_lambda = 1e-1
max_depth = 5
max_leaves = 0
grow_policy = "depthwise"
objective = "pairwise"
booster = "gbtree"
use_user_profile = False
random_seed = None

XGB_model = XGBRanker(objective='rank:{}'.format(objective),
                      n_estimators = int(n_estimators),
                      random_state = random_seed,
                      learning_rate = learning_rate,
                      reg_alpha = reg_alpha,
                      reg_lambda = reg_lambda,
                      max_depth = int(max_depth),
                      max_leaves = int(max_leaves),
                      grow_policy = grow_policy,
                      verbosity = 0, # 2 if self.verbose else 0,
                      booster = booster,
                      )

In [25]:
from sklearn.model_selection import train_test_split
import xgboost as xgb


y_train = training_dataframe["Label"]
X_train = training_dataframe.drop(columns=["Label"])


In [26]:
X_train,x_test, y_train, y_test = train_test_split(
                                X_train, y_train, train_size=0.8)

In [27]:
groups = X_train.groupby("UserID").size().values
groups

array([27, 30, 37, ..., 30, 30, 28])

In [28]:
XGB_model.fit(X_train,
          y_train,
          group=groups,
          verbose=True)

### Once the model is trained we can use it to compute predictions. Each prediction will refer to a specific user-item pair, which we will ten need to rank as we do in any other recommender model.

### Important: In order to use this model to predict the score of new datapoints (i.e., new recommendations) we have to repeat the same data processing steps but:
- We do not need a train-label split, we can user all the data we have to compute the predictions and the features
- The recommendation models used to generate the scores should be trained on all the available data

In [29]:
XGB_model.predict(X_train)

array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5], dtype=float32)

### We can have a look to the feature importance to assess which are the most informative ones

In [30]:
%matplotlib inline
from xgboost import plot_importance
plot_importance(XGB_model, importance_type='weight', title='Weight (Frequence)')

ValueError: Booster.get_score() results in empty.  This maybe caused by having all trees as decision dumps.

### Note, here ItemID and UserID are provided as integers, meaning that XGBoost will use them as any integer number and may split user groups according to whether their ID is < or > of a certain value. This makes no sense of course because the IDs are not ordinal, they are categorical and the specific numerical value of an IDs has no relation with the semantics of the problem. 

How to address this? 
- Use one-hot-encoded values -> drawback, the number of columns becomes very large
- Use the native "Categorical" data type -> drawback, it is still experimental and may not work very well
- Use another representation of the IDs, such as target encoding -> drawback, some further processing is needed and no teaching material is provided on this

### How to perform hyperparameter tuning?
The issue with this method is that you need a label which should be an item the user has not interacted with but that is a correct recommendation. In practice the idea is:
- Split the data in the usual training-validation-test
- Split the training data in two: one part you use to train the recommenders and another you use as the hidden Label to train XGBoost
- Evaluate your predictions on the validation data as you did for any other recommender model. Use this to select the optimal hyperparameters.
- Given the selected hyperparameters, train the recommender models on all the available data and use all the available data to compute the features used by XGBoost.

Challenge: Since the label we use for training XGBoost is the split of a split, it may happen that the actual correct recommendations are very few. This will result in a problem that is very unbalanced towards zero and will make the training difficult and the evaluation noisy. To mitigate this you may use k-fold cross validation and define the valdation result of a certain hyperparameter configuration as the average obtained with k different training-label splits.

In [None]:
from sklearn.metrics import precision_score

preds = XGB_model.predict(x_test)
best_preds = np.asarray([np.argmax(line) for line in preds])
print("Precision: {:.2f} %".format(precision_score(y_test, best_preds, average='macro')))

Precision: 1.00 %


array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5], dtype=float32)