# XGBoost

### Importing github repository

In [1]:
# Fetch the GitHub access token from Kaggle's secret store so it is not hard-coded in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# Value of the secret named "token"; it is interpolated into the `git clone` URL in the next cell.
token = user_secrets.get_secret("token")

## Cloning the repository and compiling Cython

In [2]:
# Clone the project repository; IPython expands {token} from the Python variable `token`.
# NOTE(review): the token is embedded in the remote URL — presumably it is also stored in the
# clone's git configuration; confirm, and avoid echoing the URL in shared outputs.
!git clone https://FrancescoZanella:{token}@github.com/FrancescoZanella/RecSys.git
# Work from inside the clone so relative paths (e.g. run_compile_all_cython.py) resolve.
%cd RecSys

Cloning into 'RecSys'...
remote: Enumerating objects: 1876, done.[K
remote: Counting objects: 100% (198/198), done.[K
remote: Compressing objects: 100% (164/164), done.[K
remote: Total 1876 (delta 113), reused 79 (delta 31), pack-reused 1678[K
Receiving objects: 100% (1876/1876), 22.31 MiB | 20.20 MiB/s, done.
Resolving deltas: 100% (1010/1010), done.
/kaggle/working/RecSys


In [3]:
# Make the cloned repository importable (Utils, Evaluation, Recommenders, MyTuning, ...).
import sys
sys.path.append("/kaggle/working/RecSys")

In [4]:
!pip install Cython==0.29.23
!pip install nltk==3.6.1
!pip install nose==1.3.7
!pip install numpy>=1.19
! python run_compile_all_cython.py

Collecting Cython==0.29.23
  Downloading Cython-0.29.23-py2.py3-none-any.whl (978 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.0/978.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Cython
  Attempting uninstall: Cython
    Found existing installation: Cython 3.0.0
    Uninstalling Cython-3.0.0:
      Successfully uninstalled Cython-3.0.0
Successfully installed Cython-0.29.23
Collecting nltk==3.6.1
  Downloading nltk-3.6.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency 

## Importing libraries

In [5]:
import scipy.sparse as sps
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as pyplot
import csv
from datetime import datetime
import time
from tqdm import tqdm
import xgboost as xgb
from xgboost import XGBRanker
from xgboost import plot_importance

In [6]:
from Utils.seconds_to_biggest_unit import *
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from Recommenders.KNN.ItemKNNCustomSimilarityRecommender import ItemKNNCustomSimilarityRecommender
from Recommenders.GraphBased.P3alphaRecommender import P3alphaRecommender
from Recommenders.EASE_R.EASE_R_Recommender import EASE_R_Recommender
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
from Recommenders.NonPersonalizedRecommender import TopPop
from Recommenders.MatrixFactorization.IALSRecommender import IALSRecommender
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython

from MyTuning.Recall.best_pars_recall import *
from MyTuning.best_pars import *

## Importing URM

In [7]:
# Locations of the interaction log and of the users a submission must cover.
path = "/kaggle/working/RecSys/recsys1/data_train.csv"
path_target = "/kaggle/working/RecSys/recsys1/data_target_users_test.csv"

# pandas opens and closes the files itself when given a path, so no explicit
# open() is needed (the handles previously opened here were never used or closed).

# Load the interactions; the first CSV row is the header.
df = pd.read_csv(filepath_or_buffer=path,
                 header=0,
                 dtype={0:int, 1:int, 2:int},
                 sep=",",
                 engine='python')

# Load the target users (single column).
df_users = pd.read_csv(
    filepath_or_buffer = path_target,
    header=0,
    dtype={0: int},
    sep=",",
    engine="python",
)

df_users.columns = ["UserID"]

# Rename the columns to the names used throughout the notebook.
df.columns = ["UserID", "ItemID", "Interaction"]

# Map the original item IDs to contiguous indices 0..n_items-1.
# item_original_ID_to_index: index = original ID, value = matrix column index.
mapped_id, original_id = pd.factorize(df["ItemID"].unique())
item_original_ID_to_index = pd.Series(mapped_id, index=original_id)

# Same mapping for the users.
# NOTE: after this statement `original_id` holds the original *user* IDs;
# generate_submission_xgb reads that global, so do not reassign it afterwards.
mapped_id, original_id = pd.factorize(df["UserID"].unique())
user_original_ID_to_index = pd.Series(mapped_id, index=original_id)

# Replace the original IDs with the contiguous indices.
df["UserID"] = df["UserID"].map(user_original_ID_to_index)
df["ItemID"] = df["ItemID"].map(item_original_ID_to_index)

# User-rating matrix in COO format: rows are user indices, columns are item indices.
URM_all = sps.coo_matrix((df["Interaction"].values,
                          (df["UserID"].values, df["ItemID"].values)))

In [8]:
# Display the matrix summary (shape, dtype, number of stored interactions).
URM_all

<12638x22222 sparse matrix of type '<class 'numpy.int64'>'
	with 478730 stored elements in COOrdinate format>

## Creating the splits

In [9]:
# Hold out part of the interactions: train_percentage=0.85 is passed to the splitter, the rest
# goes to URM_test. URM_train trains the base recommenders; URM_test provides the ranker's labels.
# NOTE(review): no seed is passed here — presumably the split differs between runs; confirm.
URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.85)



In [10]:
def create_dataframe_with_label(URM_features,URM_label,other_algorithms,cutoff):
    """Build the labelled candidate table used to train the ranker.

    For every user (row of ``URM_features``) the recommender stored under
    ``other_algorithms["candidate_generator"]`` proposes ``cutoff`` items.
    Each (user, item) candidate becomes one row with:

    * ``Label``: True when the pair is a stored entry of ``URM_label``;
    * one column per remaining key of ``other_algorithms`` holding the score
      that recommender's ``_compute_item_score`` assigns to the candidate;
    * ``item_popularity``: number of stored entries in the item's column of
      ``URM_features``;
    * ``user_profile_len``: number of stored entries in the user's row of
      ``URM_features``.

    Parameters
    ----------
    URM_features : sparse matrix
        User x item matrix defining the users and the popularity features.
    URM_label : sparse matrix
        User x item matrix whose stored entries are the positive labels.
    other_algorithms : dict
        Maps a feature name to a fitted recommender; must contain the key
        ``"candidate_generator"``.
    cutoff : int
        Number of candidates requested per user.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, sorted by ``UserID``, with a fresh 0..n-1 index.
    """
    n_users_features = URM_features.shape[0]
    candidate_generator = other_algorithms["candidate_generator"]

    # One candidate list per user, collected in a single pass and exploded into
    # one row per (user, item) instead of writing a list into each cell with .loc.
    candidates_per_user = [
        candidate_generator.recommend(user_id, cutoff = cutoff)
        for user_id in tqdm(range(n_users_features))
    ]
    training_dataframe = (
        pd.Series(candidates_per_user,
                  index=pd.RangeIndex(n_users_features, name="UserID"),
                  dtype=object)
        .to_frame("ItemID")
        .explode("ItemID")
    )

    # Positive pairs are the stored entries of URM_label.
    URM_label_coo = sps.coo_matrix(URM_label)
    correct_recommendations = pd.DataFrame({"UserID": URM_label_coo.row,
                                            "ItemID": URM_label_coo.col})

    # Left merge keeps every candidate; the indicator column tells which ones matched a positive pair.
    training_dataframe = pd.merge(training_dataframe, correct_recommendations, on=['UserID','ItemID'], how='left', indicator='Exist')
    training_dataframe["Label"] = training_dataframe["Exist"] == "both"
    training_dataframe = training_dataframe.drop(columns = ['Exist'])

    # Index by user so each user's block of candidates can be addressed with .loc.
    training_dataframe = training_dataframe.set_index('UserID')

    scoring_algorithms = {
        rec_label: rec_instance
        for rec_label, rec_instance in other_algorithms.items()
        if rec_label != "candidate_generator"
    }

    for user_id in tqdm(range(n_users_features)):
        # The candidate list depends only on the user, so it is looked up once
        # rather than once per recommender.
        item_list = training_dataframe.loc[user_id, "ItemID"].values.tolist()

        for rec_label, rec_instance in scoring_algorithms.items():
            all_item_scores = rec_instance._compute_item_score([user_id], items_to_compute = item_list)

            training_dataframe.loc[user_id, rec_label] = all_item_scores[0, item_list]

    # The index is named "UserID", so reset_index() turns it back into a column.
    training_dataframe = training_dataframe.reset_index()

    # Stored entries per column (items) and per row (users) of URM_features.
    item_popularity = np.ediff1d(sps.csc_matrix(URM_features).indptr)
    training_dataframe['item_popularity'] = item_popularity[training_dataframe["ItemID"].values.astype(int)]

    user_popularity = np.ediff1d(sps.csr_matrix(URM_features).indptr)
    training_dataframe['user_profile_len'] = user_popularity[training_dataframe["UserID"].values.astype(int)]

    # Rows of the same user must be contiguous: the ranker receives per-user group sizes.
    training_dataframe = training_dataframe.sort_values("UserID").reset_index(drop=True)

    return training_dataframe

In [11]:
def create_dataframe_without_label(URM_features,other_algorithms,cutoff):
    """Build the unlabelled candidate table scored by the ranker at inference time.

    For every user (row of ``URM_features``) the recommender stored under
    ``other_algorithms["candidate_generator"]`` proposes ``cutoff`` items.
    Each (user, item) candidate becomes one row with:

    * one column per remaining key of ``other_algorithms`` holding the score
      that recommender's ``_compute_item_score`` assigns to the candidate;
    * ``item_popularity``: number of stored entries in the item's column of
      ``URM_features``;
    * ``user_profile_len``: number of stored entries in the user's row of
      ``URM_features``.

    Parameters
    ----------
    URM_features : sparse matrix
        User x item matrix defining the users and the popularity features.
    other_algorithms : dict
        Maps a feature name to a fitted recommender; must contain the key
        ``"candidate_generator"``.
    cutoff : int
        Number of candidates requested per user.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, sorted by ``UserID``, with a fresh 0..n-1 index.
    """
    n_users_features = URM_features.shape[0]
    candidate_generator = other_algorithms["candidate_generator"]

    # One candidate list per user, collected in a single pass and exploded into
    # one row per (user, item) instead of writing a list into each cell with .loc.
    candidates_per_user = [
        candidate_generator.recommend(user_id, cutoff = cutoff)
        for user_id in tqdm(range(n_users_features))
    ]
    training_dataframe = (
        pd.Series(candidates_per_user,
                  index=pd.RangeIndex(n_users_features, name="UserID"),
                  dtype=object)
        .to_frame("ItemID")
        .explode("ItemID")
    )

    scoring_algorithms = {
        rec_label: rec_instance
        for rec_label, rec_instance in other_algorithms.items()
        if rec_label != "candidate_generator"
    }

    for user_id in tqdm(range(n_users_features)):
        # The candidate list depends only on the user, so it is looked up once
        # rather than once per recommender.
        item_list = training_dataframe.loc[user_id, "ItemID"].values.tolist()

        for rec_label, rec_instance in scoring_algorithms.items():
            all_item_scores = rec_instance._compute_item_score([user_id], items_to_compute = item_list)

            training_dataframe.loc[user_id, rec_label] = all_item_scores[0, item_list]

    # The index is named "UserID", so reset_index() turns it back into a column.
    training_dataframe = training_dataframe.reset_index()

    # Stored entries per column (items) and per row (users) of URM_features.
    item_popularity = np.ediff1d(sps.csc_matrix(URM_features).indptr)
    training_dataframe['item_popularity'] = item_popularity[training_dataframe["ItemID"].values.astype(int)]

    user_popularity = np.ediff1d(sps.csr_matrix(URM_features).indptr)
    training_dataframe['user_profile_len'] = user_popularity[training_dataframe["UserID"].values.astype(int)]

    # Keep each user's candidates contiguous and give the frame a clean index.
    training_dataframe = training_dataframe.sort_values("UserID").reset_index(drop=True)

    return training_dataframe

### Creating the hybrid recommender

In [12]:
def evaluate_algorithm(URM_test, recommender_object, at=10):
    """Mean Average Precision of a recommender over the users present in ``URM_test``.

    Parameters
    ----------
    URM_test : sparse matrix
        User x item matrix whose stored entries are the relevant items.
    recommender_object : object
        Fitted recommender exposing ``recommend(user_id, cutoff=...)``.
    at : int, default 10
        Length of the recommendation list requested for every user.

    Returns
    -------
    float
        Average of the per-user AP over users with at least one relevant item;
        0.0 when no user has any.
    """
    # The row slicing below relies on CSR's indptr/indices layout.
    URM_test = sps.csr_matrix(URM_test)

    cumulative_AP = 0.0
    num_eval = 0

    for user_id in tqdm(range(URM_test.shape[0])):

        # Relevant items of this user = column indices stored in its row.
        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id+1]]

        # Users without test interactions cannot be evaluated.
        if len(relevant_items) == 0:
            continue

        # `at` was previously accepted but never forwarded, so the requested
        # list length was ignored.
        recommended_items = recommender_object.recommend(user_id, cutoff=at)
        num_eval += 1

        cumulative_AP += AP(recommended_items, relevant_items)

    # Avoid ZeroDivisionError when no user has test interactions.
    if num_eval == 0:
        return 0.0

    return cumulative_AP / num_eval

def AP(recommended_items, relevant_items):
    """Average precision of one ranked recommendation list.

    Parameters
    ----------
    recommended_items : array-like
        Recommended item IDs in rank order, without duplicates.
    relevant_items : array-like
        Item IDs considered relevant for the user, without duplicates.

    Returns
    -------
    float
        Sum of precision@k over the ranks k holding a relevant item, divided by
        min(number of relevant items, list length). 0.0 when either input is empty.
    """
    recommended_items = np.asarray(recommended_items)
    relevant_items = np.asarray(relevant_items)

    # An empty list or an empty ground truth would make the normaliser zero.
    if recommended_items.shape[0] == 0 or relevant_items.shape[0] == 0:
        return 0.0

    # np.isin replaces the deprecated np.in1d; identical result for 1-D input.
    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    # Cumulative sum gives the hits so far: precision at 1, at 2, at 3 ...
    # Multiplying by is_relevant keeps only the ranks that hold a hit.
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    ap_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return ap_score

In [13]:
def evaluate_algorithm_xgboost(URM_test,dataframe_to_evaluate,model,at=10):
    """Mean Average Precision of a fitted ranker over the users present in ``URM_test``.

    Parameters
    ----------
    URM_test : scipy.sparse CSR matrix
        User x item matrix whose stored entries are the relevant items.
    dataframe_to_evaluate : pandas.DataFrame
        Candidate table with a ``UserID`` and an ``ItemID`` column plus the
        features the model expects.
    model : object
        Fitted model exposing ``predict(dataframe)``; higher score = better rank.
    at : int, default 10
        Number of top-ranked candidates kept per user.

    Returns
    -------
    float
        Average of the per-user AP over users with at least one relevant item;
        0.0 when no user has any. A user with relevant items but no candidate
        rows counts with AP 0.
    """
    # Row positions of every user's candidates, computed once instead of
    # scanning the whole frame with a boolean mask for every user.
    rows_by_user = dataframe_to_evaluate.groupby("UserID", observed=True).indices

    cumulative_AP = 0.0
    num_eval = 0

    for user_id in tqdm(range(URM_test.shape[0])):

        # Relevant items of this user = column indices stored in its row.
        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id+1]]

        # Users without test interactions cannot be evaluated.
        if len(relevant_items) == 0:
            continue

        num_eval += 1

        user_rows = rows_by_user.get(user_id)
        if user_rows is None or len(user_rows) == 0:
            # Nothing to rank for this user: contributes an AP of 0.
            continue

        X_to_predict = dataframe_to_evaluate.iloc[user_rows].copy()

        preds = model.predict(X_to_predict)
        X_to_predict["rank"] = preds
        X_to_predict = X_to_predict.sort_values(by = "rank", ascending = False)
        recommended_items = np.array(X_to_predict["ItemID"])[:at]

        cumulative_AP += AP(recommended_items, relevant_items)

    # Avoid ZeroDivisionError when no user has test interactions.
    if num_eval == 0:
        return 0.0

    return cumulative_AP / num_eval

In [14]:
# SLIM ElasticNet on the training split, fitted with the parameters stored in
# best_pars_recall_dict; its W_sparse is later blended into the candidate generator.
SLIM_ElasticRecall = SLIMElasticNetRecommender(URM_train)
SLIM_ElasticRecall.fit(
    **best_pars_recall_dict["SLIMElasticNetRecommender"]
)

SLIMElasticNetRecommender: URM Detected 166 ( 1.3%) users with no interactions.
SLIMElasticNetRecommender: URM Detected 63 ( 0.3%) items with no interactions.
SLIMElasticNetRecommender: Processed 7705 (34.7%) in 5.00 min. Items per second: 25.68
SLIMElasticNetRecommender: Processed 16460 (74.1%) in 10.00 min. Items per second: 27.43
SLIMElasticNetRecommender: Processed 22222 (100.0%) in 13.18 min. Items per second: 28.09


In [15]:
# SLIM ElasticNet on the training split, fitted with the parameters stored in
# best_pars_dict; used as the "SLIM" feature and inside best_hybrid.
SLIM_Elastic = SLIMElasticNetRecommender(URM_train)
SLIM_Elastic.fit(
    **best_pars_dict["SLIMElasticNetRecommender"]
)

SLIMElasticNetRecommender: URM Detected 166 ( 1.3%) users with no interactions.
SLIMElasticNetRecommender: URM Detected 63 ( 0.3%) items with no interactions.
SLIMElasticNetRecommender: Processed 10353 (46.6%) in 5.00 min. Items per second: 34.51
SLIMElasticNetRecommender: Processed 22222 (100.0%) in 9.86 min. Items per second: 37.54


In [16]:
# RP3beta on the training split with the parameters from best_pars_dict;
# used as the "RP3beta" feature and inside best_hybrid.
RP3Beta = RP3betaRecommender(URM_train)
RP3Beta.fit(
    **best_pars_dict["RP3betaRecommender"]
)

RP3betaRecommender: URM Detected 166 ( 1.3%) users with no interactions.
RP3betaRecommender: URM Detected 63 ( 0.3%) items with no interactions.
RP3betaRecommender: Similarity column 22222 (100.0%), 3306.10 column/sec. Elapsed time 6.72 sec


In [17]:
# RP3beta on the training split with the parameters from best_pars_recall_dict;
# its W_sparse is later blended into the candidate generator.
RP3BetaRecall = RP3betaRecommender(URM_train)
RP3BetaRecall.fit(
    **best_pars_recall_dict["RP3betaRecommender"]
)

RP3betaRecommender: URM Detected 166 ( 1.3%) users with no interactions.
RP3betaRecommender: URM Detected 63 ( 0.3%) items with no interactions.
RP3betaRecommender: Similarity column 22222 (100.0%), 1393.48 column/sec. Elapsed time 15.95 sec


In [18]:
# TopPop recommender on the training split; supplies the "TopPop" feature column.
topPop = TopPop(URM_train)
topPop.fit()

TopPopRecommender: URM Detected 166 ( 1.3%) users with no interactions.
TopPopRecommender: URM Detected 63 ( 0.3%) items with no interactions.


In [19]:
# Item-based KNN on the training split with the parameters from best_pars_dict;
# used as the "KNN" feature and inside best_hybrid.
item_knn = ItemKNNCFRecommender(URM_train)
item_knn.fit(**best_pars_dict["ItemKNNCFRecommender"])

ItemKNNCFRecommender: URM Detected 166 ( 1.3%) users with no interactions.
ItemKNNCFRecommender: URM Detected 63 ( 0.3%) items with no interactions.
Similarity column 22222 (100.0%), 4523.27 column/sec. Elapsed time 4.91 sec


In [20]:
# Item-based KNN on the training split with the parameters from best_pars_recall_dict;
# its W_sparse is later blended into the candidate generator.
item_knnRecall = ItemKNNCFRecommender(URM_train)
item_knnRecall.fit(**best_pars_recall_dict["ItemKNNCFRecommender"])

ItemKNNCFRecommender: URM Detected 166 ( 1.3%) users with no interactions.
ItemKNNCFRecommender: URM Detected 63 ( 0.3%) items with no interactions.
Similarity column 22222 (100.0%), 4372.16 column/sec. Elapsed time 5.08 sec


In [21]:
# EASE_R on the training split with the parameters from best_pars_dict; supplies the "EASER" feature.
ease_r = EASE_R_Recommender(URM_train)
ease_r.fit(**best_pars_dict["EASE_R_Recommender"])

EASE_R_Recommender: URM Detected 166 ( 1.3%) users with no interactions.
EASE_R_Recommender: URM Detected 63 ( 0.3%) items with no interactions.
EASE_R_Recommender: Fitting model... 
EASE_R_Recommender: Fitting model... done in 7.00 min


In [22]:
# Hybrid item-item similarity: fixed-weight sum (0.6 / 0.2 / 0.2) of the SLIM, RP3beta and
# ItemKNN similarity matrices, served through a custom-similarity KNN recommender.
new_similarity = 0.6 * SLIM_Elastic.W_sparse + 0.2 * RP3Beta.W_sparse + 0.2 * item_knn.W_sparse
best_hybrid = ItemKNNCustomSimilarityRecommender(URM_train)
best_hybrid.fit(new_similarity)

ItemKNNCustomSimilarityRecommender: URM Detected 166 ( 1.3%) users with no interactions.
ItemKNNCustomSimilarityRecommender: URM Detected 63 ( 0.3%) items with no interactions.


In [23]:
# SLIM BPR (Cython implementation) on the training split with the parameters from best_pars_dict;
# supplies the "SLIMbpr" feature.
SLIM_bpr=SLIM_BPR_Cython(URM_train)
SLIM_bpr.fit(**best_pars_dict["SLIM_BPR_Cython"])

SLIM_BPR_Recommender: URM Detected 166 ( 1.3%) users with no interactions.
SLIM_BPR_Recommender: URM Detected 63 ( 0.3%) items with no interactions.
SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 28782.00 MB (89.64%) of 32110.00 MB, required is 1975.27 MB. Using dense matrix.
Processed 12638 (100.0%) in 0.85 sec. BPR loss is 4.64E-03. Sample per second: 14831
SLIM_BPR_Recommender: Epoch 1 of 445. Elapsed time 0.06 sec
Processed 12638 (100.0%) in 0.91 sec. BPR loss is 1.09E-02. Sample per second: 13822
SLIM_BPR_Recommender: Epoch 2 of 445. Elapsed time 0.12 sec
Processed 12638 (100.0%) in 0.98 sec. BPR loss is 2.04E-02. Sample per second: 12929
SLIM_BPR_Recommender: Epoch 3 of 445. Elapsed time 0.19 sec
Processed 12638 (100.0%) in 1.04 sec. BPR loss is 2.43E-02. Sample per second: 12197
SLIM_BPR_Recommender: Epoch 4 of 445. Elapsed time 0.25 sec
Processed 12638 (100.0%) in 0.10 sec. BPR loss is 3.27E-02. Sample per second: 128999
SLIM_BPR_Recommender: 

In [24]:
# P3alpha on the training split with the parameters from best_pars_recall_dict;
# its W_sparse is later blended into the candidate generator.
P3aRecall=P3alphaRecommender(URM_train)
P3aRecall.fit(**best_pars_recall_dict["P3alphaRecommender"])

P3alphaRecommender: URM Detected 166 ( 1.3%) users with no interactions.
P3alphaRecommender: URM Detected 63 ( 0.3%) items with no interactions.
P3alphaRecommender: Similarity column 22222 (100.0%), 892.51 column/sec. Elapsed time 24.90 sec


In [25]:
# P3alpha on the training split with the parameters from best_pars_dict; supplies the "P3alpha" feature.
P3a=P3alphaRecommender(URM_train)
P3a.fit(**best_pars_dict["P3alphaRecommender"])

P3alphaRecommender: URM Detected 166 ( 1.3%) users with no interactions.
P3alphaRecommender: URM Detected 63 ( 0.3%) items with no interactions.
P3alphaRecommender: Similarity column 22222 (100.0%), 3250.93 column/sec. Elapsed time 6.84 sec


In [26]:
# IALS matrix factorization on the training split with the parameters from best_pars_dict;
# supplies the "IALS" feature.
IALSrec = IALSRecommender(URM_train)
IALSrec.fit(**best_pars_dict["IALSRecommender"])

IALSRecommender: URM Detected 166 ( 1.3%) users with no interactions.
IALSRecommender: URM Detected 63 ( 0.3%) items with no interactions.
IALSRecommender: Epoch 1 of 10. Elapsed time 1.44 min
IALSRecommender: Epoch 2 of 10. Elapsed time 2.93 min
IALSRecommender: Epoch 3 of 10. Elapsed time 4.39 min
IALSRecommender: Epoch 4 of 10. Elapsed time 5.86 min
IALSRecommender: Epoch 5 of 10. Elapsed time 7.30 min
IALSRecommender: Epoch 6 of 10. Elapsed time 8.76 min
IALSRecommender: Epoch 7 of 10. Elapsed time 10.23 min
IALSRecommender: Epoch 8 of 10. Elapsed time 11.66 min
IALSRecommender: Epoch 9 of 10. Elapsed time 13.13 min
IALSRecommender: Epoch 10 of 10. Elapsed time 14.59 min
IALSRecommender: Terminating at epoch 10. Elapsed time 14.59 min


In [27]:
# Candidate-generator similarity: weighted sum of the four similarity matrices
# fitted with the best_pars_recall_dict parameters.
new_similarityRecall = (
    0.3648920858420368 * SLIM_ElasticRecall.W_sparse
    + 0.30413965737024856 * RP3BetaRecall.W_sparse
    + 0.053940767085383985 * item_knnRecall.W_sparse
    + 0.27702748970233065 * P3aRecall.W_sparse
)

# Serve the blended similarity through a custom-similarity KNN recommender.
candidate_gen = ItemKNNCustomSimilarityRecommender(URM_train)
candidate_gen.fit(new_similarityRecall)

ItemKNNCustomSimilarityRecommender: URM Detected 166 ( 1.3%) users with no interactions.
ItemKNNCustomSimilarityRecommender: URM Detected 63 ( 0.3%) items with no interactions.


In [28]:
# Recommenders consumed by create_dataframe_with_label / create_dataframe_without_label.
# The key "candidate_generator" is special: that recommender proposes the candidate items.
# Every other key becomes the name of a score column in the feature table.
other_algorithms = {
    "TopPop": topPop,
    "RP3beta": RP3Beta,
    "SLIM": SLIM_Elastic,
    "KNN": item_knn,
    "EASER": ease_r,
    "best_hybrid": best_hybrid,
    "SLIMbpr":SLIM_bpr,
    "P3alpha":P3a,
    "IALS": IALSrec,
    "candidate_generator": candidate_gen
}

In [29]:
# --- Ranker configuration ---------------------------------------------------
cutoff = 249                     # candidates requested per user
n_estimators = 1323
learning_rate = 0.03253703223058349
reg_alpha = 26.431827628978155
reg_lambda = 0.11504013550215164
max_depth = 2
max_leaves = 8
grow_policy = "depthwise"
objective = "pairwise"           # combined below into "rank:pairwise"
booster = "gbtree"
random_seed = None               # no fixed seed is passed to the ranker

# --- Training table: candidates from URM_train models, labels from URM_test --
training_dataframe = create_dataframe_with_label(URM_train, URM_test, other_algorithms, cutoff)

# Rows per user; the table is sorted by UserID, so the sizes follow the row order.
groups = training_dataframe.groupby("UserID").size().values

y_train = training_dataframe["Label"]
X_train = training_dataframe.drop(columns=["Label"])

# The identifier columns are passed to XGBoost as categorical features.
X_train = X_train.astype({"UserID": "category", "ItemID": "category"})

# --- Model --------------------------------------------------------------------
XGB_model = XGBRanker(
    objective=f"rank:{objective}",
    booster=booster,
    tree_method="hist",  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
    enable_categorical=True,
    n_estimators=int(n_estimators),
    learning_rate=learning_rate,
    max_depth=int(max_depth),
    max_leaves=int(max_leaves),
    grow_policy=grow_policy,
    reg_alpha=reg_alpha,
    reg_lambda=reg_lambda,
    random_state=random_seed,
    verbosity=0,
)

XGB_model.fit(X_train, y_train, group=groups, verbose=True)




100%|██████████| 12638/12638 [00:17<00:00, 737.50it/s]
100%|██████████| 12638/12638 [02:38<00:00, 79.55it/s]


## Generate submission

In [30]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
SLIM_ElasticRecall = SLIMElasticNetRecommender(URM_all)
SLIM_ElasticRecall.fit(
    **best_pars_recall_dict["SLIMElasticNetRecommender"]
)


SLIMElasticNetRecommender: Processed 7104 (32.0%) in 5.00 min. Items per second: 23.67
SLIMElasticNetRecommender: Processed 15128 (68.1%) in 10.00 min. Items per second: 25.21
SLIMElasticNetRecommender: Processed 22222 (100.0%) in 14.11 min. Items per second: 26.25


In [31]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
SLIM_Elastic = SLIMElasticNetRecommender(URM_all)
SLIM_Elastic.fit(
    **best_pars_dict["SLIMElasticNetRecommender"]
)


SLIMElasticNetRecommender: Processed 9107 (41.0%) in 5.00 min. Items per second: 30.35
SLIMElasticNetRecommender: Processed 19396 (87.3%) in 10.00 min. Items per second: 32.32
SLIMElasticNetRecommender: Processed 22222 (100.0%) in 11.12 min. Items per second: 33.29


In [32]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
RP3Beta = RP3betaRecommender(URM_all)
RP3Beta.fit(
    **best_pars_dict["RP3betaRecommender"]
)



RP3betaRecommender: Similarity column 22222 (100.0%), 3116.43 column/sec. Elapsed time 7.13 sec


In [33]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
RP3BetaRecall = RP3betaRecommender(URM_all)
RP3BetaRecall.fit(
    **best_pars_recall_dict["RP3betaRecommender"]
)

RP3betaRecommender: Similarity column 22222 (100.0%), 1311.66 column/sec. Elapsed time 16.94 sec


In [34]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
P3aRecall=P3alphaRecommender(URM_all)
P3aRecall.fit(**best_pars_recall_dict["P3alphaRecommender"])

P3alphaRecommender: Similarity column 22222 (100.0%), 803.27 column/sec. Elapsed time 27.66 sec


In [35]:
# Refit on the full URM_all. Besides the "TopPop" feature, this global instance is what
# generate_submission_xgb falls back to for target users absent from the training data.
topPop = TopPop(URM_all)
topPop.fit()



In [36]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
item_knn = ItemKNNCFRecommender(URM_all)
item_knn.fit(**best_pars_dict["ItemKNNCFRecommender"])



Similarity column 22222 (100.0%), 4194.95 column/sec. Elapsed time 5.30 sec


In [37]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
item_knnRecall = ItemKNNCFRecommender(URM_all)
item_knnRecall.fit(**best_pars_recall_dict["ItemKNNCFRecommender"])

Similarity column 22222 (100.0%), 4224.07 column/sec. Elapsed time 5.26 sec


In [38]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
ease_r = EASE_R_Recommender(URM_all)
ease_r.fit(**best_pars_dict["EASE_R_Recommender"])



EASE_R_Recommender: Fitting model... 
EASE_R_Recommender: Fitting model... done in 7.46 min


In [39]:
# Rebuild the 0.6 / 0.2 / 0.2 hybrid similarity from the models refitted on URM_all.
new_similarity = 0.6 * SLIM_Elastic.W_sparse + 0.2 * RP3Beta.W_sparse + 0.2 * item_knn.W_sparse
best_hybrid = ItemKNNCustomSimilarityRecommender(URM_all)
best_hybrid.fit(new_similarity)

In [40]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
SLIM_bpr=SLIM_BPR_Cython(URM_all)
SLIM_bpr.fit(**best_pars_dict["SLIM_BPR_Cython"])

SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 25585.00 MB (79.68%) of 32110.00 MB, required is 1975.27 MB. Using dense matrix.
Processed 12638 (100.0%) in 0.86 sec. BPR loss is 6.05E-03. Sample per second: 14690
SLIM_BPR_Recommender: Epoch 1 of 445. Elapsed time 0.07 sec
Processed 12638 (100.0%) in 0.93 sec. BPR loss is 1.65E-02. Sample per second: 13629
SLIM_BPR_Recommender: Epoch 2 of 445. Elapsed time 0.14 sec
Processed 12638 (100.0%) in 1.00 sec. BPR loss is 2.95E-02. Sample per second: 12691
SLIM_BPR_Recommender: Epoch 3 of 445. Elapsed time 0.21 sec
Processed 12638 (100.0%) in 1.07 sec. BPR loss is 3.73E-02. Sample per second: 11830
SLIM_BPR_Recommender: Epoch 4 of 445. Elapsed time 0.28 sec
Processed 12638 (100.0%) in 0.14 sec. BPR loss is 4.88E-02. Sample per second: 91239
SLIM_BPR_Recommender: Epoch 5 of 445. Elapsed time 0.35 sec
Processed 12638 (100.0%) in 0.21 sec. BPR loss is 5.66E-02. Sample per second: 58812
SLIM_BPR_Recommender: Epoch

In [41]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
P3a=P3alphaRecommender(URM_all)
P3a.fit(**best_pars_dict["P3alphaRecommender"])

P3alphaRecommender: Similarity column 22222 (100.0%), 3089.29 column/sec. Elapsed time 7.19 sec


In [42]:
# Refit on the full URM_all for the submission; rebinds the name used for the URM_train model.
IALSrec = IALSRecommender(URM_all)
IALSrec.fit(**best_pars_dict["IALSRecommender"])

IALSRecommender: Epoch 1 of 10. Elapsed time 1.53 min
IALSRecommender: Epoch 2 of 10. Elapsed time 3.19 min
IALSRecommender: Epoch 3 of 10. Elapsed time 4.84 min
IALSRecommender: Epoch 4 of 10. Elapsed time 6.40 min
IALSRecommender: Epoch 5 of 10. Elapsed time 7.96 min
IALSRecommender: Epoch 6 of 10. Elapsed time 9.52 min
IALSRecommender: Epoch 7 of 10. Elapsed time 11.12 min
IALSRecommender: Epoch 8 of 10. Elapsed time 12.62 min
IALSRecommender: Epoch 9 of 10. Elapsed time 14.39 min
IALSRecommender: Epoch 10 of 10. Elapsed time 15.90 min
IALSRecommender: Terminating at epoch 10. Elapsed time 15.90 min


In [43]:
# Rebuild the candidate-generator similarity from the models refitted on URM_all,
# with the same blending weights used on the training split.
new_similarityRecall = (
    0.3648920858420368 * SLIM_ElasticRecall.W_sparse
    + 0.30413965737024856 * RP3BetaRecall.W_sparse
    + 0.053940767085383985 * item_knnRecall.W_sparse
    + 0.27702748970233065 * P3aRecall.W_sparse
)

candidate_gen = ItemKNNCustomSimilarityRecommender(URM_all)
candidate_gen.fit(new_similarityRecall)

In [44]:
# Same mapping as for training, now pointing at the recommenders refitted on URM_all.
# The keys must stay identical so the inference table has the same feature columns.
# "candidate_generator" proposes the candidates; every other key names a score column.
other_algorithms = {
    "TopPop": topPop,
    "RP3beta": RP3Beta,
    "SLIM": SLIM_Elastic,
    "KNN": item_knn,
    "EASER": ease_r,
    "best_hybrid": best_hybrid,
    "SLIMbpr":SLIM_bpr,
    "P3alpha":P3a,
    "IALS": IALSrec,
    "candidate_generator": candidate_gen
}

Deallocating Cython objects


In [45]:
# Build the inference table with the same number of candidates per user that was used to
# train the ranker: reuse `cutoff` from the training cell instead of repeating the literal 249.
final_dataframe=create_dataframe_without_label(URM_all,other_algorithms,cutoff)

100%|██████████| 12638/12638 [00:18<00:00, 700.38it/s]
100%|██████████| 12638/12638 [02:48<00:00, 75.00it/s]


In [46]:
# Cast the identifier columns to pandas categoricals, as was done for X_train before fitting.
# NOTE(review): astype("category") derives the categories from the values present in *this*
# frame. If its UserID/ItemID value sets differ from X_train's, the integer category codes may
# not line up with those seen during training — confirm how the installed XGBoost version
# handles categorical encoding between fit and predict.
final_dataframe["UserID"] = final_dataframe["UserID"].astype("category")
final_dataframe["ItemID"] = final_dataframe["ItemID"].astype("category")

In [47]:
def generate_submission_xgb(model,dataframe,cutoff=10):
    """Rank every target user's candidates with ``model`` and write the submission CSV.

    Reads the notebook globals ``df_users`` (target users, original IDs),
    ``user_original_ID_to_index`` / ``item_original_ID_to_index`` (original ID ->
    matrix index) and ``topPop`` (fallback recommender).

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(dataframe)``; higher score = better rank.
    dataframe : pandas.DataFrame
        Candidate table with a ``UserID`` and an ``ItemID`` column (matrix
        indices) plus the features the model expects.
    cutoff : int, default 10
        Number of items written per user.

    Returns
    -------
    str
        Name of the CSV file written in the current working directory, with
        header ``user_id,item_list`` and one row per target user. The item list
        holds space-separated original item IDs.
    """
    # Only `datetime` itself is imported at the top of the notebook.
    from datetime import timedelta

    # The file name carries the current time shifted by one hour (presumably a
    # timezone adjustment). Real date arithmetic wraps 23:xx to 0:xx instead of "24:xx".
    shifted_now = datetime.now() + timedelta(hours=1)
    minutes_seconds = shifted_now.strftime("%M:%S")
    output_file = f'submission_{shifted_now.hour}:{minutes_seconds}.csv'

    # Matrix index -> original item ID, built once instead of scanning the
    # whole mapping for every recommended item.
    index_to_original_item = dict(zip(item_original_ID_to_index.tolist(),
                                      item_original_ID_to_index.index.tolist()))

    def _format_items(item_indices):
        # Space-separated original IDs, in rank order.
        return ' '.join(str(index_to_original_item[int(item_index)]) for item_index in item_indices)

    # Row positions of every user's candidates, computed once instead of
    # masking the whole frame for every user.
    rows_by_user = dataframe.groupby("UserID", observed=True).indices

    # Users seen in the training data. The mapping's own index is used rather than
    # the global `original_id`, which is reused for items and users while loading.
    known_users = user_original_ID_to_index.index

    # Computed lazily, at most once.
    fallback_items = None

    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Write the header
        writer.writerow(['user_id', 'item_list'])

        # Generate and write recommendations for each user
        for user_id in tqdm(df_users["UserID"]):
            recommended_items = None

            if user_id in known_users:
                user_rows = rows_by_user.get(user_original_ID_to_index[user_id])
                if user_rows is not None and len(user_rows) > 0:
                    X_to_predict = dataframe.iloc[user_rows].copy()
                    preds = model.predict(X_to_predict)
                    X_to_predict["rank"] = preds
                    X_to_predict = X_to_predict.sort_values(by = "rank", ascending = False)
                    recommended_items = np.array(X_to_predict["ItemID"])[:cutoff]

            if recommended_items is None:
                # Unknown user, or a known user without candidate rows: popularity fallback.
                if fallback_items is None:
                    # NOTE(review): the list is requested for the user whose original ID is 3.
                    # Presumably that user's seen items are filtered out, and the lookup
                    # raises KeyError if ID 3 is absent — confirm the intended behaviour.
                    fallback_items = topPop.recommend(user_original_ID_to_index[3])[:cutoff]
                recommended_items = fallback_items

            writer.writerow([user_id, _format_items(recommended_items)])

    return output_file

In [48]:
# Write the submission CSV using the trained ranker and the URM_all-based candidate table;
# output_file receives the name of the file written.
output_file = generate_submission_xgb(XGB_model,final_dataframe)

100%|██████████| 10882/10882 [02:23<00:00, 75.72it/s]
