In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.sparse as sps

In [2]:
path = "recommender-system-2022-challenge-polimi-data/interactions_and_impressions.csv"

Interactions_and_Impressions = pd.read_csv(filepath_or_buffer=path,
                                           sep=",",
                                           header=1,
                                           engine='python',
                                           names=['UserId', 'ItemId', 'ImpressionList', 'Data'])
Interactions_and_Impressions

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,21,,0
1,0,21,,0
2,0,21,20212223242526272829,0
3,0,21,,1
4,0,21,,1
...,...,...,...,...
5826500,41628,20448,,0
5826501,41628,20896,,1
5826502,41628,21506,,1
5826503,41628,22882,,0


## Data Preparation

In [3]:
userId_unique = Interactions_and_Impressions["UserId"].unique()
itemId_unique = Interactions_and_Impressions["ItemId"].unique()

In [4]:
mapped_id, original_id = pd.factorize(Interactions_and_Impressions["UserId"].unique())
user_original_Id_to_index = pd.Series(mapped_id, index=original_id)

mapped_id, original_id = pd.factorize(Interactions_and_Impressions["ItemId"].unique())
item_original_Id_to_index = pd.Series(mapped_id, index=original_id)


In [5]:
Interactions_and_Impressions["UserId"] = Interactions_and_Impressions["UserId"].map(user_original_Id_to_index)
Interactions_and_Impressions["ItemId"] = Interactions_and_Impressions["ItemId"].map(item_original_Id_to_index)

In [6]:
Interactions_and_Impressions.head(15)

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,0,,0
1,0,0,,0
2,0,0,20212223242526272829,0
3,0,0,,1
4,0,0,,1
5,0,0,,1
6,0,0,,1
7,0,0,,1
8,0,0,,1
9,0,0,,1


## URM

We want an Implicit/Binary Matrix, so:
1) We drop the last two columns, only keeping User and Item Id
2) We remove duplicates
3) Split data into Train/Test/Validation
4) We build the Sparse URM using Scipy

##### Step 1

In [7]:
Distinct_Interactions = Interactions_and_Impressions.copy(deep=True)
Distinct_Interactions = Distinct_Interactions.drop(["ImpressionList", "Data"], axis=1)

##### Step 2

In [8]:
Distinct_Interactions = Distinct_Interactions.drop_duplicates()
Distinct_Interactions['Rating'] = 1
Distinct_Interactions

Unnamed: 0,UserId,ItemId,Rating
0,0,0,1
12,0,1,1
27,0,2,1
28,0,3,1
29,0,4,1
...,...,...,...
5826500,41628,3699,1
5826501,41628,12693,1
5826502,41628,15131,1
5826503,41628,18127,1


In [9]:
unique_userId = Distinct_Interactions.UserId.unique()
num_users = len(unique_userId)
print(num_users)

unique_itemId = Distinct_Interactions.ItemId.unique()
num_items = len(unique_itemId)
print(num_items)

41629
24507


##### Step 3

In [10]:
from sklearn.model_selection import train_test_split
import numpy as np

seed = 1234


(user_ids_training, user_ids_test,
 item_ids_training, item_ids_test,
 ratings_training, ratings_test) = train_test_split(Distinct_Interactions.UserId,
                                                    Distinct_Interactions.ItemId,
                                                    Distinct_Interactions.Rating,
                                                    test_size=0.20,
                                                    shuffle=True,
                                                    random_state=seed)


(user_ids_training, user_ids_validation,
 item_ids_training, item_ids_validation,
 ratings_training, ratings_validation) = train_test_split(user_ids_training,
                                                          item_ids_training,
                                                          ratings_training,
                                                          test_size=0.10,
                                                          )

##### Step 5

In [11]:
import scipy.sparse as sp
urm_train = sp.csr_matrix((ratings_training, (user_ids_training, item_ids_training)),
                          shape=(num_users, num_items))

urm_test = sp.csr_matrix((ratings_test, (user_ids_test, item_ids_test)),
                         shape=(num_users, num_items))

urm_validation = sp.csr_matrix((ratings_validation, (user_ids_validation, item_ids_validation)),
                               shape=(num_users, num_items))

urm_train, urm_test, urm_validation

(<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
 	with 1119339 stored elements in Compressed Sparse Row format>,
 <41629x24507 sparse matrix of type '<class 'numpy.int64'>'
 	with 310928 stored elements in Compressed Sparse Row format>,
 <41629x24507 sparse matrix of type '<class 'numpy.int64'>'
 	with 124372 stored elements in Compressed Sparse Row format>)

# SLIM BPR

In [12]:
from Recommenders.SLIM.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython

SLIM_BPR = SLIM_BPR_Cython(urm_train)

In [14]:
import pyximport
pyximport.install()

(None, <pyximport.pyximport.PyxImporter at 0x7fadc74a22b0>)

In [15]:
#prepare the environment to run Cython code
!python ../run_compile_all_cython.py

run_compile_all_cython: Found 0 Cython files in 4 folders...
run_compile_all_cython: All files will be compiled using your current python environment: '/home/sam/anaconda3/envs/RecSysFramework/bin/python'
run_compile_all_cython: Compilation finished. SUCCESS.
Compilation log can be found here: './result_experiments/run_compile_all_cython.txt'


In [16]:
SLIM_BPR.fit()

SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 7034.00 MB (44.28%) of 15887.00 MB, required is 2402.37 MB. Using dense matrix.
Processed 41629 (100.0%) in 0.15 sec. BPR loss is 4.83E-07. Sample per second: 269767
SLIM_BPR_Recommender: Epoch 1 of 300. Elapsed time 0.12 sec
Processed 41629 (100.0%) in 0.32 sec. BPR loss is 1.43E-06. Sample per second: 131175
SLIM_BPR_Recommender: Epoch 2 of 300. Elapsed time 0.29 sec
Processed 41629 (100.0%) in 0.48 sec. BPR loss is 2.54E-06. Sample per second: 86629
SLIM_BPR_Recommender: Epoch 3 of 300. Elapsed time 0.45 sec
Processed 41629 (100.0%) in 0.65 sec. BPR loss is 3.70E-06. Sample per second: 64406
SLIM_BPR_Recommender: Epoch 4 of 300. Elapsed time 0.61 sec
Processed 41629 (100.0%) in 0.80 sec. BPR loss is 4.50E-06. Sample per second: 52276
SLIM_BPR_Recommender: Epoch 5 of 300. Elapsed time 0.76 sec
Processed 41629 (100.0%) in 0.94 sec. BPR loss is 5.75E-06. Sample per second: 44241
SLIM_BPR_Recommender: Epoc

# Evaluation

In [13]:
from Evaluation.Evaluator import EvaluatorHoldout
e = EvaluatorHoldout(urm_test, [10])

EvaluatorHoldout: Ignoring 309 ( 0.7%) Users that have less than 1 test interactions


In [18]:
result = e.evaluateRecommender(SLIM_BPR)

EvaluatorHoldout: Processed 41320 (100.0%) in 21.01 sec. Users per second: 1967


In [26]:
str(result)[:250]

'(       PRECISION PRECISION_RECALL_MIN_DEN    RECALL       MAP MAP_MIN_DEN  \\\ncutoff                                                                      \n10      0.050586                 0.070637  0.062122  0.023955    0.033038   \n\n            MRR  '

# Hyperparameter Tuning

Experiment on epochs: let's see how much the number of epochs impacts map

In [17]:
epo = [200, 100, 50]
# for ep in epo:
SLIM_BPR.fit( epochs=50)
print("---------------------------------------------------------------------------------")
print(e.evaluateRecommender(SLIM_BPR))
print("---------------------------------------------------------------------------------")


SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 5486.00 MB (34.53%) of 15887.00 MB, required is 2402.37 MB. Using dense matrix.
Deallocating Cython objects
Processed 41629 (100.0%) in 1.09 sec. BPR loss is 4.56E-07. Sample per second: 38280
SLIM_BPR_Recommender: Epoch 1 of 50. Elapsed time 0.13 sec
Processed 41629 (100.0%) in 0.24 sec. BPR loss is 1.37E-06. Sample per second: 173031
SLIM_BPR_Recommender: Epoch 2 of 50. Elapsed time 0.28 sec
Processed 41629 (100.0%) in 0.40 sec. BPR loss is 2.41E-06. Sample per second: 103785
SLIM_BPR_Recommender: Epoch 3 of 50. Elapsed time 0.44 sec
Processed 41629 (100.0%) in 0.54 sec. BPR loss is 3.30E-06. Sample per second: 76550
SLIM_BPR_Recommender: Epoch 4 of 50. Elapsed time 0.59 sec
Processed 41629 (100.0%) in 0.69 sec. BPR loss is 4.33E-06. Sample per second: 60012
SLIM_BPR_Recommender: Epoch 5 of 50. Elapsed time 0.74 sec
Processed 41629 (100.0%) in 0.83 sec. BPR loss is 5.70E-06. Sample per second: 50229
SLI

n_epochs = 200 -> MAP = 0,023872
n_epochs = 100 -> MAP = 0,023804
n_epochs = 50 -> MAP = 0,023749




#### A Grid Search just to see the potential of SLIM BPR

In [28]:
# questo domani
learning_rates = [10e-4]
topKs = [600]
gammas = [0.0001, 0.00001]
beta_1s = [0.0001, 0.00001]
beta_2s = [0.0001, 0.00001]

In [29]:
result_file = open("slimBPR_result_grid_search_3x600.txt", "w")
result_file.write("LR   TK    G     B1     B2 \n")

for tk in topKs:
    for g in gammas:
        for b1 in beta_1s:
            for b2 in beta_2s:
                for lr in learning_rates:
                    SLIM_BPR.fit(learning_rate=lr, topK=tk, gamma=g, beta_1=b1, beta_2=b2, epochs=50)
                    r = e.evaluateRecommender(SLIM_BPR)

                    print("<------------------------------------->\n")
                    # stampa hyperparametri
                    print(str(lr) + "   " + str(tk) + "   " + str(g) + "   " + str(b1) + "   " + str(b2) + " \n")
                    result_file.write(str(lr) + "   " + str(tk) + "   " + str(g) + "   " + str(b1) + "   " + str(b2) + " \n" )

                    # stampa risultato
                    print(str(r)[:250] + "\n")
                    result_file.write(str(r)[:250] + "\n")

                    # aggiungi linea
                    print("<------------------------------------->\n")
                    result_file.write("<------------------------------------->\n")

result_file.close()

SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 5205.00 MB (32.76%) of 15887.00 MB, required is 2402.37 MB. Using dense matrix.
Deallocating Cython objects
Processed 41629 (100.0%) in 0.21 sec. BPR loss is 4.56E-05. Sample per second: 193899
SLIM_BPR_Recommender: Epoch 1 of 50. Elapsed time 0.09 sec
Processed 41629 (100.0%) in 0.32 sec. BPR loss is 1.46E-04. Sample per second: 130472
SLIM_BPR_Recommender: Epoch 2 of 50. Elapsed time 0.19 sec
Processed 41629 (100.0%) in 0.42 sec. BPR loss is 2.51E-04. Sample per second: 98460
SLIM_BPR_Recommender: Epoch 3 of 50. Elapsed time 0.30 sec
Processed 41629 (100.0%) in 0.53 sec. BPR loss is 3.49E-04. Sample per second: 78012
SLIM_BPR_Recommender: Epoch 4 of 50. Elapsed time 0.41 sec
Processed 41629 (100.0%) in 0.65 sec. BPR loss is 4.24E-04. Sample per second: 64297
SLIM_BPR_Recommender: Epoch 5 of 50. Elapsed time 0.52 sec
Processed 41629 (100.0%) in 0.75 sec. BPR loss is 5.16E-04. Sample per second: 55167
SLI

In [33]:
lambda_Is = [1e-1, 1e-2]
lambda_Js = [1e-1, 1e-2]

result_file = open("slimBPR_result_grid_search_i_j_2.txt", "w")
result_file.write("lam_i    lam_j\n")

for i in lambda_Is:
    for j in lambda_Js:
        SLIM_BPR.fit(learning_rate=1e4, topK=500, lambda_i=i, lambda_j=j, epochs=50)
        r = e.evaluateRecommender(SLIM_BPR)

        print("<------------------------------------->\n")
        # stampa hyperparametri
        print("i=" + str(i) + "   j=" + str(j) + " \n")
        result_file.write(str(i) + "   " + str(j) + " \n")

        # stampa risultato
        print(str(r)[:250] + "\n")
        result_file.write(str(r)[:250] + "\n")

        # aggiungi linea
        print("<------------------------------------->\n")
        result_file.write("<------------------------------------->\n")

result_file.close()

SLIM_BPR_Recommender: Automatic selection of fastest train mode. Available RAM is 4651.00 MB (29.28%) of 15887.00 MB, required is 2402.37 MB. Using sparse matrix.
Deallocating Cython objects
Processed 41629 (100.0%) in 1.29 sec. BPR loss is 3.60E+200. Sample per second: 32373
SLIM_BPR_Recommender: Epoch 1 of 50. Elapsed time 1.07 sec
Processed 41629 (100.0%) in 1.92 sec. BPR loss is 3.52E+266. Sample per second: 21695
SLIM_BPR_Recommender: Epoch 2 of 50. Elapsed time 2.70 sec
Processed 41629 (100.0%) in 3.18 sec. BPR loss is INF. Sample per second: 13101
SLIM_BPR_Recommender: Epoch 3 of 50. Elapsed time 4.96 sec
Processed 41629 (100.0%) in 3.13 sec. BPR loss is 2.24E+181. Sample per second: 13310
SLIM_BPR_Recommender: Epoch 4 of 50. Elapsed time 7.91 sec
Processed 41629 (100.0%) in 3.93 sec. BPR loss is 5.41E+228. Sample per second: 10594
SLIM_BPR_Recommender: Epoch 5 of 50. Elapsed time 11.71 sec
Processed 41629 (100.0%) in 5.61 sec. BPR loss is 7.61E+204. Sample per second: 7421
SLIM