In [4]:
import numpy as np
import matplotlib.pyplot as pyplot
import pandas as pd
import scipy.sparse as sps
%matplotlib inline
%load_ext Cython

from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Evaluation.Evaluator import EvaluatorHoldout


The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [5]:
# Kaggle locations of the same two files, kept for running on the competition platform:
# data_train_path="/kaggle/input/recommender-system-2023-challenge-polimi/data_train.csv"
# data_target_user_path="/kaggle/input/recommender-system-2023-challenge-polimi/data_target_users_test.csv"
# Local copies, resolved relative to the notebook's working directory.
data_train_path="data_train.csv"
data_target_user_path="data_target_users_test.csv"
# Interaction table; later cells read its 'row', 'col' and 'data' columns.
data_train = pd.read_csv(data_train_path)
# Target-user table; later cells read its 'user_id' column.
data_target = pd.read_csv(data_target_user_path)

In [6]:
# Build the user x item interaction matrix (URM) directly in sparse form.
# Going through a dense pivot table would allocate one float per (user, item)
# pair only to throw almost all of them away again.

# Largest user id that must get a row even if it has no interactions.
LAST_USER_ID = 13024

# pivot() used to reject repeated (row, col) pairs; keep failing loudly instead
# of letting the sparse constructor silently sum them.
if data_train.duplicated(subset=["row", "col"]).any():
    raise ValueError("data_train contains duplicate (row, col) interactions")

# Columns: one per distinct item id, in ascending id order.
item_ids = np.sort(data_train["col"].unique())
item_map = dict(enumerate(item_ids.tolist()))
# NOTE(review): user_map is keyed by position in data_target, not by URM row —
# confirm that this is what downstream code expects.
user_map = {i : user for i, user in enumerate(data_target["user_id"])}
item_map_inv = {item : i for i, item in item_map.items()}
user_map_inv = {user : i for i, user in user_map.items()}

# Rows: every user id present in the data plus all ids in 1..LAST_USER_ID,
# in ascending order, so users without interactions get an empty row.
user_ids = np.union1d(data_train["row"].unique(), np.arange(1, LAST_USER_ID + 1))

URM_all = sps.csr_matrix(
    (
        data_train["data"].fillna(0).to_numpy(dtype=np.float64),
        (
            # Both id arrays are sorted and contain every id looked up, so
            # searchsorted returns the exact position of each id.
            np.searchsorted(user_ids, data_train["row"].to_numpy()),
            np.searchsorted(item_ids, data_train["col"].to_numpy()),
        ),
    ),
    shape=(len(user_ids), len(item_ids)),
)
# Drop explicitly stored zeros so nnz counts real interactions only.
URM_all.eliminate_zeros()
del item_ids
del user_ids
#data_target["user_id"] = data_target["user_id"]
URM_all

<13024x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [7]:
# Seed the global NumPy RNG so that the random hold-out split (and therefore every
# number reported below) is the same after Restart & Run All.
# NOTE(review): assumes the split helper draws from NumPy's global RNG — TODO confirm.
np.random.seed(42)

# Random 80/20 hold-out of the interactions.
URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8)
n_users, n_items = URM_train.shape
# evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])



In [8]:
%%cython

import numpy as np
import time

from libc.stdlib cimport rand, srand, RAND_MAX

def train_multiple_epochs(URM_train, learning_rate_input, regularization_2_input, n_epochs, seed=None):
    """
    Learn a dense item-item similarity matrix by stochastic gradient descent on the
    squared prediction error, with L2 regularization.

    Each epoch draws ``URM_train.nnz`` interactions uniformly at random (with
    replacement). For a sampled (user, item, rating) the prediction is the sum over
    the user's profile of ``S[profile_item, item] * profile_rating``; the column of
    ``S`` belonging to the sampled item is then updated along the user's profile and
    the diagonal entry is reset to zero.

    :param URM_train: scipy sparse user x item matrix (any format; converted to CSR).
    :param learning_rate_input: SGD step size.
    :param regularization_2_input: L2 regularization coefficient.
    :param n_epochs: number of passes; each pass processes ``nnz`` sampled interactions.
    :param seed: optional seed for the sampling RNG; ``None`` keeps runs non-deterministic.
    :return: tuple ``(item_item_S, mean_loss, samples_per_second)`` where ``item_item_S``
             is an (n_items, n_items) float64 ndarray and the two scalars refer to the
             last epoch (both 0.0 when nothing was trained).
    """

    # Accept any sparse format: CSR gives the per-user profiles, the COO view
    # gives one (row, col, value) triplet per interaction for sampling.
    URM_train_csr = URM_train.tocsr()
    URM_train_coo = URM_train_csr.tocoo()

    cdef int n_items = URM_train_csr.shape[1]
    cdef int n_interactions = URM_train_csr.nnz

    # The typed memoryviews only accept C int / C double buffers, so cast explicitly
    # (no copy is made when the dtype already matches).
    cdef int[:] URM_train_coo_row = np.ascontiguousarray(URM_train_coo.row, dtype=np.intc)
    cdef int[:] URM_train_coo_col = np.ascontiguousarray(URM_train_coo.col, dtype=np.intc)
    cdef double[:] URM_train_coo_data = np.ascontiguousarray(URM_train_coo.data, dtype=np.float64)
    cdef int[:] URM_train_indices = np.ascontiguousarray(URM_train_csr.indices, dtype=np.intc)
    cdef int[:] URM_train_indptr = np.ascontiguousarray(URM_train_csr.indptr, dtype=np.intc)
    cdef double[:] URM_train_data = np.ascontiguousarray(URM_train_csr.data, dtype=np.float64)

    # Dense n_items x n_items model: memory grows quadratically with the item count.
    cdef double[:,:] item_item_S = np.zeros((n_items, n_items), dtype = np.float64)
    cdef double learning_rate = learning_rate_input
    cdef double regularization_2 = regularization_2_input
    cdef double loss = 0.0
    cdef double mean_loss = 0.0
    cdef double samples_per_second = 0.0
    # Timestamps are floating-point seconds; an integer type would drop the fraction.
    cdef double start_time, elapsed_time
    cdef double true_rating, predicted_rating, prediction_error, profile_rating
    cdef int start_profile, end_profile
    cdef int index, sample_num, sample_index, user_id, item_id, profile_item_id
    cdef int[:] sampled_indices

    # Nothing to sample from: return the untouched (all-zero) model.
    if n_interactions == 0:
        return np.array(item_item_S), mean_loss, samples_per_second

    # NumPy RNG instead of C rand(): rand() cannot exceed RAND_MAX, which the C
    # standard only guarantees to be 32767, so larger datasets would never see
    # most of their interactions; it is also not seedable from the caller.
    rng = np.random.RandomState(seed)

    for n_epoch in range(n_epochs):

        loss = 0.0
        start_time = time.time()

        # Draw the whole epoch's sample indices at once (uniform, with replacement).
        sampled_indices = rng.randint(0, n_interactions, size=n_interactions, dtype=np.intc)

        for sample_num in range(n_interactions):

            sample_index = sampled_indices[sample_num]

            user_id = URM_train_coo_row[sample_index]
            item_id = URM_train_coo_col[sample_index]
            true_rating = URM_train_coo_data[sample_index]

            # Compute prediction from the user's profile
            start_profile = URM_train_indptr[user_id]
            end_profile = URM_train_indptr[user_id+1]
            predicted_rating = 0.0

            for index in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[index]
                profile_rating = URM_train_data[index]
                predicted_rating += item_item_S[profile_item_id,item_id] * profile_rating

            # Compute prediction error, or gradient
            prediction_error = true_rating - predicted_rating
            loss += prediction_error * prediction_error

            # Update model, in this case the similarity column of the sampled item
            for index in range(start_profile, end_profile):
                profile_item_id = URM_train_indices[index]
                profile_rating = URM_train_data[index]
                item_item_S[profile_item_id,item_id] += learning_rate * (prediction_error * profile_rating -
                                                                         regularization_2 * item_item_S[profile_item_id,item_id])

            # Ensure diagonal is always zero (the sampled item is part of the profile,
            # so the update above touches S[item_id, item_id])
            item_item_S[item_id,item_id] = 0.0

        elapsed_time = time.time() - start_time
        mean_loss = loss / n_interactions
        # Guard against a clock that did not advance during a very short epoch.
        if elapsed_time > 0.0:
            samples_per_second = n_interactions / elapsed_time
        else:
            samples_per_second = float("inf")

        print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}".format(n_epoch+1, elapsed_time, mean_loss, samples_per_second))

    return np.array(item_item_S), mean_loss, samples_per_second

In [9]:
# Number of items = number of URM columns.
n_items = URM_train.shape[-1]

# SGD hyperparameters: step size and L2 regularization strength.
learning_rate = 1e-6
regularization_2 = 1e-3

# Train the item-item similarity; the call also reports the last epoch's mean
# loss and throughput.
item_item_S, loss, samples_per_second = train_multiple_epochs(
    URM_train,
    learning_rate,
    regularization_2,
    10,  # number of epochs
)

Epoch 1 complete in in 2.44 seconds, loss is 9.988E-01. Samples per second 157144.04
Epoch 2 complete in in 1.79 seconds, loss is 9.964E-01. Samples per second 214497.40
Epoch 3 complete in in 2.00 seconds, loss is 9.941E-01. Samples per second 191611.67
Epoch 4 complete in in 2.11 seconds, loss is 9.917E-01. Samples per second 181122.86
Epoch 5 complete in in 1.22 seconds, loss is 9.894E-01. Samples per second 312849.36
Epoch 6 complete in in 1.42 seconds, loss is 9.870E-01. Samples per second 269317.92
Epoch 7 complete in in 1.63 seconds, loss is 9.847E-01. Samples per second 235489.60
Epoch 8 complete in in 1.82 seconds, loss is 9.823E-01. Samples per second 210473.04
Epoch 9 complete in in 1.93 seconds, loss is 9.801E-01. Samples per second 198596.86
Epoch 10 complete in in 2.08 seconds, loss is 9.777E-01. Samples per second 184490.32


In [10]:
# Re-encode the dense learned similarity as a CSR matrix so it can be multiplied
# with the sparse URM; entries that are exactly zero are not stored.
item_item_S_sparse = sps.csr_matrix(item_item_S)
item_item_S_sparse

<22222x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 2377751 stored elements in Compressed Sparse Row format>

In [31]:
# Score every item for every user: (users x items) . (items x items).
predicted_ratings = URM_all.dot(item_item_S_sparse)

# Remove the items each user has already interacted with. The mask is taken from
# the sparsity pattern of the URM, so it stays sparse (no dense user x item
# temporaries) and is correct whatever the stored rating values are, not only
# when every rating equals 1.
seen_mask = URM_all.astype(bool)
predicted_ratings = predicted_ratings - predicted_ratings.multiply(seen_mask)
del seen_mask

predicted_ratings.todense(), predicted_ratings.shape

(matrix([[0.0106289 , 0.01160036, 0.01072798, ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.02337011, 0.01952597, ..., 0.        , 0.        ,
          0.        ],
         [0.00114513, 0.00080015, 0.00045769, ..., 0.        , 0.        ,
          0.        ],
         ...,
         [0.00045349, 0.00012243, 0.00097181, ..., 0.        , 0.        ,
          0.        ],
         [0.00715307, 0.00882841, 0.00608888, ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.02104363, 0.01509153, ..., 0.        , 0.        ,
          0.        ]]),
 (13024, 22222))

In [32]:
# Per-user item ranking, best score first: ascending argsort along the item axis,
# then the columns read in reverse order.
np.argsort(predicted_ratings.todense(), axis=1)[:, ::-1]

matrix([[    1,     2,     0, ..., 14284, 14285, 11110],
        [    1,     2,     3, ..., 14245, 14246,     0],
        [    0,     8,     6, ..., 14764, 14765, 11110],
        ...,
        [  808,  1673,    30, ..., 14787, 14788, 11110],
        [    8,     1,    10, ..., 14489, 14490, 11110],
        [    1,     3,     2, ..., 14166, 14167,     0]], dtype=int64)

In [33]:
# Largest predicted score overall. The sparse matrix's own max() takes the implicit
# zeros into account, so the full user x item matrix need not be densified.
predicted_ratings.max()

0.20584114517549096

In [None]:
# matrix([[    2,     3,    14, ..., 11123, 11122, 11110],
#         [    0,     1,     3, ..., 14357, 14358, 11110],
#         [   10,     8,     1, ..., 14780, 14781, 11110],
#         ...,
#         [  808,  1673,     2, ..., 14797, 14798, 11110],
#         [    1,     0,     6, ..., 14538, 14539, 11110],
#         [    1,     3,     2, ..., 14213, 14214, 11110]], dtype=int64)