In [1]:
import numpy as np
import matplotlib.pyplot as pyplot
import pandas as pd
import scipy.sparse as sps
%matplotlib inline  
import time
%load_ext Cython
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample


In [2]:
# data_train_path="/kaggle/input/recommender-system-2023-challenge-polimi/data_train.csv"
# data_target_user_path="/kaggle/input/recommender-system-2023-challenge-polimi/data_target_users_test.csv"
data_train_path="data_train.csv"
data_target_user_path="data_target_users_test.csv"
data_train = pd.read_csv(data_train_path)
data_target = pd.read_csv(data_target_user_path)

In [3]:
URM_all =  data_train.pivot(index='row', columns='col', values='data').fillna(0)
item_map = {i : item for i, item in enumerate(URM_all.columns)}
user_map = {i : user for i, user in enumerate(data_target["user_id"])}
item_map_inv = {item : i for i, item in item_map.items()}
user_map_inv = {user : i for i, user in user_map.items()}
missing_index = [x for x in range(1,13025) if x not in URM_all.index.tolist()]
add_urm = pd.DataFrame(index = missing_index, columns = URM_all.columns).fillna(0)
URM_all = pd.concat([URM_all, add_urm]).sort_index()
del add_urm
del missing_index
#data_target["user_id"] = data_target["user_id"]
URM_all = URM_all.to_numpy()
URM_all = sps.csr_matrix(URM_all)
URM_all

<13024x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [4]:
URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.8)
n_users, n_items = URM_train.shape



In [5]:
# Parameters
num_factors = 10
learning_rate = 1e-4
regularization = 1e-5

In [6]:
user_factors = np.random.random((n_users, num_factors))
item_factors = np.random.random((n_items, num_factors))

In [10]:
%%cython
import numpy as np
import time

from libc.stdlib cimport rand, srand, RAND_MAX

def train_multiple_epochs(URM_train, learning_rate_input, regularization_input, n_epochs):
    
    URM_train_coo = URM_train.tocoo()
    n_users, n_items = URM_train_coo.shape
    cdef int n_interactions = URM_train.nnz
    
    cdef int sample_num, sample_index, user_id, item_id, factor_index
    cdef double rating, predicted_rating, prediction_error

    cdef int num_factors = 10
    cdef double learning_rate = learning_rate_input
    cdef double regularization = regularization_input
    
    cdef int[:] URM_train_coo_row = URM_train_coo.row
    cdef int[:] URM_train_coo_col = URM_train_coo.col
    cdef double[:] URM_train_coo_data = URM_train_coo.data

    cdef double[:,:] user_factors = np.random.random((n_users, num_factors))
    cdef double[:,:] item_factors = np.random.random((n_items, num_factors))
    cdef double H_i, W_u
    cdef double item_factors_update, user_factors_update
                
    cdef double loss = 0.0
    cdef long start_time = time.time()
    
    for n_epoch in range(n_epochs):

        loss = 0.0
        start_time = time.time()

        for sample_num in range(URM_train.nnz):

            # Randomly pick sample
            sample_index = rand() % n_interactions

            user_id = URM_train_coo_row[sample_index]
            item_id = URM_train_coo_col[sample_index]
            rating = URM_train_coo_data[sample_index]

            # Compute prediction
            predicted_rating = 0.0

            for factor_index in range(num_factors):
                predicted_rating += user_factors[user_id, factor_index] * item_factors[item_id, factor_index]

            # Compute prediction error, or gradient
            prediction_error = rating - predicted_rating
            loss += prediction_error**2

            # Copy original value to avoid messing up the updates
            for factor_index in range(num_factors):

                H_i = item_factors[item_id,factor_index]
                W_u = user_factors[user_id,factor_index]  

                user_factors_update = prediction_error * H_i - regularization * W_u
                item_factors_update = prediction_error * W_u - regularization * H_i

                user_factors[user_id,factor_index] += learning_rate * user_factors_update 
                item_factors[item_id,factor_index] += learning_rate * item_factors_update    
            
        elapsed_time = time.time() - start_time
        samples_per_second = sample_num/elapsed_time
     
        print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}".format(n_epoch+1, time.time() - start_time, loss/sample_num, samples_per_second))

    return np.array(user_factors), np.array(item_factors), loss, samples_per_second    

Content of stdout:
_cython_magic_cc5d14dfcd83a8341b25d111fec1e25792178258.c
   Creating library C:\Users\feder\.ipython\cython\Users\feder\.ipython\cython\_cython_magic_cc5d14dfcd83a8341b25d111fec1e25792178258.cp311-win_amd64.lib and object C:\Users\feder\.ipython\cython\Users\feder\.ipython\cython\_cython_magic_cc5d14dfcd83a8341b25d111fec1e25792178258.cp311-win_amd64.exp
Generating code
Finished generating code

In [43]:
URM_train_coo = URM_train.tocoo()
n_items = URM_train.shape[1]
# learning_rate = 1e-3
# regularization = 1e-5
    
user_factors, item_factors, loss, samples_per_second =  train_multiple_epochs(URM_train, learning_rate, regularization, 10)

Epoch 1 complete in in 0.61 seconds, loss is 2.189E+00. Samples per second 627879.62
Epoch 2 complete in in 0.68 seconds, loss is 1.464E+00. Samples per second 564472.73
Epoch 3 complete in in 0.75 seconds, loss is 1.080E+00. Samples per second 512010.08
Epoch 4 complete in in 0.82 seconds, loss is 8.489E-01. Samples per second 466481.47
Epoch 5 complete in in 0.90 seconds, loss is 6.924E-01. Samples per second 427187.37
Epoch 6 complete in in 0.98 seconds, loss is 5.827E-01. Samples per second 391721.57
Epoch 7 complete in in 1.07 seconds, loss is 4.972E-01. Samples per second 356855.24
Epoch 8 complete in in 0.15 seconds, loss is 4.351E-01. Samples per second 2523849.83
Epoch 9 complete in in 0.23 seconds, loss is 3.843E-01. Samples per second 1659727.28
Epoch 10 complete in in 0.31 seconds, loss is 3.460E-01. Samples per second 1218674.56


In [44]:
predicted_ratings = np.dot(user_factors, item_factors.T)
predicted_ratings

array([[0.68128018, 1.21804642, 1.37793618, ..., 2.32866806, 1.78345187,
        1.41119735],
       [0.79221499, 0.8352394 , 1.05985302, ..., 1.17768739, 1.70447816,
        1.24540483],
       [0.56441693, 0.68698047, 0.73337364, ..., 1.38481651, 1.32442867,
        0.82690279],
       ...,
       [1.18864977, 0.97073095, 1.1060913 , ..., 1.10637618, 1.73921501,
        1.10302115],
       [1.04627693, 0.91649331, 1.3881368 , ..., 1.99714265, 1.63572175,
        1.85892543],
       [1.61489082, 1.26187279, 1.63519443, ..., 2.70275227, 2.43960451,
        2.18028229]])

In [45]:
predicted_ratings = np.multiply(predicted_ratings ,(URM_train.todense()==0))
predicted_ratings

matrix([[0.68128018, 1.21804642, 1.37793618, ..., 2.32866806, 1.78345187,
         1.41119735],
        [0.        , 0.8352394 , 1.05985302, ..., 1.17768739, 1.70447816,
         1.24540483],
        [0.56441693, 0.68698047, 0.73337364, ..., 1.38481651, 1.32442867,
         0.82690279],
        ...,
        [1.18864977, 0.97073095, 1.1060913 , ..., 1.10637618, 1.73921501,
         1.10302115],
        [1.04627693, 0.91649331, 1.3881368 , ..., 1.99714265, 1.63572175,
         1.85892543],
        [0.        , 1.26187279, 1.63519443, ..., 2.70275227, 2.43960451,
         2.18028229]])

In [58]:
suggestions = np.argsort(predicted_ratings, axis=1)[:,::-1][:,:10]
sueggestions_values = np.sort(predicted_ratings, axis=1)[:,::-1][:,:10]
suggestions, sueggestions_values

(matrix([[ 9397, 12022, 11490, ...,  1670, 12166, 11202],
         [10188, 19493, 21395, ...,  5450, 17517, 21182],
         [18868, 15914, 17402, ..., 22008, 11129, 21873],
         ...,
         [ 6919, 15889, 19582, ..., 13581, 18160,  9433],
         [17265, 18868,  8261, ...,  6337, 19582,  2423],
         [17981, 10290, 13581, ...,  7730,  9397,  6166]], dtype=int64),
 matrix([[3.4035185 , 3.39925413, 3.38049523, ..., 3.2948075 , 3.26846954,
          3.2662502 ],
         [2.69921001, 2.65681272, 2.62755849, ..., 2.58786391, 2.57015307,
          2.56421895],
         [2.32740105, 2.29179789, 2.28667641, ..., 2.26090991, 2.2575797 ,
          2.24489548],
         ...,
         [2.66839713, 2.65204001, 2.63260686, ..., 2.59270286, 2.5846774 ,
          2.58294429],
         [3.18658799, 3.1825936 , 3.18235254, ..., 3.11312614, 3.11198123,
          3.10836858],
         [4.80950782, 4.70853758, 4.62950825, ..., 4.56507523, 4.53463912,
          4.51357563]]))

In [59]:
suggestions_df = pd.DataFrame(columns=["user_id", "item_list", "item_values"])
suggestions_df["user_id"] = range(1,n_users+1)
suggestions_df["item_list"] = suggestions.tolist()
suggestions_df["item_values"] = sueggestions_values.tolist()
suggestions_df["item_list"] = suggestions_df["item_list"].apply(lambda x: " ".join(str(item_map[item]) for item in x))
suggestions_df.to_csv("outputs/funk_SVD.csv", index=False)

In [60]:
suggestions_df

Unnamed: 0,user_id,item_list,item_values
0,1,9404 12032 11500 19323 6874 17209 21926 1671 1...,"[3.4035184978456443, 3.3992541318331613, 3.380..."
1,2,10195 19564 21508 21699 6723 11587 18039 5452 ...,"[2.6992100136122787, 2.6568127177111105, 2.627..."
2,3,18935 15951 17452 15500 19564 15694 14752 2213...,"[2.3274010461053654, 2.291797893511126, 2.2866..."
3,4,21699 16763 7128 12016 8446 13729 15102 6267 1...,"[3.2221121802951713, 3.139639053749286, 3.0831..."
4,5,13817 18428 19734 9075 6880 6182 19574 15639 1...,"[2.5792848723608217, 2.419613464956564, 2.3935..."
...,...,...,...
13019,13020,6840 8913 17033 15926 17717 21699 9440 15500 2...,"[3.9487642127083906, 3.9261493931045366, 3.920..."
13020,13021,21699 18039 19654 12016 13600 8266 6267 6339 1...,"[4.993896243689156, 4.959149720621143, 4.93284..."
13021,13022,6922 15926 19654 8913 21699 12016 6840 13600 1...,"[2.6683971295666997, 2.652040012851108, 2.6326..."
13022,13023,17311 18935 8266 20402 13607 6022 11920 6339 1...,"[3.18658799104698, 3.1825936045670082, 3.18235..."
