# Product Recommendation Sparse Preprocessing
Reference: [https://ieeexplore.ieee.org/document/5430993](https://www2.seas.gwu.edu/~simhaweb/champalg/cf/papers/wroberts.pdf)

In [None]:
!pip install -r requirements.txt --use-feature=2020-resolver

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.ops.linalg.sparse.sparse_csr_matrix_ops import *
from tensorflow.raw_ops import SparseMatrixAdd, SparseMatrixMatMul, SparseMatrixSparseMatMul, SparseMatrixZeros
from tqdm import tqdm

In [None]:
# List the GPUs TensorFlow can see, then show the TensorFlow version as the cell output.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print(gpu_devices)
tf.__version__

In [None]:
# Turn on memory growth for every visible GPU.
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
for device in visible_gpus:
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
# Shell escape: print the output of `nvidia-smi` for the machine running the kernel.
!nvidia-smi

In [None]:
# Shell escape: print the output of `lscpu` for the machine running the kernel.
!lscpu

## Data Preprocessing

In [None]:
%%time
# Y_data = pd.read_csv('data/Y.csv', header=None, names=['Rating','Movie','User'], dtype=np.int32) # training data
# P_data = pd.read_csv('data/P.csv', header=None, names=['Rating','Movie','User'], dtype=np.int32) # test data ('probe-set' mentioned in paper)

Y_data = pd.read_csv('data/Y_full.csv', header=None, names=['Rating','Movie','User'], dtype=np.int32) # training data
P_data = pd.read_csv('data/P_full.csv', header=None, names=['Rating','Movie','User'], dtype=np.int32) # test data ('probe-set' mentioned in paper)

In [None]:
# Show the first rows of the training frame followed by the first rows of the probe frame.
display(Y_data.head(), P_data.head())

In [None]:
# (rows, columns) of the training data and of the probe data.
Y_data.shape, P_data.shape

In [None]:
# Largest rating, movie id and user id: first line for the training data, second for the probe data.
for ratings_frame in (Y_data, P_data):
    print(*(ratings_frame[column].max() for column in ('Rating', 'Movie', 'User')))

In [None]:
# k: largest movie id in the training data; n: largest user id in the training data.
k = Y_data['Movie'].max()
n = Y_data['User'].max()
(k, n)

In [None]:
def generate_indices_pair_list(data):
    """Locate the contiguous block of rows belonging to each user.

    Parameters
    ----------
    data : 2-D array-like
        Rows whose third column (index 2) holds the user id. Rows of the same
        user are expected to be adjacent; every run of equal ids yields one entry.

    Returns
    -------
    list of (user_id, index_beginning, index_ending)
        One tuple of Python ints per run, in row order. Both indices are
        inclusive row positions. An input without rows yields an empty list.

    Notes
    -----
    The user id is read from the data itself, so ids with gaps (e.g. 1, 3, 5)
    or data that does not start at user 1 are labelled correctly. The scan is
    vectorised with NumPy, so no per-row Python loop or progress bar is needed.
    """
    user_ids = np.asarray(data)[:, 2]
    row_count = user_ids.shape[0]
    if row_count == 0:
        return []

    # A run starts at row 0 and at every row whose user id differs from the previous row's.
    run_starts = np.concatenate(([0], np.flatnonzero(user_ids[1:] != user_ids[:-1]) + 1))
    # Each run ends right before the next one starts; the last run ends at the final row.
    run_ends = np.concatenate((run_starts[1:] - 1, [row_count - 1]))

    return [
        (int(user_id), int(index_beginning), int(index_ending))
        for user_id, index_beginning, index_ending in zip(user_ids[run_starts], run_starts, run_ends)
    ]

In [None]:
# Work on plain NumPy arrays from here on. np.asarray accepts a DataFrame as well as an
# already-converted array, so re-running this cell does not fail on a missing `.values`.
Y_data = np.asarray(Y_data)
P_data = np.asarray(P_data)

# Per-user (user_id, first_row, last_row) segments of the training and the probe data.
indices_pair_list_Y_data = generate_indices_pair_list(Y_data)
indices_pair_list_P_data = generate_indices_pair_list(P_data)
# True when both data sets contain the same number of user segments.
len(indices_pair_list_Y_data) == len(indices_pair_list_P_data)

In [None]:
# Look-up table: user id -> (first row, last row) of that user's segment in P_data.
user_id_indices_pair_dict_P_data = {
    user: (first_row, last_row)
    for user, first_row, last_row in indices_pair_list_P_data
}

In [None]:
def _float_feature(tensor):
    """Wrap the flattened values of `tensor` in a float-list `tf.train.Feature`.

    `tensor` is either a NumPy array or an object exposing `.numpy()`
    (e.g. an eager TensorFlow tensor); its shape is discarded by the flattening.
    """
    array = tensor if isinstance(tensor, np.ndarray) else tensor.numpy()
    return tf.train.Feature(float_list=tf.train.FloatList(value=array.flatten()))

def _sparse_feature(sparse_tensor):
    """Serialize `sparse_tensor` with `tf.io.serialize_sparse` into a bytes-list `tf.train.Feature`."""
    serialized = tf.io.serialize_sparse(sparse_tensor)
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=serialized.numpy()))

# Placeholder stored in both probe-set features when a user has no rows in P_data.
MISSING_P_DATA_PLACEHOLDER = 999.999


def _selection_tensor(column_ids):
    """Return the sparse 0/1 matrix of shape [len(column_ids), k] whose i-th row has a one in column column_ids[i]."""
    coordinates = np.vstack((np.arange(column_ids.shape[0]), column_ids)).T
    return tf.SparseTensor(
        indices=coordinates,
        values=np.ones(coordinates.shape[0], dtype=np.float32),
        dense_shape=[coordinates.shape[0], k],
    )


def _to_csr(sparse_tensor):
    """Convert a `tf.SparseTensor` into a CSR sparse matrix."""
    return sparse_tensor_to_csr_sparse_matrix(
        indices=sparse_tensor.indices,
        values=sparse_tensor.values,
        dense_shape=sparse_tensor.dense_shape,
    )


# Write one tf.train.Example per training user t with the features
#   H_yt_st            : serialized sparse selector of the movies rated by t
#   H_xt_st            : serialized sparse selector of the movies not rated by t
#   y_t                : ratings of t, computed as H_yt * z_t
#   k_t                : number of ratings of t
#   movie_ids_t_P_data : probe-set movie ids of t (placeholder if t has no probe rows)
#   ratings_t_P_data   : probe-set ratings of t  (placeholder if t has no probe rows)
filename = 'data/data_preprocessed.tfrecord'
with tf.io.TFRecordWriter(filename) as writer:
    for t, first_row, last_row in tqdm(indices_pair_list_Y_data, total=len(indices_pair_list_Y_data)):
        Y_data_t = Y_data[first_row:last_row + 1, :]

        # Movie ids are shifted by one to become zero-based column indices.
        rated_columns = (Y_data_t[:, 1] - 1).astype(np.int64)
        unrated_columns = np.setdiff1d(np.arange(k, dtype=np.int64), rated_columns)

        H_yt_st = _selection_tensor(rated_columns)
        H_xt_st = _selection_tensor(unrated_columns)

        # z_t: k x 1 sparse column holding the user's rating at each rated movie's row.
        z_t_st = tf.SparseTensor(
            indices=np.vstack((rated_columns, np.zeros(rated_columns.shape[0], dtype=np.int64))).T,
            values=Y_data_t[:, 0].astype(np.float32),
            dense_shape=[k, 1],
        )

        # Only H_yt and z_t are needed in CSR form; H_xt is stored but never multiplied here,
        # so it is not converted.
        y_t_sm = SparseMatrixSparseMatMul(a=_to_csr(H_yt_st), b=_to_csr(z_t_st), type=tf.float32)
        y_t = csr_sparse_matrix_to_dense(y_t_sm, tf.float32)
        k_t = tf.constant(rated_columns.shape[0], dtype=tf.float32)

        if t in user_id_indices_pair_dict_P_data:
            index_beginning, index_ending = user_id_indices_pair_dict_P_data[t]
            P_data_t = P_data[index_beginning:index_ending + 1, :]
            movie_ids_t_P_data_feature = _float_feature(P_data_t[:, 1])
            # _float_feature flattens its input, so the ratings are passed as a 1-D array.
            ratings_t_P_data_feature = _float_feature(P_data_t[:, 0].astype(np.float32))
        else:
            movie_ids_t_P_data_feature = _float_feature(np.array([MISSING_P_DATA_PLACEHOLDER]))
            ratings_t_P_data_feature = _float_feature(np.array([MISSING_P_DATA_PLACEHOLDER]))

        feature = {
            'H_yt_st': _sparse_feature(H_yt_st),
            'H_xt_st': _sparse_feature(H_xt_st),
            'y_t': _float_feature(y_t),
            'k_t': _float_feature(k_t),
            'movie_ids_t_P_data': movie_ids_t_P_data_feature,
            'ratings_t_P_data': ratings_t_P_data_feature,
        }
        writer.write(tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString())

In [None]:
# Drop the references to the raw rating arrays; they are not used below.
del Y_data, P_data

## Initialization

One type of initial estimate is available for $\mu$:

$N = \sum_{t=1}^{n}H_{y_t}'H_{y_t}$

$\hat{\mu}^0 = N^{-1}\sum_{t=1}^{n}H_{y_t}'y_{t}$

In [None]:
# initial estimate of mu
N_sm = SparseMatrixZeros(dense_shape=(k, k), type=tf.float32)
H_yty_t = 0

for features_parsed in tqdm(data_preprocessed, total=len(indices_pair_list_Y_data)):
    H_yt_st, y_t = features_parsed['H_yt_st'], features_parsed['y_t']
    H_yt_sm = sparse_tensor_to_csr_sparse_matrix(indices=H_yt_st.indices, values=H_yt_st.values, dense_shape=H_yt_st.dense_shape)
    
    N_sm = SparseMatrixAdd(a=N_sm, b=SparseMatrixSparseMatMul(a=H_yt_sm, b=H_yt_sm, type=tf.float32, transpose_a=True), alpha=1.0, beta=1.0)
    H_yty_t += SparseMatrixMatMul(a=H_yt_sm, b=y_t, transpose_a=True)

In [None]:
# The ith diagonal element of N equals the total number of ratings of the ith product.
N_inv = tf.linalg.inv(csr_sparse_matrix_to_dense(N_sm, tf.float32))
mu_hat0 = tf.matmul(N_inv, H_yty_t)
tf.transpose(mu_hat0)

Four types of initial estimate are available for $R$:

$R_{1} = I$

$R_{2} = N^{-1}diag(S)$

$R_{3} = diag(S)^{-1/2}Sdiag(S)^{-1/2}$

$R_{4} = N^{-1/2}SN^{-1/2}$

where $S = \sum_{t=1}^{n}H_{y_{t}}'(y_t - H_{y_{t}}\hat{\mu}^0)(y_t - H_{y_{t}}\hat{\mu}^0)'H_{y_{t}}$

In [None]:
# initial estimates of R (4 types available)
R_hat0_1 = tf.eye(k, dtype=tf.float32)
R_hat0_1

In [None]:
# Accumulate S = sum_t H_yt' (y_t - H_yt mu_hat0) (y_t - H_yt mu_hat0)' H_yt over all users.
# `data_preprocessed` is expected to be the iterable of parsed per-user records
# (each with 'H_yt_st' and 'y_t' entries) bound by an earlier cell.
S = 0
for features_parsed in tqdm(data_preprocessed, total=len(indices_pair_list_Y_data)):
    H_yt_st, y_t = features_parsed['H_yt_st'], features_parsed['y_t']
    H_yt_sm = sparse_tensor_to_csr_sparse_matrix(indices=H_yt_st.indices, values=H_yt_st.values, dense_shape=H_yt_st.dense_shape)

    # Residual of the user's ratings around the initial mean, mapped back onto all products.
    Hytmu_hat0 = SparseMatrixMatMul(a=H_yt_sm, b=mu_hat0)
    intermediate_result = SparseMatrixMatMul(a=H_yt_sm, b=y_t - Hytmu_hat0, transpose_a=True)
    # Outer product of the mapped residual with itself.
    S += tf.matmul(intermediate_result, intermediate_result, transpose_b=True)

In [None]:
# diag_S is the diagonal matrix consisting of the diagonal elements of S
diag_S = tf.linalg.diag(tf.linalg.tensor_diag_part(S))
R_hat0_2 = tf.matmul(N_inv, diag_S)
R_hat0_2

In [None]:
# R_hat0_3 is not a good initializer when rating variances are far from one
diag_S_inv_sqrtm = tf.linalg.sqrtm(tf.linalg.inv(diag_S))
R_hat0_3 = diag_S_inv_sqrtm @ S @ diag_S_inv_sqrtm
R_hat0_3

In [None]:
# Type 4: R_4 = N^{-1/2} S N^{-1/2}; the square root of N^{-1} is computed once and reused.
N_inv_sqrtm = tf.linalg.sqrtm(N_inv)
R_hat0_4 = tf.matmul(tf.matmul(N_inv_sqrtm, S), N_inv_sqrtm)
R_hat0_4

In [None]:
# Persist the initial mean estimate and the type-4 initial estimate of R as .npy files.
for npy_path, estimate in (('mu_hat0.npy', mu_hat0), ('R_hat0_4.npy', R_hat0_4)):
    np.save(npy_path, estimate)