# Product Recommendation
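Reference: https://ieeexplore.ieee.org/document/5430993

This notebook estimates the mean vector $\mu$ and covariance matrix $R$ of movie ratings from incomplete rating data with two methods (Expectation Maximization and McMichael's algorithm), then compares the RMSE of their rating predictions on a held-out probe set.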

In [1]:
import numexpr as ne
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

In [2]:
# Show the TensorFlow version in use so the outputs below can be tied to it.
tf.__version__

'2.2.0'

## Data Preprocessing

In [3]:
# Y.csv holds the training data; P.csv holds the test data (the 'probe-set' mentioned in the paper).
# Both files are read without a header row, so the column names are supplied here,
# and every column is loaded as int32.
Y_data, P_data = (
    pd.read_csv(csv_path, header=None, names=['Rating', 'Movie', 'User'], dtype=np.int32)
    for csv_path in ('data/Y.csv', 'data/P.csv')
)

In [4]:
# Preview the first rows of the training and probe sets, then report their (rows, columns) sizes.
display(Y_data.head(), P_data.head())
(Y_data.shape, P_data.shape)

Unnamed: 0,Rating,Movie,User
0,5,2,1
1,4,7,1
2,4,8,1
3,4,11,1
4,4,12,1


Unnamed: 0,Rating,Movie,User
0,3,6,1
1,5,96,1
2,3,1,2
3,3,33,3
4,5,42,4


((3399874, 3), (189699, 3))

In [5]:
# Largest rating, movie id and user id in the training set (first line) and the probe set (second line).
print(*(Y_data[column].unique().max() for column in ('Rating', 'Movie', 'User')))
print(*(P_data[column].unique().max() for column in ('Rating', 'Movie', 'User')))

5 100 137328
5 100 137328


In [6]:
# k: largest movie id, n: largest user id in the training set.
# Ids are shifted by -1 when used as indices below, so these double as the matrix dimensions.
k = Y_data['Movie'].unique().max()
n = Y_data['User'].unique().max()
(k, n)

(100, 137328)

In [7]:
# k x n rating matrix in sparse form: row = movie id - 1, column = user id - 1, value = rating,
# converted to float64 for the linear algebra below.
Z_sparse = tf.cast(
    tf.SparseTensor(
        indices=Y_data[['Movie', 'User']].values-1,
        values=Y_data['Rating'].values,
        dense_shape=[k, n],
    ),
    tf.float64,
)

In [8]:
# use dense matrices for faster linear transformations since all matrices can fit in memory
# validate_indices=False turns off TensorFlow's check that the sparse indices are sorted and free of repeats
# NOTE(review): the indices follow the row order of Y_data — confirm there are no duplicate (movie, user) pairs
Z = tf.sparse.to_dense(Z_sparse, validate_indices=False)
Z

<tf.Tensor: shape=(100, 137328), dtype=float64, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 3., 0., 0.],
       ...,
       [5., 0., 0., ..., 4., 0., 4.],
       [4., 0., 3., ..., 0., 0., 4.],
       [3., 4., 0., ..., 4., 5., 4.]])>

In [9]:
# memoization: per-user quantities keyed by the zero-based user index t (user id t+1)
t_k_dict = {}            # number of training ratings of the user (float64 scalar)
t_Z_dict = {}            # the user's column of Z, shape (k, 1)
t_y_dict = {}            # H_yt @ Z_t: the user's training ratings
t_x_dict = {}            # H_xt @ Z_t: the entries of Z_t for the remaining movies
t_Hy_dict = {}           # identity rows of the movies the user rated in the training set
t_Hx_dict = {}           # identity rows of all other movies
t_Hy_trans_dict = {}     # transpose of H_yt
t_Hx_trans_dict = {}     # transpose of H_xt
t_movie_ids_dict = {}    # movie ids of the user's probe ratings
t_labels_dict = {}       # the user's probe ratings, shape (m, 1)


def _rows_by_user(user_ids, n_users):
    """Group row positions by user id with one sort instead of one scan per user.

    Parameters
    ----------
    user_ids : 1-D integer array giving the user id of every row.
    n_users : number of users; the ids looked up are 1..n_users.

    Returns
    -------
    (order, starts, ends) such that ``order[starts[t]:ends[t]]`` lists the row
    positions of user ``t + 1`` in their original order (an empty slice when
    that user has no rows).
    """
    # a stable sort keeps each user's rows in file order, matching what a boolean mask selects
    order = np.argsort(user_ids, kind='stable')
    sorted_ids = user_ids[order]
    wanted_ids = np.arange(1, n_users + 1)
    starts = np.searchsorted(sorted_ids, wanted_ids, side='left')
    ends = np.searchsorted(sorted_ids, wanted_ids, side='right')
    return order, starts, ends


Y_data_user_ids = Y_data['User'].values
P_data_user_ids = P_data['User'].values
Y_movie_ids = Y_data['Movie'].values
P_movie_ids = P_data['Movie'].values
P_ratings = P_data['Rating'].values

Y_order, Y_starts, Y_ends = _rows_by_user(Y_data_user_ids, n)
P_order, P_starts, P_ends = _rows_by_user(P_data_user_ids, n)

# built once; fancy indexing and np.delete both return new arrays, so it is never modified
identity_k = np.identity(k)

for t in tqdm(range(n)):
    y_rows = Y_order[Y_starts[t]:Y_ends[t]]
    movie_ids_indices = Y_movie_ids[y_rows] - 1
    H_yt = tf.constant(identity_k[movie_ids_indices], dtype=tf.float64)
    H_xt = tf.constant(np.delete(identity_k, movie_ids_indices, 0), dtype=tf.float64)

    k_t = tf.constant(H_yt.shape[0], dtype=tf.float64)
    Z_t = tf.expand_dims(Z[:, t], axis=1)
    y_t = tf.matmul(H_yt, Z_t)
    x_t = tf.matmul(H_xt, Z_t)

    # store the variables for fast future reference
    t_Hy_dict[t] = H_yt
    t_Hx_dict[t] = H_xt
    t_Hx_trans_dict[t] = tf.transpose(H_xt)
    t_Hy_trans_dict[t] = tf.transpose(H_yt)

    t_k_dict[t] = k_t
    t_x_dict[t] = x_t
    t_y_dict[t] = y_t
    t_Z_dict[t] = Z_t

    # the probe rows of this user are looked up once and reused for both ids and labels
    p_rows = P_order[P_starts[t]:P_ends[t]]
    t_movie_ids_dict[t] = P_movie_ids[p_rows]
    t_labels_dict[t] = tf.expand_dims(P_ratings[p_rows], axis=1)

# free the raw data and the lookup arrays; only the memoized dictionaries are needed from here on
del Y_data
del P_data
del Z_sparse
del Y_data_user_ids
del P_data_user_ids
del Y_movie_ids, P_movie_ids, P_ratings
del Y_order, Y_starts, Y_ends
del P_order, P_starts, P_ends

100%|██████████| 137328/137328 [10:42<00:00, 213.70it/s]


## Initialization
One initial estimate is computed for $\mu$. <br />
Four alternative initial estimates are computed for $R$ (`R_hat0_1` through `R_hat0_4`).

In [10]:
# initial estimate of mu: accumulate over all users the two sums it is built from,
#   N       = sum_t H_yt^T H_yt
#   H_yty_t = sum_t H_yt^T y_t
N = 0
H_yty_t = 0

for t in tqdm(range(n)):
    N = N + t_Hy_trans_dict[t] @ t_Hy_dict[t]
    H_yty_t = H_yty_t + t_Hy_trans_dict[t] @ t_y_dict[t]

100%|██████████| 137328/137328 [00:17<00:00, 8070.88it/s] 


In [11]:
# The ith diagonal element of N equals the total number of ratings of the ith product.
N

<tf.Tensor: shape=(100, 100), dtype=float64, numpy=
array([[20017.,     0.,     0., ...,     0.,     0.,     0.],
       [    0., 23917.,     0., ...,     0.,     0.,     0.],
       [    0.,     0., 31634., ...,     0.,     0.,     0.],
       ...,
       [    0.,     0.,     0., ..., 60896.,     0.,     0.],
       [    0.,     0.,     0., ...,     0., 61521.,     0.],
       [    0.,     0.,     0., ...,     0.,     0., 64506.]])>

In [12]:
# mu_hat0 = N^{-1} (sum_t H_yt^T y_t); displayed transposed as a single row
mu_hat0 = tf.linalg.inv(N) @ H_yty_t
tf.transpose(mu_hat0)

<tf.Tensor: shape=(1, 100), dtype=float64, numpy=
array([[3.45266523, 3.57674457, 3.28788645, 3.90478757, 3.79035475,
        3.44415598, 3.19071562, 4.52835008, 3.82013753, 3.6159503 ,
        3.40382731, 3.83725101, 4.07603884, 4.22836664, 3.35395465,
        4.0645276 , 3.72119599, 3.48700861, 4.16388921, 3.40982441,
        3.86926003, 3.43583485, 3.20324443, 4.08487897, 3.23199846,
        3.88664794, 4.33189497, 4.38358165, 4.31638739, 3.86591733,
        4.33975717, 3.89147883, 3.70029269, 3.36247781, 4.32901523,
        4.06706884, 4.56922029, 3.77104091, 3.68586682, 3.84532386,
        4.3454114 , 3.90999207, 3.39949928, 3.60786807, 3.96267104,
        4.14386102, 3.4072049 , 3.7040225 , 4.00350359, 4.64280228,
        3.21623279, 3.77238583, 4.26565116, 4.45377313, 3.83848945,
        3.79374176, 3.7629172 , 3.88698608, 3.80041727, 4.34696995,
        3.80469565, 3.84624795, 3.64122601, 3.27221683, 3.42333499,
        3.71631568, 3.20698918, 4.45410441, 4.26541296, 3.86109184

In [13]:
# initial estimates of R (4 types available)
R_hat0_1 = tf.constant(np.identity(k), dtype=tf.float64)
R_hat0_1

<tf.Tensor: shape=(100, 100), dtype=float64, numpy=
array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])>

In [14]:
# S = sum_t H_yt^T (y_t - H_yt mu_hat0) (y_t - H_yt mu_hat0)^T H_yt
S = 0
for t in tqdm(range(n)):
    Hyt = t_Hy_dict[t]
    yt = t_y_dict[t]
    Hytmu_hat0 = Hyt @ mu_hat0
    centered_yt = yt - Hytmu_hat0
    S = S + tf.transpose(Hyt) @ centered_yt @ tf.transpose(centered_yt) @ Hyt

100%|██████████| 137328/137328 [00:31<00:00, 4358.03it/s]


In [15]:
# type 2: N^{-1} diag_S, where diag_S keeps only the diagonal elements of S
S_diagonal = tf.linalg.tensor_diag_part(S)
diag_S = tf.linalg.diag(S_diagonal)
R_hat0_2 = tf.linalg.inv(N) @ diag_S
R_hat0_2

<tf.Tensor: shape=(100, 100), dtype=float64, numpy=
array([[1.72440427, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.94219113, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.43659411, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.18291506, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.03485685,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.26227449]])>

In [16]:
# type 3: diag_S^{-1/2} S diag_S^{-1/2}
# (R_hat0_3 is not a good initializer when rating variances are far from one)
diag_S_inv_sqrt = tf.linalg.sqrtm(tf.linalg.inv(diag_S))
R_hat0_3 = diag_S_inv_sqrt @ (S @ diag_S_inv_sqrt)
R_hat0_3

<tf.Tensor: shape=(100, 100), dtype=float64, numpy=
array([[ 1.        ,  0.07418256, -0.01158277, ..., -0.01462987,
        -0.02215371, -0.01844816],
       [ 0.07418256,  1.        ,  0.03674347, ...,  0.0256191 ,
         0.03563234,  0.03926307],
       [-0.01158277,  0.03674347,  1.        , ...,  0.10955311,
         0.12823359,  0.15560634],
       ...,
       [-0.01462987,  0.0256191 ,  0.10955311, ...,  1.        ,
         0.19781317,  0.15164928],
       [-0.02215371,  0.03563234,  0.12823359, ...,  0.19781317,
         1.        ,  0.18995689],
       [-0.01844816,  0.03926307,  0.15560634, ...,  0.15164928,
         0.18995689,  1.        ]])>

In [17]:
# type 4: N^{-1/2} S N^{-1/2}
N_inv_sqrt = tf.linalg.sqrtm(tf.linalg.inv(N))
R_hat0_4 = N_inv_sqrt @ (S @ N_inv_sqrt)
R_hat0_4

<tf.Tensor: shape=(100, 100), dtype=float64, numpy=
array([[ 1.72440427,  0.09455639, -0.01823052, ..., -0.02089473,
        -0.02959417, -0.02721758],
       [ 0.09455639,  0.94219113,  0.04274809, ...,  0.02704644,
         0.03518471,  0.04281842],
       [-0.01823052,  0.04274809,  1.43659411, ...,  0.14281326,
         0.15635399,  0.20954206],
       ...,
       [-0.02089473,  0.02704644,  0.14281326, ...,  1.18291506,
         0.21886288,  0.18530794],
       [-0.02959417,  0.03518471,  0.15635399, ...,  0.21886288,
         1.03485685,  0.21710614],
       [-0.02721758,  0.04281842,  0.20954206, ...,  0.18530794,
         0.21710614,  1.26227449]])>

## Expectation Maximization Algorithm

In [18]:
# log(2*pi) as a float64 tensor; used in the log-likelihood term below
LOG_2PI = tf.math.log(tf.constant(2 * np.pi, dtype=tf.float64))

@tf.function(experimental_relax_shapes=True)
def run_graph_em(mu, R, y_t, H_xt, H_xt_trans, H_yt, H_yt_trans, k_t):
    """Compute one user's contributions to a single EM iteration.

    Args:
        mu: current mean estimate.
        R: current covariance estimate.
        y_t: the user's observed ratings.
        H_xt, H_xt_trans: matrix applied to mu/R for the unobserved part, and its transpose.
        H_yt, H_yt_trans: matrix applied to mu/R for the observed part, and its transpose.
        k_t: multiplier of LOG_2PI in the log-likelihood term
            (presumably the number of observed ratings; verify against caller).

    Returns:
        A 4-tuple, in this order:
        the summand for the covariance update,
        H_yt^T R_yt^{-1} H_yt and H_yt^T R_yt^{-1} y_t (summands for the mean update),
        and this user's log-likelihood term under the input (mu, R).
    """
    # mean and covariance restricted to the observed (y) and unobserved (x) parts
    mu_obs = H_yt @ mu
    mu_mis = H_xt @ mu
    cov_obs = H_yt @ R @ H_yt_trans
    cov_mis = H_xt @ R @ H_xt_trans
    cov_mis_obs = H_xt @ R @ H_yt_trans

    cov_obs_det = tf.linalg.det(cov_obs)
    cov_obs_inv = tf.linalg.inv(cov_obs)
    residual = y_t - mu_obs

    # for R estimation: fill in the unobserved part, then rebuild the full-length vector
    gain = cov_mis_obs @ cov_obs_inv
    X_t_hat = gain @ residual + mu_mis
    Z_t_hat = H_yt_trans @ y_t + H_xt_trans @ X_t_hat

    centered = Z_t_hat - mu
    remaining_cov = cov_mis - gain @ tf.transpose(cov_mis_obs)
    R_hat_sum_part = centered @ tf.transpose(centered) + H_xt_trans @ remaining_cov @ H_xt

    # for mu estimation
    Hyt_trans_Ryt_inv = H_yt_trans @ cov_obs_inv
    Hyt_trans_Ryt_inv_Hyt_sum_part = Hyt_trans_Ryt_inv @ H_yt
    Hyt_trans_Ryt_inv_yt_sum_part = Hyt_trans_Ryt_inv @ y_t

    # for log likelihood calculation
    quadratic_form = tf.transpose(residual) @ cov_obs_inv @ residual
    log_p_hat_part = -0.5 * (tf.math.log(cov_obs_det) + quadratic_form + k_t * LOG_2PI)

    return R_hat_sum_part, Hyt_trans_Ryt_inv_Hyt_sum_part, Hyt_trans_Ryt_inv_yt_sum_part, log_p_hat_part

In [19]:
def expectation_maximization(mu, R):
    """Run one EM iteration over all n users.

    Args:
        mu: current mean estimate.
        R: current covariance estimate.

    Returns:
        (mu_hat, R_hat, log_p_hat): the updated mean, the updated covariance,
        and the summed log-likelihood terms returned by run_graph_em for the
        input (mu, R).
    """
    R_hat_sum = 0
    info_matrix_sum = 0   # running sum of H_yt^T R_yt^{-1} H_yt
    info_vector_sum = 0   # running sum of H_yt^T R_yt^{-1} y_t
    log_p_hat = 0

    for t in tqdm(range(n)):
        R_part, info_matrix_part, info_vector_part, log_p_part = run_graph_em(
            mu,
            R,
            t_y_dict[t],
            t_Hx_dict[t],
            t_Hx_trans_dict[t],
            t_Hy_dict[t],
            t_Hy_trans_dict[t],
            t_k_dict[t],
        )

        R_hat_sum = R_hat_sum + R_part
        info_matrix_sum = info_matrix_sum + info_matrix_part
        info_vector_sum = info_vector_sum + info_vector_part
        log_p_hat = log_p_hat + log_p_part

    mu_hat = tf.linalg.inv(info_matrix_sum) @ info_vector_sum
    R_hat = R_hat_sum / n
    return mu_hat, R_hat, log_p_hat

In [20]:
# stop once the per-user log-likelihood improves by less than delta
delta = 0.0005
# start from the initial mean estimate and the fourth initial covariance estimate
mu = mu_hat0
R = R_hat0_4
log_p = tf.constant(-np.inf, dtype=tf.float64)

for i in range(30):
    if not i % 5:
        print(f'iteration: {i}')

    mu_hat, R_hat, log_p_hat = expectation_maximization(mu, R)

    normalized_log_p_hat = log_p_hat/n
    normalized_log_p = log_p/n
    convergence_gap = normalized_log_p_hat - normalized_log_p
    convergence_criterion = convergence_gap < delta

    print('normalized log_p_hat:', normalized_log_p_hat.numpy())
    print('normalized log_p:', normalized_log_p.numpy())
    print('convergence gap:', convergence_gap.numpy())

    if convergence_criterion:
        break

    # use new estimates for next iteration
    mu, R, log_p = mu_hat, R_hat, log_p_hat

  0%|          | 0/137328 [00:00<?, ?it/s]

iteration: 0


100%|██████████| 137328/137328 [01:49<00:00, 1258.18it/s]
  0%|          | 136/137328 [00:00<01:40, 1358.34it/s]

normalized log_p_hat: [[-32.23267267]]
normalized log_p: -inf
convergence gap: [[inf]]


100%|██████████| 137328/137328 [01:37<00:00, 1403.04it/s]
  0%|          | 149/137328 [00:00<01:32, 1486.11it/s]

normalized log_p_hat: [[-31.93570812]]
normalized log_p: [[-32.23267267]]
convergence gap: [[0.29696454]]


100%|██████████| 137328/137328 [01:33<00:00, 1473.69it/s]
  0%|          | 151/137328 [00:00<01:30, 1507.63it/s]

normalized log_p_hat: [[-31.78397177]]
normalized log_p: [[-31.93570812]]
convergence gap: [[0.15173636]]


100%|██████████| 137328/137328 [01:35<00:00, 1443.61it/s]
  0%|          | 133/137328 [00:00<01:43, 1322.96it/s]

normalized log_p_hat: [[-31.68821476]]
normalized log_p: [[-31.78397177]]
convergence gap: [[0.09575701]]


100%|██████████| 137328/137328 [01:37<00:00, 1413.79it/s]
  0%|          | 151/137328 [00:00<01:30, 1508.99it/s]

normalized log_p_hat: [[-31.62162646]]
normalized log_p: [[-31.68821476]]
convergence gap: [[0.0665883]]
iteration: 5


100%|██████████| 137328/137328 [01:29<00:00, 1534.95it/s]
  0%|          | 164/137328 [00:00<01:23, 1634.62it/s]

normalized log_p_hat: [[-31.57324325]]
normalized log_p: [[-31.62162646]]
convergence gap: [[0.04838321]]


100%|██████████| 137328/137328 [01:31<00:00, 1506.45it/s]
  0%|          | 160/137328 [00:00<01:26, 1593.04it/s]

normalized log_p_hat: [[-31.53722374]]
normalized log_p: [[-31.57324325]]
convergence gap: [[0.03601951]]


100%|██████████| 137328/137328 [01:27<00:00, 1574.22it/s]
  0%|          | 153/137328 [00:00<01:29, 1528.38it/s]

normalized log_p_hat: [[-31.50996466]]
normalized log_p: [[-31.53722374]]
convergence gap: [[0.02725908]]


100%|██████████| 137328/137328 [01:30<00:00, 1525.77it/s]
  0%|          | 118/137328 [00:00<01:56, 1173.33it/s]

normalized log_p_hat: [[-31.48908709]]
normalized log_p: [[-31.50996466]]
convergence gap: [[0.02087757]]


100%|██████████| 137328/137328 [01:53<00:00, 1209.76it/s]
  0%|          | 116/137328 [00:00<01:58, 1156.78it/s]

normalized log_p_hat: [[-31.47295385]]
normalized log_p: [[-31.48908709]]
convergence gap: [[0.01613325]]
iteration: 10


100%|██████████| 137328/137328 [01:52<00:00, 1220.27it/s]
  0%|          | 81/137328 [00:00<02:50, 805.18it/s]

normalized log_p_hat: [[-31.4604017]]
normalized log_p: [[-31.47295385]]
convergence gap: [[0.01255214]]


100%|██████████| 137328/137328 [01:46<00:00, 1294.82it/s]
  0%|          | 147/137328 [00:00<01:33, 1469.05it/s]

normalized log_p_hat: [[-31.4505828]]
normalized log_p: [[-31.4604017]]
convergence gap: [[0.0098189]]


100%|██████████| 137328/137328 [01:37<00:00, 1413.13it/s]
  0%|          | 124/137328 [00:00<01:51, 1233.76it/s]

normalized log_p_hat: [[-31.44286679]]
normalized log_p: [[-31.4505828]]
convergence gap: [[0.00771601]]


100%|██████████| 137328/137328 [01:30<00:00, 1520.02it/s]
  0%|          | 153/137328 [00:00<01:29, 1528.27it/s]

normalized log_p_hat: [[-31.43677819]]
normalized log_p: [[-31.44286679]]
convergence gap: [[0.00608859]]


100%|██████████| 137328/137328 [01:28<00:00, 1555.99it/s]
  0%|          | 163/137328 [00:00<01:24, 1627.33it/s]

normalized log_p_hat: [[-31.43195469]]
normalized log_p: [[-31.43677819]]
convergence gap: [[0.0048235]]
iteration: 15


100%|██████████| 137328/137328 [01:30<00:00, 1514.08it/s]
  0%|          | 159/137328 [00:00<01:26, 1583.54it/s]

normalized log_p_hat: [[-31.42811827]]
normalized log_p: [[-31.43195469]]
convergence gap: [[0.00383642]]


100%|██████████| 137328/137328 [01:28<00:00, 1551.73it/s]
  0%|          | 164/137328 [00:00<01:24, 1630.50it/s]

normalized log_p_hat: [[-31.42505464]]
normalized log_p: [[-31.42811827]]
convergence gap: [[0.00306364]]


100%|██████████| 137328/137328 [01:26<00:00, 1586.23it/s]
  0%|          | 154/137328 [00:00<01:29, 1529.48it/s]

normalized log_p_hat: [[-31.42259802]]
normalized log_p: [[-31.42505464]]
convergence gap: [[0.00245661]]


100%|██████████| 137328/137328 [01:38<00:00, 1396.03it/s]
  0%|          | 148/137328 [00:00<01:33, 1470.94it/s]

normalized log_p_hat: [[-31.42061987]]
normalized log_p: [[-31.42259802]]
convergence gap: [[0.00197815]]


100%|██████████| 137328/137328 [01:32<00:00, 1480.24it/s]
  0%|          | 165/137328 [00:00<01:23, 1646.33it/s]

normalized log_p_hat: [[-31.41902021]]
normalized log_p: [[-31.42061987]]
convergence gap: [[0.00159966]]
iteration: 20


100%|██████████| 137328/137328 [01:30<00:00, 1511.99it/s]
  0%|          | 132/137328 [00:00<01:44, 1311.62it/s]

normalized log_p_hat: [[-31.41772109]]
normalized log_p: [[-31.41902021]]
convergence gap: [[0.00129912]]


100%|██████████| 137328/137328 [01:32<00:00, 1491.43it/s]
  0%|          | 141/137328 [00:00<01:37, 1400.08it/s]

normalized log_p_hat: [[-31.41666156]]
normalized log_p: [[-31.41772109]]
convergence gap: [[0.00105952]]


100%|██████████| 137328/137328 [01:32<00:00, 1486.98it/s]
  0%|          | 158/137328 [00:00<01:27, 1572.04it/s]

normalized log_p_hat: [[-31.41579382]]
normalized log_p: [[-31.41666156]]
convergence gap: [[0.00086775]]


100%|██████████| 137328/137328 [01:29<00:00, 1542.02it/s]
  0%|          | 148/137328 [00:00<01:33, 1470.88it/s]

normalized log_p_hat: [[-31.41508021]]
normalized log_p: [[-31.41579382]]
convergence gap: [[0.00071361]]


100%|██████████| 137328/137328 [01:28<00:00, 1552.28it/s]
  0%|          | 157/137328 [00:00<01:27, 1564.99it/s]

normalized log_p_hat: [[-31.41449099]]
normalized log_p: [[-31.41508021]]
convergence gap: [[0.00058921]]
iteration: 25


100%|██████████| 137328/137328 [01:31<00:00, 1497.78it/s]

normalized log_p_hat: [[-31.41400258]]
normalized log_p: [[-31.41449099]]
convergence gap: [[0.00048841]]





In [21]:
# 26 iterations, ~38 min
# write the outputs of the last EM iteration to disk
for out_path, value in (
    ('results/em_mu.npy', mu_hat),
    ('results/em_R.npy', R_hat),
    ('results/em_log_p.npy', log_p_hat),
):
    np.save(out_path, value)

## McMichael’s Algorithm

In [22]:
@tf.function(experimental_relax_shapes=True)
def run_graph_mcmichael(mu, R, y_t, H_yt, H_yt_trans, k_t):
    """Compute one user's contributions to a single McMichael iteration.

    Args:
        mu: current mean estimate.
        R: current covariance estimate.
        y_t: the user's observed ratings.
        H_yt, H_yt_trans: matrix applied to mu/R for the observed part, and its transpose.
        k_t: multiplier of LOG_2PI in the log-likelihood term
            (presumably the number of observed ratings; verify against caller).

    Returns:
        A 4-tuple, in this order:
        H_yt^T (R_yt^{-1} - R_yt^{-1} r r^T R_yt^{-1}) H_yt with r = y_t - H_yt mu
        (summand for the covariance update),
        H_yt^T R_yt^{-1} H_yt and H_yt^T R_yt^{-1} y_t (summands for the mean update),
        and this user's log-likelihood term under the input (mu, R).
    """
    cov_obs = H_yt @ R @ H_yt_trans
    cov_obs_det = tf.linalg.det(cov_obs)
    cov_obs_inv = tf.linalg.inv(cov_obs)
    residual = y_t - H_yt @ mu

    # for R estimation
    inner = cov_obs_inv - cov_obs_inv @ residual @ tf.transpose(residual) @ cov_obs_inv
    log_p_gradient_part = H_yt_trans @ inner @ H_yt

    # for mu estimation
    Hyt_trans_Ryt_inv = H_yt_trans @ cov_obs_inv
    Hyt_trans_Ryt_inv_Hyt_sum_part = Hyt_trans_Ryt_inv @ H_yt
    Hyt_trans_Ryt_inv_yt_sum_part = Hyt_trans_Ryt_inv @ y_t

    # for log likelihood calculation
    quadratic_form = tf.transpose(residual) @ cov_obs_inv @ residual
    log_p_hat_part = -0.5 * (tf.math.log(cov_obs_det) + quadratic_form + k_t * LOG_2PI)

    return log_p_gradient_part, Hyt_trans_Ryt_inv_Hyt_sum_part, Hyt_trans_Ryt_inv_yt_sum_part, log_p_hat_part

In [23]:
def mcmichael(mu, R, gamma=0.00001):
    """Run one iteration of McMichael's algorithm over all n users.

    Args:
        mu: current mean estimate.
        R: current covariance estimate.
        gamma: step size of the covariance update. The default is the value
            that was previously hard-coded in the body, so existing
            two-argument calls behave exactly as before.

    Returns:
        (mu_hat, R_hat, log_p_hat): the updated mean, the updated covariance
        R + gamma * R (-1/2 * G) R, where G is the sum of the first value
        returned by run_graph_mcmichael, and the summed log-likelihood terms
        for the input (mu, R).
    """
    Hyt_trans_Ryt_inv_Hyt_sum = 0
    Hyt_trans_Ryt_inv_yt_sum = 0
    log_p_gradient = 0
    log_p_hat = 0

    for t in tqdm(range(n)):
        k_t = t_k_dict[t]
        y_t = t_y_dict[t]
        H_yt = t_Hy_dict[t]
        H_yt_trans = t_Hy_trans_dict[t]

        log_p_gradient_part, Hyt_trans_Ryt_inv_Hyt_sum_part, Hyt_trans_Ryt_inv_yt_sum_part, log_p_hat_part = \
            run_graph_mcmichael(mu, R, y_t, H_yt, H_yt_trans, k_t)

        log_p_gradient += log_p_gradient_part
        Hyt_trans_Ryt_inv_Hyt_sum += Hyt_trans_Ryt_inv_Hyt_sum_part
        Hyt_trans_Ryt_inv_yt_sum += Hyt_trans_Ryt_inv_yt_sum_part
        log_p_hat += log_p_hat_part

    # covariance step scaled by gamma; the mean has a closed-form update
    R_hat = R + gamma*(R @ (-1/2*log_p_gradient) @ R)
    mu_hat = tf.matmul(tf.linalg.inv(Hyt_trans_Ryt_inv_Hyt_sum), Hyt_trans_Ryt_inv_yt_sum)
    return mu_hat, R_hat, log_p_hat

In [24]:
# stop once the per-user log-likelihood improves by less than delta
delta = 0.0005
# start from the initial mean estimate and the fourth initial covariance estimate
mu = mu_hat0
R = R_hat0_4
log_p = tf.constant(-np.inf, dtype=tf.float64)

for i in range(40):
    if not i % 5:
        print(f'iteration: {i}')

    mu_hat, R_hat, log_p_hat = mcmichael(mu, R)

    normalized_log_p_hat = log_p_hat/n
    normalized_log_p = log_p/n
    convergence_gap = normalized_log_p_hat - normalized_log_p
    convergence_criterion = convergence_gap < delta

    print('normalized log_p_hat:', normalized_log_p_hat.numpy())
    print('normalized log_p:', normalized_log_p.numpy())
    print('convergence gap:', convergence_gap.numpy())

    if convergence_criterion:
        break

    # use new estimates for next iteration
    mu, R, log_p = mu_hat, R_hat, log_p_hat

  0%|          | 0/137328 [00:00<?, ?it/s]

iteration: 0


100%|██████████| 137328/137328 [01:06<00:00, 2057.72it/s]
  0%|          | 167/137328 [00:00<01:22, 1662.14it/s]

normalized log_p_hat: [[-32.23267267]]
normalized log_p: -inf
convergence gap: [[inf]]


100%|██████████| 137328/137328 [01:04<00:00, 2144.22it/s]
  0%|          | 149/137328 [00:00<01:32, 1481.22it/s]

normalized log_p_hat: [[-32.00694536]]
normalized log_p: [[-32.23267267]]
convergence gap: [[0.2257273]]


100%|██████████| 137328/137328 [01:07<00:00, 2024.04it/s]
  0%|          | 196/137328 [00:00<01:10, 1948.64it/s]

normalized log_p_hat: [[-31.87427149]]
normalized log_p: [[-32.00694536]]
convergence gap: [[0.13267387]]


100%|██████████| 137328/137328 [01:03<00:00, 2160.43it/s]
  0%|          | 162/137328 [00:00<01:24, 1613.74it/s]

normalized log_p_hat: [[-31.78281261]]
normalized log_p: [[-31.87427149]]
convergence gap: [[0.09145888]]


100%|██████████| 137328/137328 [01:09<00:00, 1977.84it/s]
  0%|          | 219/137328 [00:00<01:02, 2182.92it/s]

normalized log_p_hat: [[-31.71493972]]
normalized log_p: [[-31.78281261]]
convergence gap: [[0.06787289]]
iteration: 5


100%|██████████| 137328/137328 [01:05<00:00, 2085.64it/s]
  0%|          | 190/137328 [00:00<01:12, 1897.60it/s]

normalized log_p_hat: [[-31.66230112]]
normalized log_p: [[-31.71493972]]
convergence gap: [[0.0526386]]


100%|██████████| 137328/137328 [01:02<00:00, 2183.43it/s]
  0%|          | 237/137328 [00:00<00:58, 2361.88it/s]

normalized log_p_hat: [[-31.62041707]]
normalized log_p: [[-31.66230112]]
convergence gap: [[0.04188405]]


100%|██████████| 137328/137328 [01:00<00:00, 2280.62it/s]
  0%|          | 235/137328 [00:00<00:58, 2342.16it/s]

normalized log_p_hat: [[-31.58655052]]
normalized log_p: [[-31.62041707]]
convergence gap: [[0.03386655]]


100%|██████████| 137328/137328 [01:01<00:00, 2242.20it/s]
  0%|          | 238/137328 [00:00<00:57, 2379.97it/s]

normalized log_p_hat: [[-31.55885959]]
normalized log_p: [[-31.58655052]]
convergence gap: [[0.02769093]]


100%|██████████| 137328/137328 [00:59<00:00, 2296.82it/s]
  0%|          | 234/137328 [00:00<00:58, 2329.56it/s]

normalized log_p_hat: [[-31.53602707]]
normalized log_p: [[-31.55885959]]
convergence gap: [[0.02283252]]
iteration: 10


100%|██████████| 137328/137328 [01:01<00:00, 2231.56it/s]
  0%|          | 204/137328 [00:00<01:07, 2037.11it/s]

normalized log_p_hat: [[-31.51707512]]
normalized log_p: [[-31.53602707]]
convergence gap: [[0.01895195]]


100%|██████████| 137328/137328 [01:10<00:00, 1937.95it/s]
  0%|          | 230/137328 [00:00<00:59, 2297.06it/s]

normalized log_p_hat: [[-31.50125945]]
normalized log_p: [[-31.51707512]]
convergence gap: [[0.01581567]]


100%|██████████| 137328/137328 [01:02<00:00, 2193.52it/s]
  0%|          | 218/137328 [00:00<01:02, 2177.65it/s]

normalized log_p_hat: [[-31.48800287]]
normalized log_p: [[-31.50125945]]
convergence gap: [[0.01325658]]


100%|██████████| 137328/137328 [01:02<00:00, 2209.73it/s]
  0%|          | 236/137328 [00:00<00:58, 2357.50it/s]

normalized log_p_hat: [[-31.47685087]]
normalized log_p: [[-31.48800287]]
convergence gap: [[0.01115201]]


100%|██████████| 137328/137328 [01:05<00:00, 2111.39it/s]
  0%|          | 195/137328 [00:00<01:10, 1941.92it/s]

normalized log_p_hat: [[-31.46744078]]
normalized log_p: [[-31.47685087]]
convergence gap: [[0.00941008]]
iteration: 15


100%|██████████| 137328/137328 [01:06<00:00, 2079.66it/s]
  0%|          | 215/137328 [00:00<01:03, 2147.62it/s]

normalized log_p_hat: [[-31.45948]]
normalized log_p: [[-31.46744078]]
convergence gap: [[0.00796078]]


100%|██████████| 137328/137328 [01:00<00:00, 2257.24it/s]
  0%|          | 236/137328 [00:00<00:58, 2353.61it/s]

normalized log_p_hat: [[-31.45273014]]
normalized log_p: [[-31.45948]]
convergence gap: [[0.00674986]]


100%|██████████| 137328/137328 [01:00<00:00, 2256.51it/s]
  0%|          | 232/137328 [00:00<00:59, 2318.03it/s]

normalized log_p_hat: [[-31.44699547]]
normalized log_p: [[-31.45273014]]
convergence gap: [[0.00573467]]


100%|██████████| 137328/137328 [01:05<00:00, 2103.95it/s]
  0%|          | 217/137328 [00:00<01:03, 2169.11it/s]

normalized log_p_hat: [[-31.44211422]]
normalized log_p: [[-31.44699547]]
convergence gap: [[0.00488125]]


100%|██████████| 137328/137328 [01:02<00:00, 2192.72it/s]
  0%|          | 231/137328 [00:00<00:59, 2305.46it/s]

normalized log_p_hat: [[-31.43795202]]
normalized log_p: [[-31.44211422]]
convergence gap: [[0.0041622]]
iteration: 20


100%|██████████| 137328/137328 [01:03<00:00, 2166.93it/s]
  0%|          | 236/137328 [00:00<00:58, 2358.68it/s]

normalized log_p_hat: [[-31.43439681]]
normalized log_p: [[-31.43795202]]
convergence gap: [[0.00355521]]


100%|██████████| 137328/137328 [01:03<00:00, 2153.81it/s]
  0%|          | 229/137328 [00:00<00:59, 2289.94it/s]

normalized log_p_hat: [[-31.43135485]]
normalized log_p: [[-31.43439681]]
convergence gap: [[0.00304196]]


100%|██████████| 137328/137328 [01:03<00:00, 2174.89it/s]
  0%|          | 232/137328 [00:00<00:59, 2310.55it/s]

normalized log_p_hat: [[-31.42874753]]
normalized log_p: [[-31.43135485]]
convergence gap: [[0.00260732]]


100%|██████████| 137328/137328 [01:01<00:00, 2244.48it/s]
  0%|          | 228/137328 [00:00<01:00, 2272.70it/s]

normalized log_p_hat: [[-31.42650882]]
normalized log_p: [[-31.42874753]]
convergence gap: [[0.0022387]]


100%|██████████| 137328/137328 [01:01<00:00, 2215.74it/s]
  0%|          | 204/137328 [00:00<01:07, 2032.25it/s]

normalized log_p_hat: [[-31.42458318]]
normalized log_p: [[-31.42650882]]
convergence gap: [[0.00192564]]
iteration: 25


100%|██████████| 137328/137328 [01:06<00:00, 2069.16it/s]
  0%|          | 235/137328 [00:00<00:58, 2347.14it/s]

normalized log_p_hat: [[-31.42292381]]
normalized log_p: [[-31.42458318]]
convergence gap: [[0.00165937]]


100%|██████████| 137328/137328 [01:02<00:00, 2199.75it/s]
  0%|          | 214/137328 [00:00<01:04, 2135.92it/s]

normalized log_p_hat: [[-31.42149125]]
normalized log_p: [[-31.42292381]]
convergence gap: [[0.00143256]]


100%|██████████| 137328/137328 [01:00<00:00, 2269.84it/s]
  0%|          | 232/137328 [00:00<00:59, 2312.09it/s]

normalized log_p_hat: [[-31.42025219]]
normalized log_p: [[-31.42149125]]
convergence gap: [[0.00123906]]


100%|██████████| 137328/137328 [01:03<00:00, 2177.77it/s]
  0%|          | 213/137328 [00:00<01:04, 2129.51it/s]

normalized log_p_hat: [[-31.41917846]]
normalized log_p: [[-31.42025219]]
convergence gap: [[0.00107372]]


100%|██████████| 137328/137328 [01:05<00:00, 2083.87it/s]
  0%|          | 209/137328 [00:00<01:05, 2079.42it/s]

normalized log_p_hat: [[-31.41824626]]
normalized log_p: [[-31.41917846]]
convergence gap: [[0.00093221]]
iteration: 30


100%|██████████| 137328/137328 [01:00<00:00, 2267.79it/s]
  0%|          | 231/137328 [00:00<00:59, 2306.15it/s]

normalized log_p_hat: [[-31.41743538]]
normalized log_p: [[-31.41824626]]
convergence gap: [[0.00081088]]


100%|██████████| 137328/137328 [01:03<00:00, 2163.14it/s]
  0%|          | 238/137328 [00:00<00:57, 2371.16it/s]

normalized log_p_hat: [[-31.41672871]]
normalized log_p: [[-31.41743538]]
convergence gap: [[0.00070667]]


100%|██████████| 137328/137328 [01:04<00:00, 2127.88it/s]
  0%|          | 223/137328 [00:00<01:01, 2226.26it/s]

normalized log_p_hat: [[-31.41611171]]
normalized log_p: [[-31.41672871]]
convergence gap: [[0.00061701]]


100%|██████████| 137328/137328 [01:03<00:00, 2170.82it/s]
  0%|          | 226/137328 [00:00<01:00, 2258.66it/s]

normalized log_p_hat: [[-31.41557199]]
normalized log_p: [[-31.41611171]]
convergence gap: [[0.00053972]]


100%|██████████| 137328/137328 [01:06<00:00, 2074.66it/s]

normalized log_p_hat: [[-31.415099]]
normalized log_p: [[-31.41557199]]
convergence gap: [[0.00047298]]





In [25]:
# 35 iterations, ~38 min
# write the outputs of the last McMichael iteration to disk
for out_path, value in (
    ('results/mcmichael_mu.npy', mu_hat),
    ('results/mcmichael_R.npy', R_hat),
    ('results/mcmichael_log_p.npy', log_p_hat),
):
    np.save(out_path, value)

## Evaluation

In [26]:
@tf.function(experimental_relax_shapes=True)
def run_graph_square_error(mu, R, movie_ids_t, labels_t, y_t, H_xt, H_xt_trans, H_yt, H_yt_trans):
    """Return one user's summed squared prediction error on the given labels.

    Args:
        mu: mean estimate.
        R: covariance estimate.
        movie_ids_t: movie ids to predict; each id is shifted by -1 to index the prediction vector.
        labels_t: true ratings for those movies, as a column.
        y_t: the user's observed ratings.
        H_xt, H_xt_trans: matrix applied to mu/R for the unobserved part, and its transpose.
        H_yt, H_yt_trans: matrix applied to mu/R for the observed part, and its transpose.

    Returns:
        (labels_t - predictions)^T (labels_t - predictions), with predictions
        clipped to the range [1, 5].
    """
    # calculate X_t_hat (the covariance of the unobserved part alone is not needed for the prediction)
    R_yt = H_yt @ R @ H_yt_trans
    R_yt_inv = tf.linalg.inv(R_yt)
    R_xtyt = H_xt @ R @ H_yt_trans

    mu_yt = tf.matmul(H_yt, mu)
    mu_xt = tf.matmul(H_xt, mu)

    X_t_hat = R_xtyt @ R_yt_inv @ (y_t - mu_yt) + mu_xt

    # expand X_t_hat to full length with H_xt_trans, pick the requested movies, then clip ratings
    predictions_t = tf.gather(tf.matmul(H_xt_trans, X_t_hat), indices=movie_ids_t-1)
    predictions_t = tf.clip_by_value(predictions_t, 1, 5)

    errors_t = labels_t - predictions_t
    return tf.matmul(tf.transpose(errors_t), errors_t)

In [27]:
def evaluate(mu, R):
    """Compute the RMSE of the clipped rating predictions over all probe ratings.

    Args:
        mu: mean estimate.
        R: covariance estimate.

    Returns:
        A (1, 1) numpy array holding sqrt(total squared error / number of probe
        ratings). If there are no probe ratings at all, the array holds NaN.
    """
    square_error = tf.zeros((1, 1), dtype=tf.float64)
    n_labels = 0
    for t in tqdm(range(n)):
        movie_ids_t = t_movie_ids_dict[t]
        # a user without probe ratings adds nothing to either sum, so skip the graph call
        if len(movie_ids_t) == 0:
            continue
        labels_t = tf.cast(t_labels_dict[t], dtype=tf.float64)

        y_t = t_y_dict[t]
        H_xt = t_Hx_dict[t]
        H_xt_trans = t_Hx_trans_dict[t]
        H_yt = t_Hy_dict[t]
        H_yt_trans = t_Hy_trans_dict[t]

        # accumulate the squared error and the number of ratings it covers
        square_error += run_graph_square_error(mu, R, movie_ids_t, labels_t, y_t, H_xt, H_xt_trans, H_yt, H_yt_trans)
        n_labels += len(labels_t)

    if n_labels == 0:
        # nothing to score: make the undefined RMSE explicit
        return np.full((1, 1), np.nan)
    return np.sqrt(square_error/n_labels)

In [28]:
# reload the saved EM estimates and score them on the probe set
em_mu, em_R = (np.load(npy_path) for npy_path in ('results/em_mu.npy', 'results/em_R.npy'))
rmse = evaluate(em_mu, em_R)
rmse

100%|██████████| 137328/137328 [01:23<00:00, 1654.26it/s]


array([[0.91700701]])

In [29]:
# reload the saved McMichael estimates and score them on the probe set
mcmichael_mu, mcmichael_R = (
    np.load(npy_path) for npy_path in ('results/mcmichael_mu.npy', 'results/mcmichael_R.npy')
)
rmse = evaluate(mcmichael_mu, mcmichael_R)
rmse

100%|██████████| 137328/137328 [01:12<00:00, 1897.28it/s]


array([[0.91701472]])