In [1]:
import random
random.seed(42)
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from scipy.stats import invwishart
from numpy import sum, sqrt, outer, transpose
from numpy.random import multivariate_normal
from scipy.sparse import coo_matrix
from numpy.linalg import inv
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

In [2]:
# Read the ratings table and keep a fixed random subsample of 10,000 rows
# (random_state pins which rows are drawn), then show summary statistics.
rating = pd.read_csv("rating.csv").sample(n=10000, random_state=42)
rating.describe()

Unnamed: 0,userId,movieId,rating
count,10000.0,10000.0,10000.0
mean,69946.1978,9397.5087,3.5133
std,40270.913643,20266.782058,1.048705
min,14.0,1.0,0.5
25%,35046.75,904.0,3.0
50%,70725.5,2250.5,3.5
75%,105056.0,4878.0,4.0
max,138456.0,130512.0,5.0


In [3]:
# EDA (exploratory data analysis)
# NOTE(review): this cell holds only an Ellipsis placeholder; no analysis code is present here.
...

In [4]:
# Reshape the long (userId, movieId, rating) table into a users x movies grid.
# Pairs without a rating come out of the pivot as NaN and are replaced by 0.
rating_matrix = (
    rating
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

# Plain ndarray view of the same grid (rows: users, columns: movies).
R = rating_matrix.to_numpy()

In [5]:
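# Model outline (Bayesian matrix factorization fitted by Gibbs sampling)
#
# Fixed hyperparameters:        \Theta_0 = {\mu_0, T_0, \nu_0, S_0, \sigma}
# User-factor hyperparameters:  \Theta_U = {\mu_U, \Sigma_U}
# Movie-factor hyperparameters: \Theta_V = {\mu_V, \Sigma_V}
#
# Each Gibbs iteration:
#   1. Sample \Theta_U and \Theta_V given the current factors U and V.
#   2. Sample U and V given \Theta_U, \Theta_V and the observed ratings.
#   3. Compute the predicted rating matrix R* = U^T V.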

In [6]:
# Initialization
M = rating_matrix.shape[0] # number of customers/users
N = rating_matrix.shape[1] # number of movies
D = 10 # number of latent features
U = np.ones((D, M))
V = np.ones((D, N))

mu_0, T_0 = np.zeros((D, 1)), np.eye(D)
nu_0, S_0 = D, np.eye(D)
sigma = 1

In [7]:
# Train/test split over the observed ratings.
#
# Missing ratings were filled with 0 when R was built, so the nonzero entries
# of R are treated as the observed ones. (A genuine rating of exactly 0 would
# be indistinguishable from "missing" here.)
nonzero_indices = R.nonzero()
nonzero_values = R[nonzero_indices]

# Derive the split sizes from the data instead of hard-coding 10000 / 8000, so
# the cell keeps working if the sample size changes.
n_observed = len(nonzero_values)
n_train = round(0.8 * n_observed)  # 80% train / 20% test

all_indices = list(range(n_observed))
tr_group = random.sample(all_indices, n_train)
# Set lookup keeps the complement computation linear; a list membership test
# here would be quadratic in the number of ratings. Order of te_group is
# unchanged (ascending).
tr_lookup = set(tr_group)
te_group = [i for i in all_indices if i not in tr_lookup]
assert len(tr_group) + len(te_group) == n_observed

# (row, col) index pairs into R for each side of the split.
tr_indices = (nonzero_indices[0][tr_group], nonzero_indices[1][tr_group])
te_indices = (nonzero_indices[0][te_group], nonzero_indices[1][te_group])

tr_values, te_values = R[tr_indices], R[te_indices]

# Mask with 1 at the training entries only; held-out and missing entries stay 0.
R_flag = np.zeros_like(R, dtype=int)
R_flag[tr_indices] = 1

In [8]:
# Gibbs sampler: alternately resample the factor hyperparameters and the
# latent factors, and record train/test error of the current sample R* = U^T V
# after every sweep.
iters = 20

# `random.seed` only seeds Python's `random` module. Every draw in this cell
# comes from NumPy's global generator (scipy's `rvs()` uses it too when no
# random_state is given), so seed it here to make Restart & Run All reproducible.
np.random.seed(42)

Sigma_u, Sigma_v = np.eye(D), np.eye(D)


def _sample_hyperparams(X, Sigma):
    """Draw a new (mean, covariance) pair for the columns of one factor matrix.

    Parameters
    ----------
    X : ndarray, shape (D, K)
        Current latent factors, one column per user (or per movie).
    Sigma : ndarray, shape (D, D)
        Covariance from the previous sweep, used in the conditional of the mean.

    Returns
    -------
    mu : ndarray, shape (D,)
        Mean drawn from a Gaussian that combines (mu_0, T_0) with the columns of X.
    Sigma_new : ndarray, shape (D, D)
        Covariance drawn from an inverse-Wishart with nu_0 + K degrees of freedom
        and scale S_0 plus the scatter of X around the freshly drawn mean.

    Reads the module-level hyperparameters mu_0, T_0, nu_0 and S_0.
    """
    K = X.shape[1]
    T_0_inv = inv(T_0)
    Sigma_inv = inv(Sigma)

    T_star = inv(T_0_inv + K * Sigma_inv)
    mu_star = T_star @ (T_0_inv @ mu_0 + Sigma_inv @ X.sum(axis=1, keepdims=True))
    mu = multivariate_normal(mu_star.ravel(), T_star)

    centered = X - mu[:, None]
    Sigma_new = invwishart(nu_0 + K, S_0 + centered @ centered.T).rvs()
    return mu, Sigma_new


def _sample_factors(X, Y, mu, Sigma, ratings, observed):
    """Resample every column of X in place, holding Y fixed.

    Parameters
    ----------
    X : ndarray, shape (D, K)
        Factors being resampled (overwritten column by column).
    Y : ndarray, shape (D, L)
        Factors of the other side (movies when X holds users, and vice versa).
    mu : ndarray, shape (D,)
        Current mean for the columns of X.
    Sigma : ndarray, shape (D, D)
        Current covariance for the columns of X.
    ratings : ndarray, shape (K, L)
        Rating grid oriented so that row k belongs to column k of X.
    observed : ndarray, shape (K, L)
        Nonzero where the rating is part of the training set.

    Reads the module-level `sigma`, which divides the rating terms.
    """
    # Both are the same for every column, so compute them once per sweep.
    Sigma_inv = inv(Sigma)
    prior_term = Sigma_inv @ mu

    for k in range(X.shape[1]):
        # Only training ratings contribute; with none, the draw comes from (mu, Sigma).
        rated = np.flatnonzero(observed[k])
        Y_obs = Y[:, rated]

        Lambda_star = Sigma_inv + (Y_obs @ Y_obs.T) / sigma
        cov_star = inv(Lambda_star)
        mean_star = cov_star @ (prior_term + Y_obs @ ratings[k, rated] / sigma)
        X[:, k] = multivariate_normal(mean_star, cov_star)


# Collect one dict per sweep and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and row-wise growth is quadratic.
metric_columns = ['Iteration', 'Train_MAE', 'Test_MAE', 'Train_RMSE', 'Test_RMSE']
records = []

# NOTE: U and V are updated in place, so re-running this cell without
# re-running the initialization cell continues from the previous chain state.
for it in range(iters):
    # Sample \Theta_U and \Theta_V
    mu_u, Sigma_u = _sample_hyperparams(U, Sigma_u)
    mu_v, Sigma_v = _sample_hyperparams(V, Sigma_v)

    # Sample U (rows of R are users), then V (rows of R.T are movies)
    _sample_factors(U, V, mu_u, Sigma_u, R, R_flag)
    _sample_factors(V, U, mu_v, Sigma_v, R.T, R_flag.T)

    # Predictions from the current sample only (no averaging across sweeps).
    R_star = U.T @ V
    tr_pred_values = R_star[tr_indices]
    te_pred_values = R_star[te_indices]

    tr_mae = MAE(tr_values, tr_pred_values)
    te_mae = MAE(te_values, te_pred_values)
    tr_rmse = np.sqrt(MSE(tr_values, tr_pred_values))
    te_rmse = np.sqrt(MSE(te_values, te_pred_values))

    records.append({
        'Iteration': it,
        'Train_MAE': tr_mae,
        'Test_MAE': te_mae,
        'Train_RMSE': tr_rmse,
        'Test_RMSE': te_rmse
    })

    if it % 2 == 0:
        print(f"Iteration {it}: Train MAE={tr_mae:.4f}, Test MAE={te_mae:.4f}, Train RMSE={tr_rmse:.4f}, Test RMSE={te_rmse:.4f}")

results_df = pd.DataFrame(records, columns=metric_columns)

Iteration 0: Train MAE=6.0249, Test MAE=6.0885, Train RMSE=6.1297, Test RMSE=6.1918
Iteration 2: Train MAE=2.9495, Test MAE=3.5069, Train RMSE=3.3437, Test RMSE=4.0213
Iteration 4: Train MAE=0.7808, Test MAE=1.1546, Train RMSE=0.9951, Test RMSE=1.5512
Iteration 6: Train MAE=0.7212, Test MAE=0.9216, Train RMSE=0.9191, Test RMSE=1.1780
Iteration 8: Train MAE=0.7317, Test MAE=0.8915, Train RMSE=0.9292, Test RMSE=1.1350
Iteration 10: Train MAE=0.7392, Test MAE=0.8594, Train RMSE=0.9366, Test RMSE=1.0956
Iteration 12: Train MAE=0.7398, Test MAE=0.8899, Train RMSE=0.9388, Test RMSE=1.1167
Iteration 14: Train MAE=0.7446, Test MAE=0.8798, Train RMSE=0.9424, Test RMSE=1.1139
Iteration 16: Train MAE=0.7426, Test MAE=0.8683, Train RMSE=0.9456, Test RMSE=1.0967
Iteration 18: Train MAE=0.7409, Test MAE=0.8602, Train RMSE=0.9449, Test RMSE=1.0924


In [9]:
# Persist the per-iteration metrics; the file name records the latent dimension D.
results_path = f'prototype_results_D={D}.csv'
results_df.to_csv(results_path, index=False)