# Matrix Factorization
* Prediction is $\tilde R = UA^T$ 
* Loss function is $L = \lVert (R - \tilde R)^\Omega \rVert _2^2 + \lambda_u \lVert U \rVert _2^2 + \lambda_a \lVert A \rVert _2^2$
* $\Omega$ is the set of observed pairs $(i, j)$
* $M^\Omega$ is the projection of $M$ onto $\Omega$ for any matrix $M$
* $U$ is an $m \times k$ matrix, $A$ is an $n \times k$ matrix and $R$ is the $m \times n$ ratings matrix

In [1]:
# NOTE(review): presumably `name` labels this model's outputs and `residual_alphas`
# lists the previously fitted models it builds on; both globals are likely read by
# helpers from Alpha.ipynb — confirm.
name = "MatrixFactorization";
residual_alphas = ["UserItemBiases"];

In [2]:
using LinearAlgebra
using Memoize
using Random
using SparseArrays

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

# Alternating Least Squares Algorithm
* $u_{ik} = \dfrac{\sum_{j \in \Omega_i}(r_{ij} - \tilde r_{ij} + u_{ik}a_{jk})a_{jk}}{\sum_{j \in \Omega_i} a_{jk}^2 + \lambda_u}$
* $\Omega$ is the set of (user, item) pairs that we have ratings for
* $\Omega_i$ is the subset of $\Omega$ for which the user is the $i$-th user

In [4]:
"""
    make_prediction(users, items, U, A)

Return the predicted rating `dot(U[users[i], :], A[items[i], :])` for every pair
`(users[i], items[i])`.

# Arguments
- `users`, `items`: vectors of row indices into `U` and `A`; `items[i]` is read for
  every index `i` of `users`.
- `U`, `A`: factor matrices with one row per user / item.

# Returns
A vector with one prediction per entry of `users`. Its element type is the promotion
of `eltype(U)` and `eltype(A)`, so factor matrices with different element types do
not raise an `InexactError` when the dot product is stored. Pairs whose user or item
index exceeds the number of rows of `U` or `A` keep a prediction of zero.
"""
function make_prediction(users, items, U, A)
    n_users, n_items = size(U, 1), size(A, 1)
    r = zeros(promote_type(eltype(U), eltype(A)), length(users))
    Threads.@threads for i in eachindex(r)
        u, a = users[i], items[i]
        if u <= n_users && a <= n_items
            # Row views avoid allocating two temporary vectors per prediction.
            r[i] = @views dot(U[u, :], A[a, :])
        end
    end
    return r
end;

In [5]:
"""
    calc_loss(df, U, A)

Score the factors `U` and `A` on `df`: predict a rating for every
`(df.user, df.item)` pair with `make_prediction`, compare the predictions with
`df.rating` using `mse`, log the value at debug level and return it.
"""
function calc_loss(df, U, A)
    truth = df.rating
    predictions = make_prediction(df.user, df.item, U, A)
    loss = mse(truth, predictions)
    @debug "loss: $loss"
    return loss
end;

In [6]:
"""
    ridge_regression(X, y, λ)

Solve the regularised normal equations `(X'X + λI) β = X'y` and return `β`.

The Gram matrix and the right-hand side are converted to a dense `Matrix` and
`Vector` before the solve.
"""
function ridge_regression(X, y, λ)
    n_features = size(X, 2)
    gram = Matrix(X' * X)
    rhs = Vector(X' * y)
    return (gram + λ * I(n_features)) \ rhs
end;

In [7]:
# Build the m-by-n sparse matrix holding v[k] at position (i[k], j[k]).
# The entries are stored as the n-by-m CSC matrix of the swapped coordinates and the
# result is its adjoint, so each row of the result is a stored column of the parent.
# Memoized, so repeated calls with the same arguments reuse the constructed matrix.
@memoize function sparse_csr(i, j, v, m, n)
    swapped = sparse(j, i, v, n, m)
    return swapped'
end;

In [8]:
"""
    sparse_subset(A, rows)

Return a sparse matrix `B` with `size(B) == size(A)` whose rows listed in `rows`
equal the corresponding rows of `A` and whose remaining rows are zero.

Every column of `B` stores exactly `length(rows)` entries, one per kept row, so the
CSC buffers are assembled directly instead of going through `sparse`.
"""
function sparse_subset(A, rows)
    n_cols = size(A, 2)
    n_kept = length(rows)
    # Column c (0-based) starts right after the c * n_kept entries of earlier columns.
    colptr = [1 + n_kept * c for c = 0:n_cols]
    # The same row indices repeat in every column.
    rowval = repeat(rows, n_cols)
    # Column-major flattening of the kept rows matches the CSC value order.
    nzval = vec(A[rows, :])
    return SparseMatrixCSC(size(A, 1), n_cols, colptr, rowval, nzval)
end;

In [9]:
"""
    update_users!(users, items, ratings, U, A, λ_u)

Overwrite every row of `U` with the ridge-regression fit of that row's ratings
against the rows of `A`, holding `A` fixed.

# Arguments
- `users`, `items`, `ratings`: coordinate lists of the observed ratings; they are
  assembled into a `size(U, 1)`-by-`size(A, 1)` sparse matrix by `sparse_csr`.
- `U`: factor matrix updated in place, one row per entry of the first coordinate.
- `A`: factor matrix of the other coordinate; only read.
- `λ_u`: ridge penalty passed to `ridge_regression`.

Rows are processed in parallel with `Threads.@threads`; each iteration writes only
its own row of `U`.
"""
function update_users!(users, items, ratings, U, A, λ_u)
    R = sparse_csr(users, items, ratings, size(U, 1), size(A, 1))
    Threads.@threads for i in axes(U, 1)
        # Extract the row once: it supplies both the regression targets and, through
        # its stored indices, the rows of `A` that take part in the fit.
        y = R[i, :]
        X = sparse_subset(A, rowvals(y))
        U[i, :] = ridge_regression(X, y, λ_u)
    end
    return nothing
end;

In [10]:
# Return a maximum(source)-by-K matrix of standard-normal draws added to a zero
# matrix of element type `el_type`.
# The RNG is reseeded from `hash(source)` and `K` before drawing, and the function is
# memoized, so callers that mutate the result should copy it first.
@memoize function gaussian_init(source, K, el_type)
    n_rows = maximum(source)
    Random.seed!(20211204 * hash(source) * K)
    noise = randn(n_rows, K)
    return zeros(el_type, n_rows, K) + noise
end;

In [11]:
"""
    train_model(training, validation, λ_u, λ_a, K, stop_criteria)

Fit the factor matrices by alternating `update_users!` on `U` and on `A` until
`stop!(stop_criteria, loss)` returns `true`, where `loss` is `calc_loss` on
`validation`.

# Arguments
- `training`, `validation`: tables exposing `user`, `item` and `rating` columns.
- `λ_u`, `λ_a`: ridge penalties for `U` and `A`; their types also fix the element
  types of the initial factors.
- `K`: number of columns of `U` and `A`.
- `stop_criteria`: state object handed to `stop!` together with the current loss.

# Returns
The tuple `(U, A, loss)`, with `loss` the most recent validation loss.
"""
function train_model(training, validation, λ_u, λ_a, K, stop_criteria)
    @debug "training model with parameters [$λ_u, $λ_a]"
    users = training.user
    items = training.item
    ratings = training.rating
    # gaussian_init is memoized; copy so the in-place updates never touch its cache.
    U = copy(gaussian_init(users, K, eltype(λ_u)))
    A = copy(gaussian_init(items, K, eltype(λ_a)))

    loss = calc_loss(validation, U, A)
    while true
        stop!(stop_criteria, loss) && break
        update_users!(users, items, ratings, U, A, λ_u)
        # Same solver with the roles of the two factor matrices swapped.
        update_users!(items, users, ratings, A, U, λ_a)
        loss = calc_loss(validation, U, A)
    end
    return U, A, loss
end;

## Training

In [12]:
"""
    validation_mse(λ, K)

Objective for the hyperparameter search: train on the global `training` set with
penalties `exp(λ[1])` and `exp(λ[2])` and return the loss on the global
`validation` set.

`λ` is given in log space so that the penalties passed to `train_model` are always
positive.
"""
function validation_mse(λ, K)
    λ_u, λ_a = exp.(λ)
    # Stop really early so more computation can go into exploring the parameter space.
    stop_criteria = early_stopper(max_iters = 2, patience = 1, min_rel_improvement = 0.01)
    _, _, loss = train_model(training, validation, λ_u, λ_a, K, stop_criteria)
    return loss
end;

In [13]:
# Number of latent factors, i.e. the column count of the factor matrices U and A.
K = 10;

In [None]:
# Find the best regularization hyperparameters
res = optimize(
    λ -> validation_mse(λ, K),
    fill(0.0, 2),  # intial guess
    LBFGS(),
    autodiff = :forward,
    Optim.Options(show_trace = true, extended_trace = true),
)
λ = exp.(Optim.minimizer(res));

[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 03:11:44 training model with parameters [Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.0,1.0,0.0), Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.0,0.0,1.0)]
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 03:11:49 loss: Dual{ForwardDiff.Tag{var"#16#17", Float64}}(11.7421243760049,0.0,0.0)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 03:17:30 loss: Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.802953937602069,-0.009483831346817843,-0.00251577296788487)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 03:22:16 loss: Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.6632633092287705,-0.013173546425693511,-0.0028235639282069725)


Iter     Function value   Gradient norm 
     0     1.663263e+00     1.317355e-02
 * Current step size: 1.0
 * time: 0.050124168395996094
 * g(x): [-0.013173546425693511, -0.0028235639282069725]
 * x: [0.0, 0.0]


[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 03:22:20 training model with parameters [Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.0132606998745746,1.0132606998745746,0.0), Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.002827553939302,0.0,1.002827553939302)]
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 03:22:23 loss: Dual{ForwardDiff.Tag{var"#16#17", Float64}}(11.7421243760049,0.0,0.0)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 03:27:09 loss: Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.8028219452763985,-0.009476088796569533,-0.0025188194814296148)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 03:31:56 loss: Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.6630818013479762,-0.013172491207473947,-0.002823663425716759)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 03:31:56 training model with parameters [Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.0680854344355888,1.0680854344355888,0.0), Dual

     1     1.573310e+00     9.741568e-04
 * Current step size: 627.1652242021512
 * time: 7484.228739976883
 * g(x): [0.00023826559109797947, 0.0009741567787455844]
 * x: [8.261990197607519, 1.7708411040830327]


[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 05:27:00 training model with parameters [Dual{ForwardDiff.Tag{var"#16#17", Float64}}(3452.0695089667806,3452.0695089667806,0.0), Dual{ForwardDiff.Tag{var"#16#17", Float64}}(3.337761410448804,0.0,3.337761410448804)]
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 05:27:04 loss: Dual{ForwardDiff.Tag{var"#16#17", Float64}}(11.7421243760049,0.0,0.0)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 05:31:49 loss: Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.6900277915251942,-0.00043677885962418723,-4.055524238620178e-5)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 05:36:37 loss: Dual{ForwardDiff.Tag{var"#16#17", Float64}}(1.5753297445079928,-0.003649477147177814,-0.004783028556370388)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20211206 05:36:37 training model with parameters [Dual{ForwardDiff.Tag{var"#16#17", Float64}}(3804.698628803021,3804.698628803021,0.0), Dual{

In [None]:
# Display the result object returned by `optimize`.
res

In [None]:
# Log the tuned penalties and how many objective evaluations the search used.
@info "The optimal [λ_u, λ_a] is $λ, found in " *
      repr(Optim.f_calls(res)) *
      " function calls"

In [None]:
# Final fit with the tuned penalties: a far larger iteration budget and a tighter
# improvement threshold than the quick runs used during the hyperparameter search.
stop_criteria = early_stopper(max_iters = 100, patience = 2, min_rel_improvement = 0.0001)
# `λ...` splats the two tuned penalties into the λ_u and λ_a arguments.
U, A, loss = train_model(training, validation, λ..., K, stop_criteria);

## Inference

In [None]:
# Predict ratings for (users, items) pairs using the trained global factors U and A.
function model(users, items)
    return make_prediction(users, items, U, A)
end;

In [None]:
# Hand the prediction function to `write_predictions`.
# NOTE(review): `write_predictions` is not defined in this notebook (presumably it
# comes from Alpha.ipynb) — confirm which pairs it scores and where it writes.
write_predictions(model);

In [None]:
# Pass the trained factors and the tuned penalties to `write_params`.
# NOTE(review): `write_params` is not defined in this notebook (presumably it comes
# from Alpha.ipynb) — confirm the storage format and location.
write_params(Dict("U" => U, "A" => A, "λ" => λ));