# Matrix Factorization
* Prediction is $\tilde R = UA^T$ 
* Loss function is $L = \lVert (R - \tilde R)^\Omega \rVert _2^2 + \lambda_u \lVert U \rVert _2^2 + \lambda_a \lVert A \rVert _2^2$
* $\Omega$ is the set of observed pairs $(i, j)$
* $M^\Omega$ is the projection of $M$ onto $\Omega$ for any matrix $M$, that is $M_{ij}^\Omega$ is defined to be $M_{ij}$ when $(i, j) \in \Omega$ and $0$ otherwise
* $U$ is an $m \times k$ matrix, $A$ is an $n \times k$ matrix and $R$ is the $m \times n$ ratings matrix

In [1]:
# Tag for this model; combined with K to form the output directory name.
name = "MatrixFactorizationGenre";

# Collapse integer-valued numbers to `Int`; anything else passes through unchanged.
downcast_to_int(x) = isinteger(x) ? Int(x) : x

# Names of the alphas handed to `get_residuals` when loading the data splits.
# Minimal alternative kept for reference: residual_alphas = ["UserItemBiases"];
residual_alphas = vcat(
    ["UserItemBiases"],
    map(K -> "ItemCF.$K", downcast_to_int.([2^4, 2^6, 2^8, 2^10])),
    map(K -> "ItemCFResid.$K", downcast_to_int.([2^4, 2^6, 2^8, 2^10])),
    map(K -> "MatrixFactorization.$K", downcast_to_int.([10, 20, 40])),
    map(tag -> "ItemCFRelated.$tag", ["all"]),
    # ["UserCF.1024"],
    ["ItemCFEmbed.1024"], # 0.12%
)

14-element Vector{String}:
 "UserItemBiases"
 "ItemCF.16"
 "ItemCF.64"
 "ItemCF.256"
 "ItemCF.1024"
 "ItemCFResid.16"
 "ItemCFResid.64"
 "ItemCFResid.256"
 "ItemCFResid.1024"
 "MatrixFactorization.10"
 "MatrixFactorization.20"
 "MatrixFactorization.40"
 "ItemCFRelated.all"
 "ItemCFEmbed.1024"

In [2]:
using Random

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");
@nbinclude("XGBoostFeatures.ipynb");

In [4]:
# Load both data splits through `get_residuals`, passing the alphas listed in
# `residual_alphas`; the training split is loaded first, then validation.
const training, validation =
    map(part -> get_residuals(part, residual_alphas), ("training", "validation"));

# Alternating Least Squares Algorithm
* $u_{ik} = \dfrac{\sum_{j \in \Omega_i}(r_{ij} - \tilde r_{ij} + u_{ik}a_{jk})\,a_{jk}}{\sum_{j \in \Omega_i} a_{jk}^2 + \lambda_u}$
* $\Omega$ is the set of (user, item) pairs that we have ratings for
* $\Omega_i$ is the subset of $\Omega$ for which the user is the $i$-th user
* Note that this equation is equivalent to solving $A^{\Omega_i} u_i = R^{\Omega_i}$ with $L_2$ regularization $\lambda_u$, where $\Omega_i = \{(i', j) \in \Omega | i' = i \}$

In [5]:
# Predict one value per (users[i], items[i]) pair as the dot product of row
# users[i] of U with row items[i] of A. A pair whose user index exceeds the
# number of rows of U, or whose item index exceeds the number of rows of A,
# keeps a prediction of zero. The result has element type eltype(U).
function make_prediction(users, items, U, A)
    n_users, n_items = size(U, 1), size(A, 1)
    preds = zeros(eltype(U), length(users))
    Threads.@threads for idx in eachindex(preds)
        if users[idx] <= n_users && items[idx] <= n_items
            # views avoid copying the two factor rows
            preds[idx] = dot(view(U, users[idx], :), view(A, items[idx], :))
        end
    end
    preds
end;

In [6]:
# Loss of the factorization (U, A) on the rows of df (uses df.user, df.item,
# df.rating). The predictions are first rescaled by the least-squares factor β
# that best maps them onto the ratings; `mse` is then evaluated on the rescaled
# predictions. The loss and β are also written to the debug log.
function calc_loss(df, U, A)
    predicted = make_prediction(df.user, df.item, U, A)
    actual = df.rating
    β = predicted \ actual
    loss = mse(actual, predicted .* β)
    @debug "loss: $loss β: $β"
    return loss
end;

In [7]:
# Solve the L2-regularized least-squares problem for design matrix X, targets y
# and penalty λ through the normal equations (X'X + λI) w = X'y.
function ridge_regression(X, y, λ)
    n_features = size(X, 2)
    # densify: the system has only n_features x n_features entries
    gram = Matrix(X' * X) + λ * I(n_features)
    rhs = Vector(X' * y)
    gram \ rhs
end;

In [8]:
# Julia sparse matrices are stored column-major, so the m x n matrix with
# entries (i, j, v) is assembled as its n x m transpose and returned as the
# adjoint of that, which gives a row-major layout.
@memoize function sparse_csr(i, j, v, m, n)
    transposed = sparse(j, i, v, n, m)
    adjoint(transposed)
end;

# Reproducible Gaussian starting point with maximum(source) rows and K columns.
# Entries are standard normal draws scaled by K^(-1/4), added onto zeros of
# el_type so the element type follows el_type.
@memoize function gaussian_init_csr(source, K, el_type)
    # the seed depends on both the ids in `source` and on K
    Random.seed!(20211204 * hash(source) * K)
    n_ids = maximum(source)
    base = zeros(el_type, K, n_ids)
    noise = randn(K, n_ids) * K^(-1 / 4)
    # built as K x n_ids and returned as its adjoint (n_ids x K)
    adjoint(base + noise)
end;

In [9]:
function sparse_subset(A, rows)
    # Build a sparse matrix B with size(B) == size(A) in which the listed rows
    # equal A[rows, :] and every other row is zero.
    # The CSC arrays are written directly: each column stores exactly one entry
    # per element of `rows`, in the order given.
    n_rows, n_cols = size(A)
    per_column = length(rows)
    # column c starts at 1 + (c - 1) * per_column; the final entry closes the last column
    colptr = cumsum(vcat(1, fill(per_column, n_cols)))
    rowval = repeat(rows, n_cols)
    nzval = vec(A[rows, :])
    SparseMatrixCSC(n_rows, n_cols, colptr, rowval, nzval)
end;

In [10]:
# Refit every row of U in place while A is held fixed.
# The (users, items, ratings) triplets are assembled into a sparse matrix with
# size(U, 1) rows and size(A, 1) columns. For each row index i, that row's
# stored ratings are regressed on the matching rows of A with ridge penalty
# λ_u, and the solution overwrites U[i, :].
function update_users!(users, items, ratings, U, A, λ_u)
    R = sparse_csr(users, items, ratings, size(U, 1), size(A, 1))
    @tprogress Threads.@threads for i = 1:size(U, 1)
        # slice the row once and reuse it as both the regression target and
        # the source of the rated-item indices
        y = R[i, :]
        X = sparse_subset(A, rowvals(y))
        U[i, :] = ridge_regression(X, y, λ_u)
    end
end;

In [11]:
# Item-side factor matrix produced by `genre_embedding`; it is indexed by item
# along its first dimension and is never updated during training.
const A = genre_embedding()
# Number of latent dimensions, i.e. the number of columns of A.
const K = size(A, 2)

# Fit the user factors U against the global item factors A with penalty λ_u.
# The user update is repeated until `stop!(stop_criteria, loss)` returns true,
# where `loss` is the latest validation loss (Inf before the first pass).
# Returns the tuple (U, A, loss).
function train_model(training, validation, λ_u, stop_criteria)
    @debug "training model with parameters [$λ_u]"
    users = training.user
    items = training.item
    ratings = training.rating
    # the initializer is memoized; presumably the copy keeps the cached value untouched
    U = copy(gaussian_init_csr(users, K, eltype(λ_u)))
    loss = Inf

    while true
        stop!(stop_criteria, loss) && break
        update_users!(users, items, ratings, U, A, λ_u)
        # result unused: this call only emits the training loss to the debug log
        calc_loss(training, U, A)
        loss = calc_loss(validation, U, A)
    end
    U, A, loss
end;

[32mProgress: 100%|███████████████████████████| Time: 0:00:02 ( 2.16 ms/it)[39m


## Training

In [12]:
# Objective for the hyperparameter search: the validation loss returned by a
# short `train_model` run, as a function of the log-penalties λ.
function validation_mse(λ)
    # exponentiate so the penalties passed to the model are never negative
    penalties = exp.(λ)
    # stop early so we can spend more computation exploring the parameter space
    stopper = early_stopper(max_iters = 1)
    # train_model returns (U, A, loss); only the loss is needed here
    last(train_model(training, validation, penalties..., stopper))
end;

In [21]:
# Search for the best regularization strength, refit with the full training
# budget, then write the predictions and parameters under "$name.$K".
function optimize_model()
    # Hyperparameter search in log-space with LBFGS and forward-mode autodiff
    trace_options = Optim.Options(show_trace = true, extended_trace = true)
    res = optimize(
        logλ -> validation_mse(logλ),
        [1.0],
        LBFGS(),
        trace_options;
        autodiff = :forward,
    )
    λ = exp.(Optim.minimizer(res))
    @info string(
        "The optimal [λ_u] is $λ, found in ",
        repr(Optim.f_calls(res)),
        " function calls",
    )

    # Final fit with the selected penalty and a patient stopping rule
    stopper = early_stopper(max_iters = 100, patience = 5, min_rel_improvement = 0.0001)
    U, A, _ = train_model(training, validation, λ..., stopper)

    # Persist predictions and parameters
    outdir = "$name.$K"
    model(users, items) = make_prediction(users, items, U, A)
    write_predictions(model, outdir = outdir, save_training = true)
    params = Dict(
        "U" => U,
        "A" => A,
        "λ" => λ,
        "K" => K,
        "residual_alphas" => residual_alphas,
    )
    write_params(params, outdir = outdir)
end;

In [None]:
# Run the hyperparameter search, train the final model, and write its predictions and parameters.
optimize_model()

[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220107 14:14:38 training model with parameters [Dual{ForwardDiff.Tag{var"#53#54", Float64}}(2.718281828459045,2.718281828459045)]
[32mProgress: 100%|███████████████████████████| Time: 0:01:11 ( 2.54 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220107 14:15:55 loss: Dual{ForwardDiff.Tag{var"#53#54", Float64}}(0.8786093359726957,0.020388291095571293) β: Dual{ForwardDiff.Tag{var"#53#54", Float64}}(1.3980502802576982,0.2103072726307749)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220107 14:15:56 loss: Dual{ForwardDiff.Tag{var"#53#54", Float64}}(1.1941826256190053,-9.35899760091501e-5) β: Dual{ForwardDiff.Tag{var"#53#54", Float64}}(0.053890205787598615,0.023710932612840033)


Iter     Function value   Gradient norm 
     0     1.194183e+00     9.358998e-05
 * Current step size: 1.0
 * time: 5.1975250244140625e-5
 * g(x): [-9.35899760091501e-5]
 * x: [1.0]


[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220107 14:15:56 training model with parameters [Dual{ForwardDiff.Tag{var"#53#54", Float64}}(2.7185362442953567,2.7185362442953567)]
[32mProgress: 100%|███████████████████████████| Time: 0:01:09 ( 2.46 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220107 14:17:10 loss: Dual{ForwardDiff.Tag{var"#53#54", Float64}}(0.8786112441406648,0.020388895743581974) β: Dual{ForwardDiff.Tag{var"#53#54", Float64}}(1.3980699633358216,0.2103163848375754)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220107 14:17:11 loss: Dual{ForwardDiff.Tag{var"#53#54", Float64}}(1.1941826168597511,-9.359361578860486e-5) β: Dual{ForwardDiff.Tag{var"#53#54", Float64}}(0.053892424956163025,0.02371227784926389)
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220107 14:17:11 training model with parameters [Dual{ForwardDiff.Tag{var"#53#54", Float64}}(2.719554145781755,2.719554145781755)]
[32mProgress: 100%|██████████████████