# User Item Biases With Regularization
* Prediction for user $i$ and item $j$ is $\tilde r_{ij} = u_i + a_j$
* Loss function is $L = \sum_{(i, j) \in \Omega} w_{ij}\text{loss}(r_{ij}, \tilde r_{ij}) + \lambda_u \sum_i (u_i - \bar u)^2 + \lambda_a \sum_j (a_j - \bar a)^2$
* $\bar u$ is the mean of $u_i$ and $\bar a$ is the mean of $a_j$ 
* $\Omega$ is the set of observed pairs $(i, j)$
* $r_{ij}$ is the rating for user $i$ and item $j$
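* $w_{ij}$ is the weight for the rating $r_{ij}$ and is modeled as a power-law in the number of items seen by $i$ and users that have seen $j$: $w_{ij} = |j' : (i, j') \in \Omega| ^ {\lambda_{wu}} |i' : (i', j) \in \Omega| ^ {\lambda_{wa}}$. The implementation in `train_model` additionally multiplies $w_{ij}$ by a factor that depends on the rating's timestamp, controlled by $\lambda_{wt}$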
* $\text{loss}$ is the squared error $(r_{ij} - \tilde r_{ij})^2$

In [None]:
# Task identifier used to build `name` and to select the validation split.
# NOTE(review): empty here; presumably overridden when the notebook is run with parameters — confirm.
task = ""

In [None]:
# Identifier passed to `write_alpha` and `write_params` when saving this model.
const name = string(task, "/ExplicitUserItemBiases")
# Content type forwarded to the data-loading, counting and loss helpers.
const content = "explicit"
# Alphas forwarded to `residualized_loss` and `write_alpha`; none for this model.
const residual_alphas = Vector{String}()
# Flag forwarded to `residualized_loss`.
const implicit = false;

In [None]:
import NBInclude: @nbinclude
@nbinclude("../Alpha.ipynb");
@nbinclude("ExplicitUserItemBiasesBase.ipynb");

In [None]:
# The training split is requested with the literal "all", whereas the validation
# split is requested for the current `task`.
const training = get_split("training", "all", content)
const validation = get_split("validation", task, content);

## Alternating Least Squares
* Given some hyperparameters $\lambda$, we can solve for $u$ and $a$ via Alternating Least Squares
* This is an iterative algorithm where we fix $a$, then solve for the $u$ that minimizes the loss function
* Then we fix $u$ and solve for the best $a$
* These two steps are repeated until the vectors $u$ and $a$ converge
### More details
* If we fix $a$, then for each user $i$, $u_i$ is optimized when
* $u_i = \dfrac{\sum_{j \in \Omega_i}(r_{ij} - a_j) w_{ij} + \bar u \lambda_u}{ \sum_{j \in \Omega_i} w_{ij} + \lambda_u}$
* $\Omega$ is the set of (user, item) pairs that we have ratings for
* $\Omega_i$ is the subset of $\Omega$ for which the user is the $i$-th user
* The update for $a_j$ with $u$ fixed is analogous, with the roles of users and items swapped

In [None]:
"""
    train_model(training, stop_criteria, λ)

Fit the user biases `u` and item biases `a` by alternating updates and return `(u, a)`.

# Arguments
- `training`: split exposing `user`, `item`, `rating` and `timestamp` fields.
- `stop_criteria`: object handed to `stop!` before every sweep; training ends as soon
  as `stop!` returns `true`.
- `λ`: five hyperparameters, unpacked in order as `λ_u, λ_a, λ_wu, λ_wa, λ_wt`.

The per-rating weights are the elementwise product of a user-count term, an item-count
term and a timestamp term. Both bias vectors start at zero.
"""
function train_model(training, stop_criteria, λ)
    @info "training model with parameters $λ"
    λ_u, λ_a, λ_wu, λ_wa, λ_wt = λ
    user_ids, item_ids, observed = training.user, training.item, training.rating

    # The count terms receive log(λ); callers in this notebook pass exponentiated values.
    user_count_term = powerdecay(get_counts("training", "all", content), log(λ_wu))
    item_count_term =
        powerdecay(get_counts("training", "all", content; by_item = true), log(λ_wa))
    time_term = powerlawdecay(1 .- max.(training.timestamp, 0), λ_wt)
    weights = user_count_term .* item_count_term .* time_term

    u = zeros(eltype(λ_u), num_users())
    a = zeros(eltype(λ_a), num_items())

    # Work buffers handed to `update_users!`, each with `Threads.nthreads()` columns.
    scratch(bias) = zeros(eltype(bias), length(bias), Threads.nthreads())
    ρ_u, Ω_u = scratch(u), scratch(u)
    ρ_a, Ω_a = scratch(a), scratch(a)

    while true
        stop!(stop_criteria, [u, a]) && break
        # User sweep, then the same routine with user/item roles swapped for the items.
        update_users!(user_ids, item_ids, observed, weights, u, a, λ_u, ρ_u, Ω_u)
        update_users!(item_ids, user_ids, observed, weights, a, u, λ_a, ρ_a, Ω_a)
    end
    return u, a
end;

In [None]:
"""
    validation_mse(λ)

Train a model with hyperparameters `exp.(λ)` and return the value of
`residualized_loss` for its predictions on the validation split.
"""
function validation_mse(λ)
    # Parameters are searched in log-space; exponentiating makes each one strictly positive.
    positive_λ = exp.(λ)
    stopper = convergence_stopper(1e-6; max_iters = 16)
    user_bias, item_bias = train_model(training, stopper, positive_λ)
    predicted = make_prediction(validation.user, validation.item, user_bias, item_bias)
    return residualized_loss(residual_alphas, task, content, implicit, predicted)
end;

In [None]:
# Search for the best hyperparameters in log-space with L-BFGS and forward-mode
# autodiff, starting from all zeros (every λ equal to 1 after exponentiation).
res = Optim.optimize(
    validation_mse,
    zeros(Float32, 5),
    Optim.LBFGS(),
    Optim.Options(;
        show_trace = true,
        extended_trace = true,
        iterations = 50,
        time_limit = 3 * 3600,
    );
    autodiff = :forward,
);
# Map the minimizer back from log-space to the hyperparameter values themselves.
λ = exp.(Optim.minimizer(res));

In [None]:
# Summarize the search: chosen hyperparameters and number of objective evaluations.
@info "The optimal λ is $λ, found in $(repr(Optim.f_calls(res))) function calls"

In [None]:
# Fit the model on the training split with the selected hyperparameters.
stop_criteria = convergence_stopper(1e-6; max_iters = 16)
u, a = train_model(training, stop_criteria, λ);

In [None]:
# Display the validation loss at the optimum. Note that `validation_mse` trains a
# fresh model internally rather than reusing the `u` and `a` fitted above.
validation_mse(Optim.minimizer(res))

## Inference

In [None]:
# Prediction function handed to `write_alpha`; it reads the fitted globals `u` and `a`.
function model(users, items)
    return make_prediction(users, items, u, a)
end
write_alpha(
    model, name;
    log = true, log_task = task, log_content = content, log_alphas = residual_alphas,
)

In [None]:
# Save the fitted biases and the selected hyperparameters under `name`.
write_params(Dict("u" => u, "a" => a, "λ" => λ), name);