# Neighborhood Collaborative Filtering
* This notebook implements both item-based and user-based collaborative filtering
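* Prediction is $\tilde r_{ij} = \frac{\sum_{k \in N(j)} r_{ik} w_{kj}}{\sum_{k \in N(j),\, r_{ik} \neq 0} |w_{kj}|}$ for item-based collaborative filtering (unrated entries are stored as $0$, so they drop out of the numerator; the prediction is $0$ when no neighbor is rated)
* Prediction is $\tilde r_{ij} = \frac{\sum_{k \in N(i)} w_{ik} r_{kj}}{\sum_{k \in N(i),\, r_{kj} \neq 0} |w_{ik}|}$ for user-based collaborative filtering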
* $r_{ij}$ is the rating for user $i$ and item $j$
* $w_{kj}$ is the cosine similarity between items $j$ and $k$
* $N(j)$ is the set of the $K$ items $k$ with the largest $|w_{kj}|$

In [1]:
# Identifier for this notebook's model.
# NOTE(review): `name` is not read anywhere in this notebook; presumably it is consumed
# by the helpers included from Alpha.ipynb — confirm.
name = "NeighborhoodCollaborativeFiltering";
# Names passed to `get_residuals` when the test split is loaded further down.
residual_alphas = ["UserItemBiases"];

In [2]:
using LinearAlgebra
using Memoize
using SparseArrays
# TODO upstream imports

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

In [4]:
#TODO documentation

## Determine the neighborhoods for each user and item

In [5]:
# Return the Euclidean norm of every column of `R` (one entry per slice along
# dimension 2). Columns whose norm is zero are reported as 1 so that callers can divide
# by the result without producing Inf/NaN. The result is cached by `@memoize`.
@memoize function get_norms(R)
    column_norms = [norm(column) for column in eachslice(R, dims = 2)]
    for idx in eachindex(column_norms)
        if column_norms[idx] == 0
            column_norms[idx] = 1 # prevent division by 0
        end
    end
    return column_norms
end;

In [6]:
"""
    get_correlation_neighborhood(item, R, K)

Return `(neighbors, weights)` for column `item` of the ratings matrix `R`.

`weights` are the dot products between column `item` and every column of `R`, divided by
both column norms (zero norms are treated as 1 by `get_norms`). `neighbors` holds the
column indices of the `K` largest weights by absolute value, in descending order, and
the returned weights are aligned with it. The column `item` itself is not excluded.

If `K` exceeds the number of columns, every column is returned.
"""
function get_correlation_neighborhood(item, R, K)
    norms = get_norms(R)
    weights = vec(R[:, item]' * R) ./ norms ./ norms[item]
    # Clamp so that asking for more neighbors than there are columns returns all of
    # them instead of throwing a BoundsError.
    k = min(K, length(weights))
    order = sortperm(abs.(weights), rev = true)[1:k]
    return order, weights[order]
end;

In [7]:
"""
    get_embedding_neighborhood(item, kernel, K, λ)

Return `(neighbors, weights)` for column `item` of the embedding matrix `kernel`.

Each column of `kernel` is one embedding. `neighbors` holds the indices of the `K`
columns closest to column `item` in Euclidean distance, nearest first (the column
itself is included, at distance 0). The matching weights are `exp(-λ * distance)`.

If `K` exceeds the number of columns, every column is returned.
"""
function get_embedding_neighborhood(item, kernel, K, λ)
    dists = map(norm, eachslice(kernel .- kernel[:, item], dims = 2))
    weights = exp.(-dists .* λ)
    # Clamp so that asking for more neighbors than there are columns returns all of
    # them instead of throwing a BoundsError.
    k = min(K, length(dists))
    order = sortperm(dists)[1:k]
    return order, weights[order]
end;

In [8]:
A = read_params("MatrixFactorization")["A"]
U = read_params("MatrixFactorization")["U"]

"""
    impute_item(user, item, R)

Return the dot product of row `user` of `U` and row `item` of `A`.

`R` is accepted for interface compatibility and is not used.

NOTE(review): this assumes rows of `U` are user embeddings and rows of `A` are item
embeddings, which matches how `A'` and `U'` are used as column-per-entity kernels
elsewhere in this notebook — confirm against the MatrixFactorization notebook.
"""
function impute_item(user, item, R)
    return dot(view(U, user, :), view(A, item, :))
end

"""
    make_prediction(item, users, R, get_neighborhood)

Predict the rating of `item` for every entry of `users`.

`get_neighborhood(item)` must return `(neighbors, weights)`. For each user the
prediction is the weighted sum of that user's nonzero ratings in `R` over the
neighbors, divided by the sum of the absolute weights of the neighbors that were
actually rated. A user with no rated neighbor gets a prediction of 0.

User/neighbor pairs that fall outside `R` are skipped, so users or items absent from
`R` yield 0 instead of a `BoundsError`.

Returns a vector with one prediction per entry of `users`, with the element type of
the weights.
"""
function make_prediction(item, users, R, get_neighborhood)
    items, weights = get_neighborhood(item)
    predictions = zeros(eltype(weights), length(users))
    weight_sum = zeros(eltype(weights), length(users))
    for (u, user) in enumerate(users)
        for (i, weight) in zip(items, weights)
            # Pairs outside the ratings matrix are treated the same as unrated ones.
            checkbounds(Bool, R, user, i) || continue
            rating = R[user, i]
            if rating != 0
                predictions[u] += weight * rating
                weight_sum[u] += abs(weight)
            end
        end
        # Normalize only when at least one neighbor contributed.
        if weight_sum[u] != 0
            predictions[u] /= weight_sum[u]
        end
    end
    return predictions
end;

In [9]:
"""
    collaborative_filtering(training, inference, get_neighborhood)

Predict a rating for every row of `inference` using the ratings in `training`.

`training` and `inference` must expose `user`, `item` and `rating` collections.
`get_neighborhood(item)` must return `(neighbors, weights)`. The training ratings are
assembled into a sparse users-by-items matrix, and predictions for each distinct
inference item are computed with `make_prediction`, in parallel across items.

Returns a vector of predictions aligned with the rows of `inference`.
"""
function collaborative_filtering(training, inference, get_neighborhood)
    R = sparse(
        training.user,
        training.item,
        training.rating,
        maximum(training.user),
        maximum(training.item),
    )

    # Group the inference rows by item in one pass rather than rescanning the whole
    # item column with a boolean mask for every distinct item.
    rows_by_item = Dict{eltype(inference.item),Vector{Int}}()
    for (row, item) in enumerate(inference.item)
        push!(get!(() -> Int[], rows_by_item, item), row)
    end

    el_type = eltype(get_neighborhood(1)[2])
    # Every item owns a disjoint set of rows, so all tasks can write into one shared
    # vector; no per-thread buffer indexed by `Threads.threadid()` is needed.
    preds = zeros(el_type, length(inference.rating))
    @tprogress Threads.@threads for item in collect(keys(rows_by_item))
        rows = rows_by_item[item]
        preds[rows] = make_prediction(item, inference.user[rows], R, get_neighborhood)
    end

    return preds
end;

In [10]:
"""
    model(users, items, predictions)

Look up `predictions[users[i], items[i]]` for every pair and return the values as a
`Float64` vector.

Pairs that fall outside `predictions` yield 0. The last row and the last column of
`predictions` are valid lookups.
"""
function model(users, items, predictions)
    result = zeros(length(users))
    for (i, (user, item)) in enumerate(zip(users, items))
        # Inclusive bounds check: indices equal to the matrix size are valid.
        if checkbounds(Bool, predictions, user, item)
            result[i] = predictions[user, item]
        end
    end
    return result
end;

In [11]:
"""
    cf_params(name, neighborhoods, hyperparams)

Bundle describing one collaborative-filtering variant.

- `name`: label of the variant; used as the output directory when results are written.
- `neighborhoods`: callable passed to `collaborative_filtering` as its neighborhood
  function.
- `hyperparams`: settings saved alongside the predictions via `write_params`.

All fields are untyped.
"""
struct cf_params
    name
    neighborhoods
    hyperparams
end;

## User based CF

In [12]:
# Transposed copies of the training and validation sets for user-based CF: the item
# column is passed first and the user column second, so routines written for items
# operate over users instead.
# NOTE(review): assumes the positional field order of RatingsDataset is
# (user, item, rating), as the keyword construction later in this notebook suggests —
# confirm.
training_t = RatingsDataset(training.item, training.user, training.rating)
validation_t = RatingsDataset(validation.item, validation.user, validation.rating);

In [13]:
# function user_validation_mse(λ)
#     @debug "training model with parameters $λ"
#     user_embeddings = collect(read_params("MatrixFactorization")["U"]')
#     neighborhoods = user -> get_embedding_neighborhood(user, user_embeddings, 100, λ[1])
#     preds = collaborative_filtering(training_t, validation_t, neighborhoods)
#     loss = mse(validation_t.rating, preds)
#     @debug "loss: $loss"
#     loss
# end;

# # Find the best regularization hyperparameters
# res_user_embedding = optimize(
#     user_validation_mse,
#     [3.],  # initial guess
#     LBFGS(),
#     autodiff = :forward,
#     Optim.Options(show_trace = true, extended_trace = true),
# )

In [14]:
# user_ratings = sparse(
#     training.item,
#     training.user,
#     training.rating,
#     maximum(training.item),
#     maximum(training.user),
# )
# user_embeddings = collect(read_params("MatrixFactorization")["U"]')
# user_cf_params = [
#     cf_params(
#         "UserEmbedCF",
#         user -> get_embedding_neighborhood(user, user_embeddings, 200, 0.3),
#         Dict("K" => 200, "λ" => 0.3),
#     ),
#     cf_params(
#         "UserEmbedCFReg100",
#         user -> get_embedding_neighborhood(user, user_embeddings, 200, 0.3),
#         Dict("K" => 200, "λ" => 100),
#     ),
#     cf_params(
#         "UserCF",
#         user -> get_correlation_neighborhood(user, user_ratings, 200),
#         Dict("K" => 200),
#     ),
#     cf_params(
#         "UserCF1000",
#         user -> get_correlation_neighborhood(user, user_ratings, 1000),
#         Dict("K" => 1000),
#     ),
# ];

In [None]:
# test = get_residuals("test", residual_alphas)
# inference_t = RatingsDataset(
#     user = [validation.item; test.item],
#     item = [validation.user; test.user],
#     rating = [validation.rating; test.rating],
# )

# for param in user_cf_params
#     preds = collaborative_filtering(training_t, inference_t, param.neighborhoods)
#     sparse_preds = sparse(inference_t.item, inference_t.user, preds)
#     write_predictions(
#         (users, items) -> model(users, items, sparse_preds),
#         outdir = param.name,
#     )
#     write_params(param.hyperparams, outdir = param.name)
# end

Progress:   6%|█▋                         |  ETA: 9:50:21 (86.91 ms/it)

## Item based CF

### optimize hyperparams

In [None]:
"""
    item_validation_mse(λ)

Objective for tuning the embedding-neighborhood weight decay.

`λ` is a one-element collection; `λ[1]` is the rate handed to
`get_embedding_neighborhood`. Item embeddings are read from the "MatrixFactorization"
parameters, predictions are made for `validation` from `training` with 200-item
neighborhoods, and the mean squared error against `validation.rating` is returned.
"""
function item_validation_mse(λ)
    @debug "training model with parameters $λ"
    embeddings = collect(read_params("MatrixFactorization")["A"]')
    # Named closure over the embeddings and the candidate decay rate.
    neighbors_of(item) = get_embedding_neighborhood(item, embeddings, 200, λ[1])
    validation_preds = collaborative_filtering(training, validation, neighbors_of)
    loss = mse(validation.rating, validation_preds)
    @debug "loss: $loss"
    return loss
end;

In [None]:
# Find the best regularization hyperparameters
res_item_embedding = optimize(
    item_validation_mse,
    [0.3],  # intial guess
    LBFGS(),
    autodiff = :forward,
    Optim.Options(show_trace = true, extended_trace = true),
)

In [None]:
# todo store nhood size
item_ratings = sparse(
    training.user,
    training.item,
    training.rating,
    maximum(training.user),
    maximum(training.item),
)
item_embeddings = collect(read_params("MatrixFactorization")["A"]')
item_cf_params = [
    # cf_params(
    #     "ItemEmbedCF",
    #     item -> get_embedding_neighborhood(item, item_embeddings, 200, Optim.minimizer(res_item_embedding)[1]),
    #     Dict("K"=>200, "λ"=>Optim.minimizer(res_item_embedding)[1])        
    # ),
    cf_params(
        "ItemEmbedImputeCF",
        item -> get_embedding_neighborhood(item, item_embeddings, 200, Optim.minimizer(res_item_embedding)[1]),
        Dict("K"=>200, "λ"=>0.3)        
    ),    
    cf_params(
        "ItemImputeCF",
        item -> get_correlation_neighborhood(item, item_ratings, 200),
        Dict("K" => 200),
    ),  
];

In [None]:
# Load the test split and stack it under the validation split so that both are scored
# in a single pass.
test = get_residuals("test", residual_alphas)
inference = RatingsDataset(
    user = [validation.user; test.user],
    item = [validation.item; test.item],
    rating = [validation.rating; test.rating],
)

# For every configured variant: predict all inference rows, store the predictions in a
# sparse user-by-item lookup table, and write predictions and hyperparameters to a
# directory named after the variant.
for param in item_cf_params
    preds = collaborative_filtering(training, inference, param.neighborhoods)
    # NOTE(review): `sparse` sums values at duplicate (user, item) coordinates; assumes
    # no pair appears in both validation and test — confirm.
    sparse_preds = sparse(inference.user, inference.item, preds)
    write_predictions(
        (users, items) -> model(users, items, sparse_preds),
        outdir = param.name,
    )
    write_params(param.hyperparams, outdir = param.name)
end