# User Collaborative Filtering
* See `ItemCollaborativeFilteringBase.ipynb` for algorithm details
* The weights here are the cosine correlation between the two users
* The algorithm is nearly identical to the algorithm in `ItemCF`, with the following exceptions
  * we transpose users and items
  * the similarity matrix is too big to store in memory, so we generate weights on the fly


In [1]:
name = "UserCF";

In [2]:
using DataStructures
using LSHFunctions
using Random

In [3]:
using NBInclude
@nbinclude("ItemCollaborativeFilteringBase.ipynb");

## Compute nearest neighbors using Locality Sensitive Hashing

In [4]:
# Use locality sensitive hashing to reduce the computational cost of finding nearest neighbors
# TODO automatically adjust scale based on the number of items
# TODO save hash function or random seed
# TODO maybe use LRU caches
const n_hashes = 64
const bits_per_hash = 12

Random.seed!(20220103)
simhashfn = SimHash(n_hashes * bits_per_hash);
function bitvec_to_int(x)
    sum(2^(j - 1) * x[j] for j = 1:length(x))
end

function hashfn(x)
    chunk_size = bits_per_hash
    hashbits = simhashfn(x)
    [
        bitvec_to_int(hashbits[(k-1)*chunk_size+1:k*chunk_size]) for
        k = 1:length(hashbits)÷chunk_size
    ]
end;

In [5]:
@memoize function get_hash_buckets(R)
    @debug "generating hash buckets"
    hash_buckets = [DefaultDict(() -> Int32[]) for i = 1:n_hashes]
    @showprogress for j = 1:size(R)[2]
        hashes = hashfn(R[:, j])
        for i = 1:n_hashes
            push!(hash_buckets[i][hashes[i]], j)
        end
    end
    hash_buckets
end

function average_hash_neighborhood_size(R)
    hash_neighborhood_sizes = zeros(size(R)[2])
    @showprogress for j = 1:size(R)[2]
        hash_neighborhood_sizes[j] = length(get_hash_neighbors(R, j))
    end
    mean(hash_neighborhood_sizes)
end

function get_hash_neighbors(R, j)
    hash_buckets = get_hash_buckets(R)
    hash = hashfn(R[:, j])
    vcat([hash_buckets[i][hash[i]] for i = 1:length(hash)]...)
end;

In [6]:
@memoize function get_norms(R)
    norms = map(norm, eachslice(R, dims = 2))
    norms[norms.==0] .= 1 # prevent division by 0
    norms
end;

@memoize function get_abs_neighborhood_cache(R, K)
    @debug "initializing neighborhood cache"
    # we need to preallocate the dict to avoid resizing
    # while accessing from multiple threads
    [(zeros(Int32, K), zeros(Float32, K)) for i = 1:maximum(size(R)[2])]
end

function get_abs_neighborhood(item, R, K)
    cache = get_abs_neighborhood_cache(R, K)
    val = cache[item]
    if val[1][1] == 0
        norms = get_norms(R)
        weights = zeros(size(R)[2])
        mask = get_hash_neighbors(R, item)
        weights[mask] = vec(R[:, item]' * R[:, mask])
        weights = weights ./ norms ./ norms[item]
        weights[item] = Inf
        order = partialsortperm(abs.(weights), 2:K+1, rev = true)
        val = (convert.(Int32, order), convert.(Float32, weights[order]))
        cache[item] = val
    end
    val
end;

In [7]:
function optimize_model(param; optimize = true)
    # unpack parameters
    training = get_training(param.training_residuals)'
    validation = get_validation(param.validation_residuals)'
    R = sparse(
        training.user,
        training.item,
        training.rating,
        maximum(training.user),
        maximum(training.item),
    )
    K = param.K
    neighborhood_types = Dict("abs" => get_abs_neighborhood)
    neighborhoods = i -> neighborhood_types[param.neighborhood_type](i, R, K)

    # preallocate caches
    get_hash_buckets(R)
    get_abs_neighborhood_cache(R, K)
    @debug "average hash neighborhood size $(average_hash_neighborhood_size(R))"

    # optimize hyperparameters
    function validation_mse(λ)
        pred = collaborative_filtering(training, validation, neighborhoods, λ)
        truth = validation.rating
        β = pred \ truth
        loss = mse(truth, pred .* β)
        @debug "loss: $loss β: $β: λ $λ"
        loss
    end
    if optimize
        res = optimize(
            validation_mse,
            param.λ,
            LBFGS(),
            autodiff = :forward,
            Optim.Options(show_trace = true, extended_trace = true),
        )
        param.λ = Optim.minimizer(res)
    end

    # save predictions
    training = get_training(param.training_residuals)'
    inference = get_inference()'
    preds = collaborative_filtering(training, inference, neighborhoods, param.λ)
    sparse_preds = sparse(inference.user, inference.item, preds)'
    function model(users, items, predictions)
        result = zeros(length(users))
        for i = 1:length(users)
            if users[i] <= size(predictions)[1] && items[i] <= size(predictions)[2]
                result[i] = predictions[users[i], items[i]]
            end
        end
        result
    end
    write_predictions(
        (users, items) -> model(users, items, sparse_preds),
        outdir = param.name,
        residual_alphas = param.validation_residuals,
        save_training = true,
    )
    write_params(to_dict(param), outdir = param.name)
end;

## Setup hyperparameters

In [8]:
downcast_to_int(x) = isinteger(x) ? Int(x) : x
user_cf_params = [[
        cf_params(
            name = "UserCF.$K",
            training_residuals = ["UserItemBiases"],
            validation_residuals = ["UserItemBiases"],
            neighborhood_type = "abs",
            S = "",
            K = K,
            λ = [1.0, 1.0, 0.0],
        ) for K in downcast_to_int.([2^8])
    ];
];

## Train models

In [9]:
for param in user_cf_params
    optimize_model(param, optimize=true)    
end

[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220106 05:09:42 generating hash buckets
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:07:51[39m:43[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220106 05:17:34 initializing neighborhood cache
[32mProgress:  20%|████████▏                                |  ETA: 0:06:16[39m[38;5;1m[1m┌ [22m[39m[38;5;1m[1mError: [22m[39m20220106 05:19:09 Exception while generating log record in module Main at In[7]:19
[38;5;1m[1m│ [22m[39m  exception =
[38;5;1m[1m│ [22m[39m   InterruptException:
[38;5;1m[1m│ [22m[39m   Stacktrace:
[38;5;1m[1m│ [22m[39m     [1] [0m[1mArray[22m
[38;5;1m[1m│ [22m[39m   [90m    @ [39m[90m./[39m[90;4mboot.jl:448[0m[90m [inlined][39m
[38;5;1m[1m│ [22m[39m     [2] [0m[1mBitArray[22m
[38;5;1m[1m│ [22m[39m   [90m    @ [39m[90m./[39m[90;4mbitarray.jl:37[0m[90m [inlined][39m
[38;5;1m[1m│ [22m[39m     [3] [0m[1mBitArray[

LoadError: MethodError: objects of type Bool are not callable