# User Collaborative Filtering
* Computes User Collaborative Filtering alphas for the recommendee
* See `../TrainingAlphas/UserCF.ipynb` for algorithm details

In [1]:
# Name of this signal.
# NOTE(review): `source` is not read by any cell visible in this notebook; presumably it is
# consumed by the helpers loaded from Alpha.ipynb — confirm.
source = "UserCF";

In [2]:
using DataStructures
using LSHFunctions
using Random

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

## Determine the neighborhoods for each user and item

In [4]:
# LSH configuration: every column gets `n_hashes` integer keys, each assembled from
# `bits_per_hash` consecutive SimHash bits (see `hashfn`).
const n_hashes = 64
const bits_per_hash = 12

# Seed the global RNG before the hash function is constructed.
# NOTE(review): presumably SimHash draws its random projections from the global RNG, so the
# fixed seed makes the buckets reproducible across runs — confirm against LSHFunctions.
Random.seed!(20220103)
# One SimHash that emits all n_hashes * bits_per_hash bits in a single call.
simhashfn = SimHash(n_hashes * bits_per_hash);
# Interpret a sequence of bits as an integer, least-significant bit first:
# x[1] contributes 1, x[2] contributes 2, x[3] contributes 4, and so on.
# An empty sequence encodes 0 (summing an empty generator would throw instead).
function bitvec_to_int(x)
    total = 0
    for (position, bit) in enumerate(x)
        total += 2^(position - 1) * bit
    end
    total
end

# Hash a vector into a list of integer keys.
# The SimHash bit string of `x` is cut into consecutive chunks of `bits_per_hash` bits and
# every chunk is converted to an integer with `bitvec_to_int`; trailing bits that do not
# fill a whole chunk are ignored.
function hashfn(x)
    bits = simhashfn(x)
    width = bits_per_hash
    map(1:length(bits)÷width) do chunk
        first_bit = (chunk - 1) * width + 1
        bitvec_to_int(bits[first_bit:first_bit+width-1])
    end
end;

In [5]:
# Build the LSH index over the columns of R.
# Returns a vector of `n_hashes` dictionaries; the i-th one maps the i-th hash key of a
# column (see `hashfn`) to the Int32 indices of all columns of R sharing that key.
# Wrapped in `@memoize`, so presumably the index is built once per R — confirm against the
# macro's definition.
#
# NOTE(review): the first pass indexes its scratch tables by `Threads.threadid()`. That is
# only race-free if each loop iteration stays on one thread and no two concurrently running
# tasks share a thread id; newer Julia schedulers do not guarantee this. Left unchanged
# because the loop is wrapped by `@tprogress`, whose accepted loop forms are not visible here.
@memoize function get_hash_buckets(R)
    @debug "generating hash buckets"
    # pass 1: each thread records column indices in its own private set of tables
    t_hash_buckets =
        [[DefaultDict(() -> Int32[]) for i = 1:n_hashes] for t = 1:Threads.nthreads()]
    @tprogress Threads.@threads for j = 1:size(R)[2]
        hashes = hashfn(R[:, j])
        for i = 1:n_hashes
            push!(t_hash_buckets[Threads.threadid()][i][hashes[i]], j)
        end
    end

    # pass 2: merge the per-thread tables; iteration i only touches hash_buckets[i],
    # so the iterations never write to the same dictionary
    hash_buckets = [DefaultDict(() -> Int32[]) for i = 1:n_hashes]
    @tprogress Threads.@threads for i = 1:n_hashes
        for t = 1:Threads.nthreads()
            for k in keys(t_hash_buckets[t][i])
                # append! instead of push!(..., xs...): buckets can be arbitrarily large
                # and should not be splatted into varargs
                append!(hash_buckets[i][k], t_hash_buckets[t][i][k])
            end
        end
    end
    hash_buckets
end

# Collect the candidate neighbors of column j of R: every column that shares at least one
# hash key with it. A column colliding under several hashes appears several times in the
# result, and j itself is included.
function get_hash_neighbors(R, j)
    buckets = get_hash_buckets(R)
    signature = hashfn(R[:, j])
    candidates = [buckets[table][key] for (table, key) in enumerate(signature)]
    vcat(candidates...)
end;

In [6]:
# Norm of every column of R, with zero norms replaced by 1 so that callers can divide by
# the result without producing NaN/Inf.
@memoize function get_norms(R)
    column_norms = [norm(column) for column in eachslice(R, dims = 2)]
    for idx in eachindex(column_norms)
        if column_norms[idx] == 0
            column_norms[idx] = 1 # prevent division by 0
        end
    end
    column_norms
end;

# Find the (up to) K columns of R most similar to column `item`, ranked by the absolute
# value of their cosine similarity.
# Only the LSH candidates from `get_hash_neighbors` are scored; every other column keeps a
# similarity of 0.
# Returns `(indices, weights)`: Int32 column indices and the matching signed Float32
# similarities, strongest first. Fewer than K entries are returned when R has fewer than
# K + 1 columns (previously this raised a BoundsError).
function get_abs_neighborhood(item, R, K)
    norms = get_norms(R)
    weights = zeros(size(R)[2])
    mask = get_hash_neighbors(R, item)
    weights[mask] = vec(R[:, item]' * R[:, mask])
    weights = weights ./ norms ./ norms[item]
    # force the column itself into rank 1 so that starting at rank 2 skips it
    weights[item] = Inf
    last_rank = min(K + 1, length(weights))
    order = partialsortperm(abs.(weights), 2:last_rank, rev = true)
    (convert.(Int32, order), convert.(Float32, weights[order]))
end;

In [7]:
# each prediction is just the weighted sum of all items in the neighborhood
# we apply regularization terms to decay the weights, ratings, and final prediction
#
# Arguments:
#   item             - column of R to predict for
#   users            - rows of R to predict for
#   R                - ratings matrix indexed as R[user, item]; 0 means "no rating"
#   get_neighborhood - function item -> (neighbor columns, neighbor weights)
#   λ                - λ[1]: exponent applied to the weights, λ[2]: exponent applied to the
#                      ratings, λ[3]: constant added to the normalizing weight sum
# Returns one prediction per entry of `users` (all zeros when `item` is not a column of R).
function make_prediction(item, users, R, get_neighborhood, λ)
    if item > size(R)[2]
        # the item was not in our training set; we have no information
        return zeros(length(users))
    end
    items, weights = get_neighborhood(item)
    # signed power; the zero branch is a typed zero so that the result type does not depend
    # on the value (an Int 0 would turn the broadcast below into an abstractly typed vector)
    decay(x, a) = x != 0 ? sign(x) * abs(x)^a : zero(x) * zero(a)
    weights = decay.(weights, λ[1])
    predictions = zeros(eltype(weights), length(users))
    weight_sum = zeros(eltype(weights), length(users))
    for u = 1:length(users)
        for (i, weight) in zip(items, weights)
            # only neighbors the user has actually rated contribute
            if R[users[u], i] != 0
                predictions[u] += weight * decay(R[users[u], i], λ[2])
                weight_sum[u] += abs(weight)
            end
        end
    end
    for u = 1:length(users)
        # leave the prediction at 0 rather than dividing by zero
        if weight_sum[u] + λ[3] != 0
            predictions[u] /= (weight_sum[u] + λ[3])
        end
    end
    predictions
end;

In [8]:
# Predict a rating for every (user, item) row of `inference`.
# Rows are grouped by item, and each item is handled by a single `make_prediction` call
# covering all of its users; items are processed in parallel.
# Returns a vector with one prediction per row of `inference`, with element type eltype(λ).
function collaborative_filtering(R, inference, get_neighborhood, λ)
    # group the row positions by item in one pass instead of scanning for every item
    rows_by_item = Dict{eltype(inference.item),Vector{Int}}()
    for (row, item) in enumerate(inference.item)
        push!(get!(() -> Int[], rows_by_item, item), row)
    end

    # the row sets of different items are disjoint, so every slot of `preds` is written by
    # exactly one iteration and no per-thread buffer (indexed by threadid) is needed
    preds = zeros(eltype(λ), length(inference.rating))
    @tprogress Threads.@threads for item in collect(keys(rows_by_item))
        rows = rows_by_item[item]
        preds[rows] =
            make_prediction(item, inference.user[rows], R, get_neighborhood, λ)
    end

    preds
end;

## User based CF

In [9]:
# Inspection cell: list the alphas configured under "CombineSignals" whose name contains
# "UserCF". `get_alphas` applies the same filter.
cf_alphas = [x for x in read_params("CombineSignals")["alphas"] if occursin("UserCF", x)]

1-element Vector{String}:
 "UserCF.256"

In [10]:
# alpha = cf_alphas[1]
# params = read_params(alpha)
# K = params["K"]
# λ = params["λ"]
# training = get_residuals("training", params["training_residuals"])'
# R = sparse(
#     training.user,
#     training.item,
#     training.rating,
#     num_items(),
#     maximum(training.item),
# )

In [11]:
# Names of the alphas listed under "CombineSignals" that contain "UserCF".
function get_alphas()
    configured = read_params("CombineSignals")["alphas"]
    [name for name in configured if occursin("UserCF", name)]
end;

In [12]:
# Compute the predictions of one UserCF alpha for the recommendee and write them out.
# `alpha` is the name passed to `read_params` to load the hyperparameters ("K", "λ",
# "training_residuals", "neighborhood_type"); it is also used as the output directory.
function compute_alpha(alpha)
    # read in parameters
    params = read_params(alpha)
    K = params["K"]
    λ = params["λ"]
    training = get_residuals("training", params["training_residuals"])'
    R = sparse(
        training.user,
        training.item,
        training.rating,
        num_items(),
        maximum(training.item),
    )

    # if the recommendee was also in the training set, then remove the stale list
    username_to_uid = DataFrame(CSV.File("../../data/processed_data/username_to_uid.csv"))
    # NOTE(review): presumably the file stores 0-based ids — confirm
    username_to_uid.uid .+= 1
    username_to_uid.username = lowercase.(username_to_uid.username)
    # look the recommendee up once instead of once per row of the table
    recommendee_username = lowercase(get_recommendee_username())
    username_uid = filter(x -> x.username == recommendee_username, username_to_uid).uid
    if length(username_uid) > 0
        @assert length(username_uid) == 1
        R[:, username_uid] .= 0
    end
    dropzeros!(R)

    # cache neighborhoods
    get_hash_buckets(R)

    # compute alpha
    neighborhood_types = Dict("abs" => get_abs_neighborhood)
    neighborhoods = i -> neighborhood_types[params["neighborhood_type"]](i, R, K)
    recommendee_uid = get_split("recommendee").user[1]
    # one inference row per item, all for the recommendee; the ratings are placeholders
    inference =
        RatingsDataset(
            fill(recommendee_uid, num_items()),
            1:num_items(),
            zeros(num_items()),
        )'
    predictions = collaborative_filtering(R, inference, neighborhoods, λ)
    model(items) = predictions[items]
    write_recommendee_alpha(model, outdir = alpha)
end;

In [13]:
# Compute and store every configured UserCF alpha.
foreach(compute_alpha, get_alphas());

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:27[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220115 01:40:00 generating hash buckets
[32mProgress: 100%|███████████████████████████| Time: 0:00:33 ( 1.20 ms/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:03 ( 0.91  s/it)[39m
