# Neighborhood Collaborative Filtering
* This notebook implements both item-based and user-based collaborative filtering
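* Prediction is $\tilde r_{ij} = \frac{\sum_{k \in N(j)} r_{ik}w_{kj}}{\sum_{k \in N(j)} |w_{kj}|}$ for item-based collaborative filtering, where both sums run only over neighbors $k$ that user $i$ has rated
* Prediction is $\tilde r_{ij} = \frac{\sum_{k \in N(i)} w_{ik}r_{kj}}{\sum_{k \in N(i)} |w_{ik}|}$ for user-based collaborative filtering, where both sums run only over neighbors $k$ that have rated item $j$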
* $r_{ij}$ is the rating for user $i$ and item $j$
* $w_{kj}$ is the cosine similarity between items $j$ and $k$
* $N(j)$ is the set of the $K$ items $k$ with the largest $|w_{kj}|$

In [1]:
# NOTE(review): presumably `name` labels this model and `residual_alphas` lists the
# models whose residuals it is fit on. Neither is read by the cells visible in this
# notebook, so confirm how they are consumed in Alpha.ipynb.
name = "NeighborhoodCollaborativeFiltering";
residual_alphas = ["UserItemBiases"];

In [2]:
using LinearAlgebra
using Memoize
using SparseArrays
# TODO upstream imports

In [3]:
using NBInclude
@nbinclude("Alpha.ipynb");

## Determine the neighborhoods for each user and item

In [4]:
# Euclidean norm of every column of `R`, as a vector with one entry per column.
# Columns whose norm is zero are reported as 1 so callers can divide by the result.
# The function is wrapped in `@memoize`, so repeated calls reuse the cached vector.
@memoize function get_norms(R)
    column_norms = map(norm, eachcol(R))
    for idx in eachindex(column_norms)
        if iszero(column_norms[idx])
            column_norms[idx] = 1 # prevent division by 0
        end
    end
    column_norms
end;

In [5]:
"""
    get_neighborhood(item, K, R)

Return `(neighbors, weights)` for column `item` of the ratings matrix `R`.

`weights` are the cosine similarities between column `item` and every column of `R`
(dot product divided by both column norms from `get_norms`). `neighbors` holds the
indices of the columns with the largest absolute similarity, in decreasing order of
`|weight|`, and the returned `weights` are the similarities of exactly those columns.

At most `K` neighbors are returned; if `R` has fewer than `K` columns, all of them are
returned instead of raising a `BoundsError`. Column `item` itself is not excluded.
"""
function get_neighborhood(item, K, R)
    norms = get_norms(R)
    weights = vec(R[:, item]' * R) ./ norms ./ norms[item]
    # Clamp the neighborhood size: asking for more neighbors than there are columns
    # previously indexed past the end of the sort permutation.
    k = min(K, length(weights))
    order = sortperm(abs.(weights), rev = true)[1:k]
    order, weights[order]
end;

In [6]:
"""
    make_prediction(item, users, K, R)

Predict the rating of `item` for each entry of `users`, returning a `Float64` vector
aligned with `users`.

For every user the prediction is the similarity-weighted sum of that user's ratings on
the neighbors of `item` (from `get_neighborhood`), divided by the sum of the absolute
similarities of the neighbors that contributed. Neighbors with a stored rating of 0 are
treated as unrated and skipped. If no neighbor contributes, the prediction stays 0.
"""
function make_prediction(item, users, K, R)
    neighbors, sims = get_neighborhood(item, K, R)
    predictions = zeros(length(users))
    for (slot, user) in enumerate(users)
        weighted_total = 0.0
        abs_weight_total = 0.0
        for (neighbor, sim) in zip(neighbors, sims)
            rating = R[user, neighbor]
            # TODO impute? (unrated neighbors are currently ignored)
            rating == 0 && continue
            weighted_total += sim * rating
            abs_weight_total += abs(sim)
        end
        # Normalize only when something contributed, to avoid dividing by zero.
        if abs_weight_total != 0
            weighted_total /= abs_weight_total
        end
        predictions[slot] = weighted_total
    end
    predictions
end;

In [7]:
"""
    collaborative_filtering(training, validation, K)

Fit neighborhood collaborative filtering on `training` and predict every row of
`validation`, using `K` neighbors per item.

Both datasets are accessed through their `user`, `item` and `rating` fields. The
training triples are assembled into a sparse user-by-item matrix, and each distinct
validation item is predicted with `make_prediction` for all validation rows that
reference it.

Returns an `N x 1` `Float64` matrix whose row `n` is the prediction for validation
row `n`, where `N = length(validation.rating)`.

The matrix is sized to cover the ids of both datasets, so a validation user or item
whose id exceeds every training id gets a prediction of 0 rather than a `BoundsError`.
"""
function collaborative_filtering(training, validation, K)
    # `reduce(...; init)` also copes with an empty validation set.
    R = sparse(
        training.user,
        training.item,
        training.rating,
        reduce(max, validation.user; init = maximum(training.user)),
        reduce(max, validation.item; init = maximum(training.item)),
    )

    # Populate the memoized norm cache once, before any threads run, so the threaded
    # loop below only reads it. NOTE(review): assumes the `@memoize` cache is not safe
    # for concurrent insertion — confirm against the Memoize version in use.
    get_norms(R)

    # Group validation rows by item in one pass instead of rescanning per item.
    rows_by_item = Dict{eltype(validation.item),Vector{Int}}()
    for (row, item) in pairs(validation.item)
        push!(get!(() -> Int[], rows_by_item, item), row)
    end

    # Every item owns a disjoint set of rows, so threads can write straight into one
    # shared vector; no `threadid()`-indexed scratch columns are needed.
    preds = zeros(length(validation.rating))
    @tprogress Threads.@threads for item in collect(keys(rows_by_item))
        rows = rows_by_item[item]
        preds[rows] = make_prediction(item, validation.user[rows], K, R)
    end

    # Keep the historical N x 1 matrix shape for callers.
    reshape(preds, :, 1)
end;

## Item based CF

In [8]:
# Baseline: score an all-zero prediction vector so the collaborative filtering results
# below have a reference point.
# NOTE(review): `validation` and `mse` are not defined in this notebook; presumably they
# come from Alpha.ipynb — confirm.
baseline_preds = zeros(length(validation.rating))
mse(validation.rating, baseline_preds)

1.6981920453148056

In [9]:
# Item-based collaborative filtering with K = 200 neighbors per item, scored against
# the validation ratings.
item_preds = collaborative_filtering(training, validation, 200)
mse(validation.rating, item_preds)

[32mProgress: 100%|███████████████████████████| Time: 0:28:33 ( 0.11  s/it)[39m


1.3567526929280826

## User based CF

In [10]:
# Build copies of both datasets with the user and item columns swapped, so that the
# item-based routine above computes user-based collaborative filtering on them.
# NOTE(review): assumes RatingsDataset takes (user, item, rating) positionally — confirm
# against its definition.
training_t = RatingsDataset(training.item, training.user, training.rating)
validation_t = RatingsDataset(validation.item, validation.user, validation.rating);

In [None]:
# User-based collaborative filtering with K = 200 neighbors per user, run on the
# swapped datasets. validation_t is built from the columns of validation, so the
# predictions are scored directly against validation.rating.
user_preds = collaborative_filtering(training_t, validation_t, 200)
mse(validation.rating, user_preds)

[32mProgress:   0%|▏                          |  ETA: 9 days, 6:10:23 ( 1.91  s/it)[39mm

In [None]:
# TODO param optimization
# TODO run both user centric and item centric
# TODO productionalize