# Helper functions that are useful for generating alphas

In [1]:
import Flux: softmax
import JLD2
import LRUCache: LRU
import Memoize: @memoize
import NBInclude: @nbinclude
import Optim
import ProgressMeter: @showprogress

## General utilities

In [2]:
@nbinclude("AlphaUtils.ipynb");

## Logging

In [3]:
# Use "Alpha" as the alpha's name unless the including notebook already set `name`,
# then pass that alpha's data directory to redirect_logging (not defined in this
# file — presumably provided by AlphaUtils; verify).
@isdefined(name) || (name = "Alpha")
redirect_logging("../../data/alphas/$name");

## Structs for handling ratings

In [4]:
# A set of (user, item, rating) triples stored as three parallel vectors:
# user[i] has seen item[i] and given it a score of rating[i]
@with_kw struct RatingsDataset
    # user id of each triple
    user::Vector{Int32}
    # item id of each triple
    item::Vector{Int32}
    # score attached to the (user[i], item[i]) pair
    rating::Vector{Float32}
end

# Transpose a dataset: the user and item columns trade places while the ratings
# stay aligned, so `x'` holds (item, user, rating) triples.
Base.adjoint(x::RatingsDataset) = RatingsDataset(x.item, x.user, x.rating)

# Same layout as RatingsDataset, but with 64-bit user/item ids.
# TODO maybe we can delete this with a newer julia version
# Some sparse matrix operations require indices to be Int64
@with_kw struct RatingsDataset64
    # user id of each triple, widened to Int64
    user::Vector{Int64}
    # item id of each triple, widened to Int64
    item::Vector{Int64}
    # score attached to the (user[i], item[i]) pair
    rating::Vector{Float32}
end

# Widen a RatingsDataset to 64-bit ids. Every column is produced by a broadcast,
# so the result holds freshly allocated vectors rather than aliases of `x`'s.
function RatingsDataset64(x::RatingsDataset)
    users = Int64.(x.user)
    items = Int64.(x.item)
    ratings = Float32.(x.rating)
    RatingsDataset64(users, items, ratings)
end

# Concatenate two datasets: the triples of `y` are appended after those of `x`.
function Base.cat(x::RatingsDataset, y::RatingsDataset)
    users = vcat(x.user, y.user)
    items = vcat(x.item, y.item)
    ratings = vcat(x.rating, y.rating)
    RatingsDataset(users, items, ratings)
end;

In [5]:
# we sanitize the splits such that no user/item in the validation
# or test splits has a higher id than the training set does, so the
# largest id found in the training splits is used as the count.

# Largest user id across the explicit and the implicit training split (memoized).
@memoize function num_users()
    maximum(maximum(get_split("training", implicit).user) for implicit in (false, true))
end

# Largest item id across the explicit and the implicit training split (memoized).
@memoize function num_items()
    maximum(maximum(get_split("training", implicit).item) for implicit in (false, true))
end;

## Reading and writing data

In [6]:
# Load one split from ../../data/splits/splits.jld2.
# A split is a collection of (user, item, rating) triples stored as a RatingsDataset.
# Results are memoized in an LRU cache that holds at most 2 datasets.
#
# @param split one of:
#     training: used to train an alpha's parameters
#     validation: used to tune an alpha's hyperparameters
#     test: used to measure out of sample performance
#     negative: loaded under its own key; `implicit` is ignored for it
#
# @param implicit when true, the explicit and implicit parts of the split are
#     concatenated and every rating is overwritten with 1. when false, only the
#     explicit part is returned with its original ratings
# @param transpose return an (item, user, rating) dataset instead of a (user, item, rating) dataset
@memoize LRU{Tuple{String,Bool,Bool},RatingsDataset}(maxsize = 2) function get_split(
    split,
    implicit;
    transpose = false,
)
    @assert split in ["training", "validation", "test", "negative"]

    file = "../../data/splits/splits.jld2"
    df = if split == "negative"
        JLD2.load(file, split)
    elseif implicit
        merged = cat(JLD2.load(file, "explicit_" * split), JLD2.load(file, "implicit_" * split))
        # every entry of an implicit split counts as "seen"
        fill!(merged.rating, 1)
        merged
    else
        JLD2.load(file, "explicit_" * split)
    end

    if transpose
        df'
    else
        df
    end
end;

In [7]:
# an alpha is a model that is used to predict whether a user will like an item.
# it's often useful to know an alpha model's value for a given (user, item) pair.
# alphas can be expensive to compute, so we precompute the model's values on
# (user-item) pairs and store the resultant RatingsDatasets to disk.
# storing the model values for all (user, item) pairs would be prohibitively
# large, so we only store values for our splits

# Evaluate `model` on every split and save the predictions to
# ../../data/alphas/<outdir>/predictions.jld2, one RatingsDataset per split name.
#
# @param model callable invoked as model(users, items); its result becomes the
#     rating column of the stored dataset
# @param alphas previously written alphas that the new predictions are regressed
#     alongside when computing the logged losses
# @param implicit selects the implicit or explicit version of each split
# @param outdir directory name under ../../data/alphas (defaults to the global `name`)
function write_alpha(model, alphas, implicit; outdir = name)
    unlogged_splits = ["test", "negative"]
    predictions = Dict()
    β = nothing

    # "validation" must be processed first: β is fitted on its predictions
    for split in ["validation", "training", "test", "negative"]
        df = get_split(split, implicit)
        x = model(df.user, df.item)
        if β === nothing
            @assert split == "validation"
            _, β = regress(alphas, implicit, x)
        end
        predictions[split] = RatingsDataset(df.user, df.item, x)
        # losses are only reported for the splits that are not held out
        if !(split in unlogged_splits)
            @info "$(split) loss: $(residualized_loss(alphas, implicit, x, β, split)), β: $β"
        end
    end

    target = mkpath("../../data/alphas/$outdir")
    JLD2.save("$target/predictions.jld2", predictions)
end

# Load the stored predictions of a single alpha for the given split from
# ../../data/alphas/<alpha>/predictions.jld2.
function read_alpha(alpha, split)
    @assert split in ["training", "validation", "test"]
    JLD2.load("../../data/alphas/$(alpha)/predictions.jld2", split)
end;

In [8]:
# params consist of two things:
# 1) the hyperparameters that are used to train an alpha
# 2) the trained parameters of an alpha 
# in general, params should contain all information necessary to
# efficiently train the alpha model for a new user

# Save `params` to ../../data/alphas/<outdir>/params.jld2, creating the
# directory if needed. `outdir` defaults to the global `name`.
function write_params(params; outdir = name)
    target = mkpath("../../data/alphas/$outdir")
    JLD2.save("$target/params.jld2", params)
end

# Load everything stored in ../../data/alphas/<alpha>/params.jld2.
function read_params(alpha)
    path = "../../data/alphas/$alpha/params.jld2"
    JLD2.load(path)
end;

## Weight decays

In [9]:
# the number of items a user has seen spans orders of magnitude.
# if we place an equal weight on each (user, item) pair, then highly
# active users will skew the loss function. It is generally best practice
# to weight each user equally (see Deep Neural Networks for YouTube Recommendations 
# https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45530.pdf).
# we achieve this by weighting our validation and test loss functions such that each
# (user, item) pair has a weight of 1 / |number of items the user has seen|.
# 
# during training, the same skew issue appears. instead of weighting each (user, item)
# pair by 1 / |number of items the user has seen|, we take the more general approach
# of weighting each (user, item) pair by 
# |number of items the user has seen| ^ w_u * |number of users that have seen the item| ^ w_a
# when w_u=-1 and w_a=0, then we recover the equal-user weighting that we used for
# validation and test sets.

# Signed power sign(x) * |x|^a. An input of exactly 0 maps to a zero of `a`'s
# type, which keeps negative exponents from producing Inf.
function expdecay(x, a)
    if x == 0
        return zero(eltype(a))
    end
    sign(x) * abs(x)^a
end

# Element-wise version for vectors, computed with a threaded loop. The output
# element type is taken from `a`, not from `x`.
function expdecay(x::Vector, a)
    out = Vector{eltype(a)}(undef, length(x))
    Threads.@threads for k in eachindex(x)
        @inbounds out[k] = expdecay(x[k], a)
    end
    out
end

# Translate a weighting scheme name into the exponent applied to the counts:
# "linear" -> 1, "constant" -> 0, "inverse" -> -1. Any other name trips the assertion.
function weighting_scheme(scheme::String)
    scheme == "linear" && return 1.0f0
    scheme == "constant" && return 0.0f0
    scheme == "inverse" && return -1.0f0
    @assert false
    return 0.0f0
end;

# Count how many entries of `split` belong to each user id.
# Returns a vector of length maximum(split.user) whose u-th entry is the number
# of triples with user == u, using the element type of `split.rating`.
function get_user_counts(split::RatingsDataset)
    # One accumulator column per possible thread id. Threads.threadid() is not
    # guaranteed to be <= Threads.nthreads() (e.g. when interactive threads exist
    # on Julia >= 1.9), and the @inbounds write below would then land outside the
    # buffer. maxthreadid() is used when this Julia version provides it.
    nbuffers =
        isdefined(Threads, :maxthreadid) ? Threads.maxthreadid() : Threads.nthreads()
    counts = zeros(eltype(split.rating), maximum(split.user), nbuffers)
    # @tprogress is defined outside this file — presumably progress reporting
    @tprogress Threads.@threads for i = 1:length(split.rating)
        @inbounds counts[split.user[i], Threads.threadid()] += 1
    end
    # collapse the per-thread columns into one count per user
    vec(sum(counts, dims = 2))
end

# Memoized counts for a named split.
#
# @param split name of the split, forwarded to get_split
# @param implicit forwarded to get_split
# @param per_rating when true, return one value per triple (the count of that
#     triple's user); when false, return one value per user id
# @param by_item count over the transposed dataset, i.e. per item instead of per user
@memoize function get_counts(split, implicit; per_rating = true, by_item = false)
    df = get_split(split, implicit; transpose = by_item)
    per_user = get_user_counts(df)

    if !per_rating
        return per_user
    end

    # broadcast each user's count back onto every triple of that user
    expanded = Vector{eltype(per_user)}(undef, length(df.user))
    Threads.@threads for k in eachindex(expanded)
        @inbounds expanded[k] = per_user[df.user[k]]
    end
    expanded
end

# Per-triple weights for a split: the per-triple user count raised to the
# exponent that `scheme` names (see weighting_scheme).
function get_weights(split, implicit, scheme::String)
    counts = get_counts(split, implicit)
    exponent = weighting_scheme(scheme)
    expdecay(counts, exponent)
end;

## Loss functions

In [10]:
# most alphas can be classified as one of two types:
# 1) explicit alphas predict what rating a user will give to
#    a given show conditional on having watched the show. these 
#    alphas are trained using mean squared error
# 2) implicit alphas predict whether a user will watch
#    a given show. these alphas are trained using
#    cross-entropy loss
# both loss functions are weighted according to the weight decay
# logic described above

# Weighted mean of an element-wise loss: sum(lossfn(x, y) .* w) / sum(w).
# `lossfn(x, y)` must return one loss value per element.
function weighted_loss(x, y, w, lossfn)
    weighted_total = sum(lossfn(x, y) .* w)
    total_weight = sum(w)
    weighted_total / total_weight
end

# Threaded equivalent of weighted_loss: sum(lossfn(x, y) .* w) / sum(w).
# The input is cut into Threads.nthreads() contiguous chunks; each chunk's
# weighted loss sum and weight sum are computed in parallel and then combined.
#
# Chunks and result slots are addressed by the loop index `t`, never by
# Threads.threadid(): several iterations may run on the same thread, and a
# thread id may exceed Threads.nthreads(), so indexing by thread id can leave
# slots unwritten or fall outside the buffers.
#
# Assumes x, y and w are 1-based and of equal length.
function weighted_loss_multithreaded(x, y, w, lossfn)
    n = length(x)
    nchunks = Threads.nthreads()
    # zero-initialised so that an empty chunk contributes nothing
    a = zeros(eltype(x), nchunks)
    b = zeros(eltype(w), nchunks)
    Threads.@threads for t = 1:nchunks
        # chunk t covers an (almost) equal share of 1:n; the chunks tile 1:n exactly
        range = (div((t - 1) * n, nchunks)+1):div(t * n, nchunks)
        # Base.sum uses pairwise summation which is important for accuracy
        @views a[t] = sum(lossfn(x[range], y[range]) .* w[range])
        @views b[t] = sum(w[range])
    end
    sum(a) / sum(b)
end

# Weighted error of predictions `x` against labels `y`: cross-entropy terms
# (-y * log(x)) when `implicit`, squared error otherwise.
# NOTE(review): `weighted_error` is not defined in the visible part of this file;
# confirm it exists elsewhere (weighted_loss takes the same four arguments).
function error(x, y, w, implicit)
    if implicit
        lossfn = (p, t) -> -t .* log.(p)
    else
        lossfn = (p, t) -> (p - t) .^ 2
    end
    weighted_error(x, y, w, lossfn)
end

# Weighted loss of predictions `x` against labels `y` with weights `w`.
#
# @param implicit cross-entropy terms (-y * log(x)) when true, squared error otherwise
# @param normalize when true the weighted sum is divided by the total weight
#     (weighted_loss / weighted_loss_multithreaded); when false the call goes to
#     weighted_unnormalized_loss, which is not defined in the visible part of
#     this file — presumably provided elsewhere; verify
# @param multithreaded use the threaded evaluator (only consulted when normalizing)
function loss(x, y, w, implicit; normalize = true, multithreaded = false)
    lossfn = implicit ? ((p, t) -> -t .* log.(p)) : ((p, t) -> (p - t) .^ 2)
    # the evaluator name is only looked up on the branch that is actually taken
    evaluator = if !normalize
        weighted_unnormalized_loss
    elseif multithreaded
        weighted_loss_multithreaded
    else
        weighted_loss
    end
    evaluator(x, y, w, lossfn)
end;

## Regressions

In [11]:
# given a matrix of features X, a vector of true labels y, and
# a vector of weights w, a regression will find the β
# that minimizes the weighted loss between X * β and y. for explicit
# alphas, the loss is mean squared error and there is a closed
# form solution. for implicit alphas, the loss is cross-entropy
# and we solve for β numerically.

# Fit β so that X * β approximates y under weights w.
# Returns the tuple (X * β, β).
#
# implicit: β is softmax(θ), where θ is found numerically by minimizing the
#     weighted cross-entropy of X * softmax(θ) with Optim's NewtonTrustRegion
#     and forward-mode autodiff, starting from all zeros.
# explicit: weighted least squares, solved in closed form through the normal
#     equations after scaling the rows of X and y by sqrt(w).
function regress(X, y, w, implicit)
    if implicit
        objective = θ -> loss(X * softmax(θ), y, w, implicit; multithreaded = true)
        start = zeros(Float32, size(X, 2))
        result = Optim.optimize(
            objective,
            start,
            Optim.NewtonTrustRegion(),
            autodiff = :forward,
        )
        β = softmax(Optim.minimizer(result))
    else
        root_w = sqrt.(w)
        Xw = X .* root_w
        yw = y .* root_w
        β = (Xw' * Xw) \ (Xw' * yw)
    end
    X * β, β
end

# Regress the stored alphas (plus `x`, when given) against the validation
# ratings, using inverse-count weights. Returns (X * β, β).
function regress(alphas, implicit, x = nothing)
    features = regression_features(alphas, "validation", implicit, x)
    targets = get_split("validation", implicit).rating
    weights = get_weights("validation", implicit, "inverse")
    regress(features, targets, weights, implicit)
end

# Build the feature matrix for a regression on `split`.
# Columns, in order: one per stored alpha, then a column of ones when
# `implicit` (a baseline feature for non-degeneracy), then `x` when it is given.
# The element type and row count come from `x`, or from the split's ratings
# when `x` is nothing.
function regression_features(alphas, split, implicit, x = nothing)
    nalphas = length(alphas)
    ncols = nalphas + (isnothing(x) ? 0 : 1) + implicit
    template = if isnothing(x)
        get_split(split, implicit).rating
    else
        x
    end
    X = Matrix{eltype(template)}(undef, length(template), ncols)
    @tprogress Threads.@threads for j = 1:length(alphas)
        @inbounds X[:, j] = read_alpha(alphas[j], split).rating
    end

    if implicit
        @views fill!(X[:, nalphas+1], 1.0f0)
    end
    if !isnothing(x)
        X[:, end] = x
    end
    X
end

# Linearly combine the given alphas: the coefficients are fitted on the
# validation set and then applied to the alphas' values on `split`.
function read_alpha(alphas::Vector, split::String, implicit)
    df = get_split(split, implicit)
    β = last(regress(alphas, implicit))
    blended = regression_features(alphas, split, implicit) * β
    RatingsDataset(df.user, df.item, blended)
end

# Fit a regression of the alphas plus `x` on the validation set, then report
# the inverse-count-weighted validation loss of the fitted combination.
function residualized_loss(alphas, implicit, x)
    fitted, _ = regress(alphas, implicit, x)
    targets = get_split("validation", implicit).rating
    weights = get_weights("validation", implicit, "inverse")
    loss(fitted, targets, weights, implicit; multithreaded = true)
end

# Apply already-fitted coefficients β to the alphas plus `x` on `split` and
# report the inverse-count-weighted loss of that combination.
function residualized_loss(alphas, implicit, x, β, split)
    combined = regression_features(alphas, split, implicit, x) * β
    targets = get_split(split, implicit).rating
    weights = get_weights(split, implicit, "inverse")
    loss(combined, targets, weights, implicit; multithreaded = true)
end;