# Helper functions that are useful for generating alphas

In [None]:
import Flux: softmax
import JLD2
import Memoize: @memoize
import NBInclude: @nbinclude
import Optim

## General utilities

In [None]:
@nbinclude("AlphaUtils.ipynb");

## Logging

In [None]:
# fall back to a default alpha name when nothing has defined `name` yet,
# then send the log output to that alpha's data directory
@isdefined(name) || (name = "Alpha")
redirect_logging("../../data/alphas/$name");

## Structs for handling ratings

In [None]:
# A set of (user, item, rating) triples stored as three parallel vectors:
# user[i] has seen item[i] and given it a score of rating[i]
@with_kw struct RatingsDataset
    user::Vector{Int32}      # user id of each triple
    item::Vector{Int32}      # item id of each triple
    rating::Vector{Float32}  # score attached to each (user, item) pair
end

# x' exchanges the roles of the user and item vectors; ratings are left as is.
# The returned dataset shares the underlying vectors with x (no copies are made).
Base.adjoint(x::RatingsDataset) = RatingsDataset(x.item, x.user, x.rating)

# Some sparse matrix operations require indices to be Int64
# Same three parallel vectors as RatingsDataset, but with Int64 user and item ids.
@with_kw struct RatingsDataset64
    user::Vector{Int64}      # user id of each triple
    item::Vector{Int64}      # item id of each triple
    rating::Vector{Float32}  # score attached to each (user, item) pair
end

# Widen a RatingsDataset to 64-bit ids. All three vectors are freshly allocated,
# so the result does not alias the input.
function RatingsDataset64(x::RatingsDataset)
    users = Int64.(x.user)
    items = Int64.(x.item)
    ratings = Float32.(x.rating)
    return RatingsDataset64(users, items, ratings)
end

# Concatenate two datasets: the triples of x come first, followed by those of y.
function Base.cat(x::RatingsDataset, y::RatingsDataset)
    users = vcat(x.user, y.user)
    items = vcat(x.item, y.item)
    ratings = vcat(x.rating, y.rating)
    return RatingsDataset(users, items, ratings)
end;

In [None]:
# we sanitize the splits such that no user/item in the validation
# or test splits has a higher id than the training set does

# Largest user id that occurs in the training split (memoized).
@memoize function num_users()
    training = get_split("training")
    return maximum(training.user)
end

# Largest item id that occurs in the training split (memoized).
@memoize function num_items()
    training = get_split("training")
    return maximum(training.item)
end;

## Reading and writing data

In [None]:
# a split is a collection of (user, item, rating) triples that are stored as a RatingsDataset.
# splits are loaded by name from ../../data/splits/splits.jld2
#
# @param split the following splits are supported:
#     training: used to train an alpha's parameters
#     validation: used to tune an alpha's hyperparameters
#     test: used to measure out of sample performance
#     implicit: contains all items a user has seen, including shows which they have not rated.
#     implicit_training: contains all items in the implicit split that are not in the
#         validation or test splits
#
# @param implicit replace the explicit ratings with an implicit rating.
#     the implicit rating is 1 if they watched the series and 0 if they have not;
#     every rating stored in the loaded split is overwritten with 1.
#     the implicit and implicit_training splits may only be requested with implicit = true
# @param transpose return an (item, user, rating) dataset instead of a (user, item, rating) dataset
function get_split(split; implicit = false, transpose = false)
    # TODO negative test split
    @assert split in ["training", "validation", "test", "implicit", "implicit_training"]
    if split in ["implicit", "implicit_training"]
        @assert implicit
    end
    dataset = JLD2.load("../../data/splits/splits.jld2", split)
    implicit && fill!(dataset.rating, 1)
    if transpose
        return dataset'
    end
    return dataset
end;

In [None]:
# an alpha is a model that is used to predict whether a user will like an item.
# it's often useful to know an alpha model's value for a given (user, item) pair.
# alphas can be expensive to compute, so we precompute the model's values on
# (user-item) pairs and store the resultant RatingsDatasets to disk.
# storing the model values for all (user, item) pairs would be prohibitively
# large, so we only store values for our splits

# Evaluates `model` on the validation, training and test splits and saves the
# predictions, keyed by split name, to ../../data/alphas/$outdir/predictions.jld2
#
# @param model callable invoked as model(users, items); its result is stored as
#     the rating vector of the saved RatingsDataset
# @param alphas names of previously written alphas that are used as additional
#     regression features when reporting losses
# @param implicit whether the splits are loaded with implicit ratings
# @param outdir directory name under ../../data/alphas to write into
function write_alpha(model, alphas, implicit; outdir = name)
    splits_to_not_log = ["test"]

    # regression coefficients, fitted once on the first (validation) split
    β = nothing
    predictions = Dict()
    for split in ["validation", "training", "test"]
        data = get_split(split; implicit = implicit)
        scores = model(data.user, data.item)
        if β === nothing
            # we need to evaluate the validation set to get the regression coefficients
            @assert split == "validation"
            β = last(regress(scores, alphas, implicit))
        end
        predictions[split] = RatingsDataset(data.user, data.item, scores)
        if !(split in splits_to_not_log)
            @info "$(split) loss: $(residualized_loss(scores, alphas, implicit, β, split))"
        end
    end

    dir = mkpath("../../data/alphas/$outdir")
    JLD2.save("$dir/predictions.jld2", predictions)
end

# Loads the predictions that write_alpha stored for `alpha` on the given split.
# Only the training, validation and test splits are accepted.
function read_alpha(alpha, split)
    @assert split in ["training", "validation", "test"]
    return JLD2.load("../../data/alphas/$(alpha)/predictions.jld2", split)
end;

In [None]:
# params consist of two things:
# 1) the hyperparameters that are used to train an alpha
# 2) the trained parameters of an alpha 
# in general, params should contain all information necessary to
# efficiently train the alpha model for a new user

# Saves `params` to ../../data/alphas/$outdir/params.jld2, creating the
# directory first if it does not exist yet.
function write_params(params; outdir = name)
    dir = mkpath("../../data/alphas/$outdir")
    JLD2.save("$dir/params.jld2", params)
end

# Loads everything stored in the params file of the given alpha.
function read_params(alpha)
    file = "../../data/alphas/$alpha/params.jld2"
    return JLD2.load(file)
end;

## Weight decays

In [None]:
# the number of items a user has seen spans orders of magnitude.
# if we place an equal weight on each (user, item) pair, then highly
# active users will skew the loss function. It is generally best practice
# to weight each user equally (see Deep Neural Networks for YouTube Recommendations 
# https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45530.pdf).
# we achieve this by weighting our validation and test loss functions such that each
# (user, item) pair has a weight of 1 / |number of items the user has seen|.
# 
# during training, the same skew issue appears. instead of weighting each (user, item)
# pair by 1 / |number of items the user has seen|, we take the more general approach
# of weighting each (user, item) pair by 
# |number of items the user has seen| ^ w_u * |number of users that have seen the item| ^ w_a
# when w_u=-1 and w_a=0, then we recover the equal-user weighting that we used for
# validation and test sets.

# Signed power sign(x) * |x|^a. Zero is mapped to a zero of a's element type
# instead of being raised to the power a.
function expdecay(x, a)
    if x == 0
        return zero(eltype(a))
    end
    return sign(x) * abs(x)^a
end

# Elementwise expdecay over a vector, computed with a threaded loop.
# The result has the element type of a.
function expdecay(x::Vector, a)
    out = Vector{eltype(a)}(undef, length(x))
    Threads.@threads for idx in eachindex(x)
        @inbounds out[idx] = expdecay(x[idx], a)
    end
    return out
end

# Maps the name of a weighting scheme to the exponent that is applied to counts:
#     "linear" => 1, "constant" => 0, "inverse" => -1
# Any other name trips the assertion below.
function weighting_scheme(scheme::String)
    scheme == "linear" && return 1f0
    scheme == "constant" && return 0f0
    scheme == "inverse" && return -1f0
    @assert false
    return 0f0
end;

# Counts how many entries of `split` belong to each user id.
# Returns a vector of length maximum(split.user) whose u-th entry is the number
# of indices i with split.user[i] == u. The counts are stored with the element
# type of split.rating rather than as integers.
function get_user_counts(split::RatingsDataset)
    # one column of partial counts per thread; presumably so that concurrently
    # running iterations never increment the same cell
    counts = zeros(eltype(split.rating), maximum(split.user), Threads.nthreads())
    @tprogress Threads.@threads for i = 1:length(split.rating)
        # NOTE(review): this relies on Threads.threadid() always lying in
        # 1:Threads.nthreads(); bounds checks are disabled here, so confirm that
        # this holds for the Julia version and thread pools in use
        @inbounds counts[split.user[i], Threads.threadid()] += 1
    end
    # add up the per-thread columns to get a single count per user
    vec(sum(counts, dims = 2))
end

# Number of entries each user has in the named split (memoized).
#
# @param split name of the split to load via get_split
# @param per_rating when true, return one value per (user, item) entry, namely
#     the count of that entry's user; when false, return one value per user id
# @param by_item load the split transposed, so that items are counted instead of users
@memoize function get_counts(split; per_rating = true, by_item = false)
    dataset = get_split(split; transpose = by_item)
    totals = get_user_counts(dataset)
    per_rating || return totals

    # look up the total of the user that owns each entry
    expanded = similar(totals, length(dataset.user))
    Threads.@threads for idx in eachindex(expanded)
        @inbounds expanded[idx] = totals[dataset.user[idx]]
    end
    return expanded
end

# Per-entry weights for the named split: the per-entry user counts raised to the
# exponent selected by `scheme` (see weighting_scheme).
function get_weights(split, scheme::String)
    counts = get_counts(split)
    exponent = weighting_scheme(scheme)
    return expdecay(counts, exponent)
end;

## Loss functions

In [None]:
# most alphas can be classified as one of two types:
# 1) explicit alphas predict what rating a user will give to
#    a given show conditional on having watched the show. these 
#    alphas are trained using mean squared error
# 2) implicit alphas predict whether a user will watch a
#    a given show. these alphas are trained using
#    cross-entropy loss
# both loss functions are weighted according to the weight decay
# logic described above

# Weighted mean of an elementwise loss:
#     sum(lossfn(x, y) .* w) / sum(w)
#
# @param x vector of model predictions
# @param y vector of true labels
# @param w vector of per-entry weights
# @param lossfn function that maps (predictions, labels) to a vector of per-entry losses
# @return the weighted mean loss; the division is 0 / 0 when x is empty
#
# The entries are split into one contiguous chunk per thread. Both the chunk and
# the accumulator slot are derived from the loop variable and not from
# Threads.threadid(): the scheduler does not promise that every thread runs
# exactly one iteration, and with threadid()-based indexing a slot of the
# uninitialized accumulators could be left unwritten or a chunk processed twice.
function weighted_loss(x, y, w, lossfn)
    n = length(x)
    nchunks = Threads.nthreads()
    a = Array{eltype(x)}(undef, nchunks)
    b = Array{eltype(w)}(undef, nchunks)
    Threads.@threads for t = 1:nchunks
        # chunk t; the chunks are disjoint and together cover 1:n
        range = (div((t - 1) * n, nchunks) + 1):div(t * n, nchunks)
        # Base.sum uses pairwise summation which is important for accuracy
        @views a[t] = sum(lossfn(x[range], y[range]) .* w[range])
        @views b[t] = sum(w[range])
    end
    sum(a) / sum(b)
end

# Weighted loss of predictions x against labels y with weights w:
# cross-entropy (-y * log(x)) for implicit alphas, squared error otherwise.
function loss(x, y, w, implicit)
    if implicit
        return weighted_loss(x, y, w, (x, y) -> -y .* log.(x))
    end
    return weighted_loss(x, y, w, (x, y) -> (x - y) .^ 2)
end;

# function weighted_crossentropy_loss(x, y, w)
#     weighted_loss(x, y, w, (x, y) -> -y .* log.(x))
# end

# function weighted_mean_squared_error(x, y, w)
#     weighted_loss(x, y, w, (x, y) -> (x - y) .^ 2)
# end

# loss(x, y, w, implicit) =
#     implicit ? weighted_crossentropy_loss(x, y, w) : weighted_mean_squared_error(x, y, w);

## Regressions

In [None]:
# given a matrix of features X, a vector of true labels y, and
# a vector of weights w, a regression will find the β
# that minimizes the weighted loss between X * β and y. for explicit
# alphas, the loss is mean squared error and there is a closed
# form solution. for implicit alphas, the loss is cross-entropy
# and we solve for β numerically.

# Fits coefficients β such that X * β approximates y under the weights w.
#
# implicit: β is the softmax of the minimizer of the weighted cross-entropy loss,
#     found numerically with Optim starting from an all-zero parameter vector
# explicit: β is the weighted least squares solution, obtained by scaling the
#     rows of X and y with sqrt.(w) and solving the resulting system
#
# @return the tuple (X * β, β)
function regress(X, y, w, implicit)
    if implicit
        objective = θ -> loss(X * softmax(θ), y, w, implicit)
        start = fill(0.0f0, size(X, 2))
        result = Optim.optimize(
            objective,
            start,
            Optim.NewtonTrustRegion();
            autodiff = :forward,
        )
        β = softmax(Optim.minimizer(result))
    else
        root_w = sqrt.(w)
        β = (X .* root_w) \ (y .* root_w)
    end
    @info "regression coefficients: $β"
    return X * β, β
end

# Regression on the validation split: the features are x together with the
# stored predictions of `alphas`, the labels are the validation ratings and
# the weights use the "inverse" scheme.
function regress(x, alphas, implicit)
    features = regression_features(x, alphas, "validation")
    targets = get_split("validation"; implicit = implicit).rating
    weights = get_weights("validation", "inverse")
    return regress(features, targets, weights, implicit)
end

# concatenates x with the given alphas
# Builds a length(x) × (length(alphas) + 1) matrix with the element type of x:
# column j holds the stored `rating` vector of alphas[j] on `split` (loaded via
# read_alpha) and the last column is x itself.
function regression_features(x, alphas, split)
    X = Array{eltype(x),2}(undef, length(x), length(alphas) + 1)
    @tprogress Threads.@threads for j = 1:length(alphas)
        # NOTE(review): assumes every stored alpha has exactly length(x) predictions,
        # in the same row order as x — confirm against write_alpha
        @inbounds X[:, j] = read_alpha(alphas[j], split).rating
    end
    # x always goes into the final column
    X[:, end] = x
    X
end


# Fits a regression of x and the given alphas on the validation split and
# returns the validation loss of the fitted linear combination.
function residualized_loss(x, alphas, implicit)
    fitted, _ = regress(x, alphas, implicit)
    targets = get_split("validation"; implicit = implicit).rating
    weights = get_weights("validation", "inverse")
    return loss(fitted, targets, weights, implicit)
end

# Loss on `split` of the linear combination of x and the given alphas with
# already fitted coefficients β (no regression is performed here).
function residualized_loss(x, alphas, implicit, β, split)
    combined = regression_features(x, alphas, split) * β
    targets = get_split(split; implicit = implicit).rating
    return loss(combined, targets, get_weights(split, "inverse"), implicit)
end

# linearly combinine the given alphas
function read_alpha(alphas::Vector, split::String, implicit)
    df = get_split(split)
    baseline = implicit ? 1.0f0 / num_items() : 0.0f0
    _, β = regress(fill(baseline, length(get_split("validation").rating)), alphas, implicit)
    X = regression_features(fill(baseline, length(df.rating)), alphas, split)
    RatingsDataset(df.user, df.item, X * β)
end;