# Ranking
* This is trained to learn the partial ordering implied by each user's watches
* Items that are watched are preferred to items that have not been watched
* If two items have been watched, then the impression metadata determines
  which one, if any, is liked more
* It uses the position-aware maximum likelihood estimation loss
* The inputs to this model are features generated by other models

In [1]:
import NBInclude: @nbinclude
@nbinclude("MLE.Base.ipynb");

## Define Subclass

In [2]:
# Container for everything the ranking model reads when building training
# samples and running inference. Instances are assembled by `get_features`.
@with_kw struct EnsembleFeatures <: Features
    # sparse per-user features; `get_user_embedding` reads column `u`
    user_features::SparseMatrixCSC{Float32,Int32}
    # dense per-query features; `get_query_embedding` reads column `q`
    query_features::Matrix{Float32}
    # statistics returned by `normalize` for the query features; exposed
    # through `get_inference_data`
    preprocessing_data::Dict

    # one column per query index; columns are compared pairwise in
    # `get_sample` to order a sampled list
    priorities::Matrix{Float16}

    # maps a query index to an item id (see `get_item_embedding`)
    index_to_item::Vector{Int32}
    # maps a user id to the query indexes that belong to that user
    user_to_indexes::Dict{Int32,Vector{Int32}}
end

"""
    get_inference_data(f::Features)

Return the preprocessing data stored on the feature container `f`.
"""
get_inference_data(f::Features) = f.preprocessing_data;

In [3]:
"""
    get_query_features(alphas, split, content)

Read the `rating` column of every alpha in `alphas` for the given
`split`/`content` and return the values as a dense `Float16` matrix with one
row per alpha and one column per row of the raw split.
"""
function get_query_features(alphas::Vector{String}, split::String, content::String)
    @info "getting $split $content alphas"
    num_rows = length(get_raw_split(split, content).user)
    ratings = Matrix{Float16}(undef, num_rows, length(alphas))
    # every iteration writes a distinct column, so the threads never overlap
    @tprogress Threads.@threads for col = 1:length(alphas)
        alpha_df = read_raw_alpha(alphas[col], split, content)
        ratings[:, col] = convert.(Float16, alpha_df.rating)
    end
    # materialize the transpose so alphas become rows
    return collect(ratings')
end;

"""
    normalize(x::AbstractArray; dims = 1)

Standardize `x` along `dims` to zero mean and unit (population, i.e.
uncorrected) standard deviation. The arithmetic is carried out in `Float32`
and the result is converted back to the element type of `x`.

Returns a tuple `(z, stats)`: `z` is the standardized array and `stats` is a
`Dict` holding the `Float32` statistics that were applied, under the keys
`"μ"` and `"σ"`.

Slices with zero spread (all values equal) would otherwise be divided by zero
and turn into `NaN`. Their `σ` is replaced by one, so they standardize to
zero. The replaced `σ` is the one stored in `stats`, so re-applying the
returned statistics reproduces `z`.
"""
function normalize(x::AbstractArray; dims = 1)
    T = eltype(x)
    x32 = convert.(Float32, x)
    μ = mean(x32, dims = dims)
    σ = std(x32, dims = dims, mean = μ, corrected = false)
    # guard constant slices: 0 / 0 would poison downstream features with NaN
    σ = map(s -> iszero(s) ? one(s) : s, σ)
    return convert.(T, (x32 .- μ) ./ σ), Dict("μ" => μ, "σ" => σ)
end

"""
    get_implicit_features()

Build a sparse `num_items() × num_users()` matrix from the implicit training
split, with items as rows, users as columns and ratings as values.
"""
function get_implicit_features()
    ratings = get_split("training", "implicit")
    rows, cols, vals = ratings.item, ratings.user, ratings.rating
    return sparse(rows, cols, vals, num_items(), num_users())
end

"""
    get_explicit_features()

Build a sparse `num_items() × num_users()` matrix from the explicit training
split, with items as rows, users as columns and ratings as values.
"""
function get_explicit_features()
    ratings = get_split("training", "explicit")
    rows, cols, vals = ratings.item, ratings.user, ratings.rating
    return sparse(rows, cols, vals, num_items(), num_users())
end

"""
    get_user_features()

Stack the implicit feature matrix on top of the explicit one, so each user
column holds the implicit rows followed by the explicit rows.
"""
function get_user_features()
    implicit = get_implicit_features()
    explicit = get_explicit_features()
    return vcat(implicit, explicit)
end;

In [4]:
"""
    get_features(alphas, allow_ptw_in_labels)

Assemble the `EnsembleFeatures` used by the ranking model from the "test"
split of every content type in `all_contents`. When `allow_ptw_in_labels` is
`false` the `"ptw"` content type is left out.

Query features are the alpha outputs, normalized along `dims = 2` and stored as
`Float32`. The normalization statistics are kept in `preprocessing_data`.
"""
function get_features(alphas::Vector{String}, allow_ptw_in_labels::Bool)
    splits = ["test"]
    contents = allow_ptw_in_labels ? all_contents : filter(!=("ptw"), all_contents)

    user_to_indexes = get_user_to_indexes(
        [(s, c) for s in splits for c in contents],
        (s, c) -> true,
    )

    # evaluate `f` on every (split, content) pair in order and join the results
    gather(f, agg) = reduce(agg, f(s, c) for s in splits for c in contents)

    user_features = get_user_features()
    raw_queries = gather((s, c) -> get_query_features(alphas, s, c), hcat)
    normalized_queries, preprocessing_data = normalize(raw_queries; dims = 2)
    query_features = convert.(Float32, normalized_queries)
    priorities = gather(get_priorities, hcat)
    index_to_item = gather((s, c) -> get_raw_split(s, c).item, vcat)

    return EnsembleFeatures(
        user_features = user_features,
        query_features = query_features,
        preprocessing_data = preprocessing_data,
        priorities = priorities,
        index_to_item = index_to_item,
        user_to_indexes = user_to_indexes,
    )
end

"""
    get_user_embedding(u, f)

Return column `u` of `f.user_features`.
"""
get_user_embedding(u::Integer, f::Features) = f.user_features[:, u]

"""
    get_item_embedding(q, f)

Look up entry `q` of `f.index_to_item`.
"""
get_item_embedding(q::Integer, f::Features) = f.index_to_item[q];

"""
    get_query_embedding(q, f)

Return column `q` of `f.query_features`.
"""
get_query_embedding(q::Integer, f::Features) = f.query_features[:, q];

In [5]:
"""
    get_sample(f, training, list_size)

Draw one ranked list of `list_size` query indexes for a random user and return
its batched inputs `(u_embs, a_embs, q_embs)`.

Users are split by id: the first 90% are sampled when `training` is `true`,
the remainder otherwise. Sampling is by rejection: a draw is discarded when
the user has no entries, has fewer than `list_size` entries, when every sampled
entry has a zero in the first priority row, or when `topological_sort!` reports
failure. The loop only ends once a draw is accepted.
"""
function get_sample(f::Features, training::Bool, list_size::Integer)
    cutoff = floor(Int, num_users() * 0.9)
    user_range = if training
        Int32(1):Int32(cutoff)
    else
        Int32(cutoff + 1):Int32(num_users())
    end

    while true
        # pick a random user and make sure there is enough data for a full list
        u = rand(user_range)
        haskey(f.user_to_indexes, u) || continue
        candidates = f.user_to_indexes[u]
        length(candidates) >= list_size || continue

        # pick random entries for that user
        list = sample(candidates, list_size; replace = false)

        # If all the objects are unseen, topological_sort will fail. It takes
        # O(N^2) time, so this O(N) check lets us fail fast.
        any(f.priorities[1, i] != 0 for i in list) || continue

        # reorder `list` into a random ranking consistent with the priorities
        sorted = topological_sort!(
            list,
            (i, j) -> compare(f.priorities[:, i], f.priorities[:, j]),
        )
        sorted || continue

        # batch the input features: the user column is repeated for every entry
        user_column = get_user_embedding(u, f)
        u_embs = hcat(fill(user_column, list_size)...)
        a_embs = Int32[get_item_embedding(q, f) for q in list]
        q_embs = hcat([get_query_embedding(q, f) for q in list]...)
        return u_embs, a_embs, q_embs
    end
end;

In [6]:
"""
    get_batch(f, training, list_size, batch_size)

Draw `batch_size` samples with `get_sample` and combine them: user blocks and
item lists are concatenated horizontally, and the query blocks are stacked with
`Flux.batch`.
"""
function get_batch(f::Features, training::Bool, list_size::Integer, batch_size::Integer)
    user_blocks = SparseMatrixCSC{Float32,Int32}[]
    item_lists = Vector{Int32}[]
    query_blocks = Matrix{Float32}[]
    for _ in 1:batch_size
        users, items, queries = get_sample(f, training, list_size)
        push!(user_blocks, users)
        push!(item_lists, items)
        push!(query_blocks, queries)
    end
    all_users = hcat(user_blocks...)
    all_items = hcat(item_lists...)
    all_queries = Flux.batch(query_blocks)
    return all_users, all_items, all_queries
end

"""
    get_batch(c::Channel, training, holdout)

Take the next `(u_embs, a_embs, q_embs)` batch from `c`, move each part with
`device`, and reshape the user features to
`(features, columns ÷ batch_size, batch_size)`, where `batch_size` is the
second dimension of `a_embs`.

When `training` is `true`, a random mask with `num_items()` entries is drawn
once per batch, tiled to cover the feature rows and multiplied into the user
features in place. Rows whose random draw is below `holdout` are zeroed for
every column.
"""
function get_batch(c::Channel, training::Bool, holdout::Float32)
    u_embs, a_embs, q_embs = device.(take!(c))
    num_features, num_columns = size(u_embs)
    _, batch_size = size(a_embs)
    if training
        # draw the mask on the GPU when one is available
        randfn = CUDA.functional() ? CUDA.rand : rand
        mask = randfn(num_items()) .>= holdout
        u_embs .*= repeat(mask, num_features ÷ size(mask)[1])
    end
    target_shape = (num_features, num_columns ÷ batch_size, batch_size)
    return reshape(u_embs, target_shape), a_embs, q_embs
end;

In [7]:
"""
    build_model(hyp::Hyperparams)

Construct the ranking network. Three input branches are combined by `Join`
with `vcat`: a dense projection of the user features, an item embedding, and
the query features passed through unchanged. They are followed by a
three-layer head that ends in a single output.

The head's input width, `length(hyp.alphas) + 2K` with
`K = hyp.embedding_size`, matches two `K`-wide branches plus one value per
alpha.
"""
function build_model(hyp::Hyperparams)
    K = hyp.embedding_size
    # layers are created in the same order as they appear in the network
    user_branch = Dense(num_items() * 2 => K)
    item_branch = Embedding(num_items() => K; init = Flux.glorot_uniform)
    inputs = Join(vcat, user_branch, item_branch, identity)
    return Chain(
        inputs,
        Dense(length(hyp.alphas) + K * 2 => K, relu),
        Dense(K => K ÷ 2, relu),
        Dense(K ÷ 2 => 1),
    )
end;

In [8]:
"""
    model(hyp, f, split, content; raw_splits = true)

Score every row of the given `split`/`content` and return the scores as a
`Vector{Float32}` with one entry per row.

The rows are read with `get_raw_split` when `raw_splits` is `true` and with
`get_split` otherwise. For the `"training"` and `"validation"` splits no
inference is run and a vector of zeros is returned. Other splits are scored in
chunks of `hyp.batch_size` rows.
"""
function model(
    hyp::Hyperparams,
    f::Features,
    split::String,
    content::String;
    raw_splits = true,
)
    df = raw_splits ? get_raw_split(split, content) : get_split(split, content)
    if split in ["training", "validation"]
        return zeros(Float32, length(df.item))
    end

    output = Array{Float32}(undef, length(df.item))
    @showprogress for batch in
                      collect(Iterators.partition(1:length(df.item), hyp.batch_size))
        u_embs = SparseVector{Float32,Int32}[]
        a_embs = Int32[]
        # `get_query_embedding` returns a column of a `Matrix{Float32}`, so the
        # container has to hold `Vector{Float32}`; the previous
        # `Matrix{Int32}[]` made `push!` throw
        q_embs = Vector{Float32}[]
        for i in batch
            push!(u_embs, get_user_embedding(df.user[i], f))
            # NOTE(review): `get_item_embedding` and `get_query_embedding` index
            # by query position, but `df.item[i]` looks like an item id -
            # confirm which index is intended here
            push!(a_embs, get_item_embedding(df.item[i], f))
            push!(q_embs, get_query_embedding(df.item[i], f))
        end
        U, A, Q = device(hcat(u_embs...)), device(a_embs), device(hcat(q_embs...))
        # NOTE(review): `m` is not an argument or local of this function; it is
        # resolved from the enclosing (global) scope - confirm it is the
        # trained network
        output[batch] .= cpu(vec(m((U, A, Q))))
    end
    return output
end;

## Train model

In [9]:
# Names of the upstream model outputs ("alphas") that feed this model.
# The entries are newline-separated inside the brackets, which Julia treats as
# vertical concatenation. The `*_raw_alphas` names are defined outside this
# notebook (presumably vectors of names that get spliced in - confirm).
alphas = [
    "LinearExplicit"
    "LinearImplicit"
    "LinearPtw"
    "Explicit"
    "NonlinearImplicit"
    "NonlinearPtw"
    explicit_raw_alphas
    implicit_raw_alphas
    ptw_raw_alphas
    nondirectional_raw_alphas
];
# `holdout`, `l2penalty` and `learning_rate` are given as NaN here -
# presumably placeholders that `create_hyperparams` below fills in; confirm
# against its definition.
hyp = Hyperparams(
    allow_ptw_in_labels = false,
    alphas = alphas,
    batch_size = 1024,
    embedding_size = 256,
    holdout = NaN,
    l2penalty = NaN,
    learning_rate = NaN,
    list_size = 64,
    seed = 20220609,
)
# NOTE(review): the meaning of the three zeros is defined by
# `create_hyperparams`, which is not visible in this notebook.
hyp = create_hyperparams(hyp, [0.0f0, 0.0f0, 0.0f0])

Hyperparams
  allow_ptw_in_labels: Bool false
  alphas: Array{String}((19,))
  batch_size: Int32 1024
  holdout: Float32 0.26894143f0
  l2penalty: Float32 1.0f-5
  learning_rate: Float32 0.0003f0
  list_size: Int32 2
  seed: UInt64 0x0000000001348ac1


In [None]:
# Train with the hyperparameters configured above. The second argument is a
# name handed to `train_alpha` (presumably the name the result is stored
# under - confirm against its definition).
train_alpha(hyp, "MLE.Neural")

[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221126 17:57:27 Training model...
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221126 17:57:27 Initializing model
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221126 17:57:28 Getting data
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 ( 0.19 μs/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (50.98 ns/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221126 18:03:56 getting test explicit alphas
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221126 18:05:15 getting test implicit alphas
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221126 18:05:22 getting test negative alphas
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221126 18:06:20 getting test explicit priorities
[32mProgress: 100%|███████████████████████████| Time: 0:00:06 ( 3.80 μs/it)[39mit)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221126 18:06:33 

In [None]:
# 0.18465150250650614 using new mle loss formulation ( -> 64 -> 32 -> 1)
# 0.1770052581855019 using input normalization
# going wider by 4x didn't help
# going deeper by 2 layers didn't help
# 0.18148092587706433 using 50% dropout made things worse
# 0.05950891730785771 by scaling the loss function down (should be a no-op)
# 0.056652724017389744 with double embedding

In [None]:
# max_checkpoints::Integer = 100
# epochs_per_checkpoint::Integer = 1
# patience::Integer = 0
# verbose::Bool = true

In [None]:
# if verbose
#     @info "Initializing model"
# end
# opt = ADAMW(hyp.learning_rate, (0.9, 0.999), hyp.l2penalty)
# rng = Random.Xoshiro(hyp.seed)
# Random.seed!(rand(rng, UInt64))
# if CUDA.functional()
#     Random.seed!(CUDA.default_rng(), rand(rng, UInt64))
#     Random.seed!(CUDA.CURAND.default_rng(), rand(rng, UInt64))
# end
# m = build_model(hyp) |> device
# best_model = m |> cpu
# ps = Flux.params(m)
# stopper = early_stopper(
#     max_iters = max_checkpoints,
#     patience = patience,
#     min_rel_improvement = 1e-3,
# )
# batchloss(x...) = position_aware_list_mle_loss(m, x)
# epoch_size = Int(round(num_users() / hyp.batch_size))
# function loginfo(x)
#     if verbose
#         @info x
#     end
# end

# loginfo("Getting data")
# f = get_features(hyp.alphas, false)

In [None]:
# setup_channel(training) = setup_batch_channel(f, training, hyp, 64)
# training_batches = setup_channel(true)
# test_batches = setup_channel(false)
# @info "Testing channels"
# @info size.(get_batch(training_batches, true, hyp.holdout))
# @info size.(get_batch(test_batches, false, hyp.holdout))

In [None]:
# @showprogress for _ = 1:epoch_size
#     batch = get_batch(training_batches, true, hyp.holdout)
# end