# Ranking
* This is trained to learn the partial ordering implied by each user's watches
* Items that are watched are preferred to items that have not been watched
* If two items have been watched, then the impression metadata determines
  which one, if any, is liked more
* It uses the position-aware maximum likelihood estimation loss
* The inputs to this model are features generated by other models

In [1]:
import NBInclude: @nbinclude
@nbinclude("MLE.Base.ipynb");

## Define model

In [2]:
# Bundle of every input the ranking model reads for training and inference.
@with_kw struct EnsembleFeatures <: Features
    # per-user features; column `u` is read for user `u`
    user_features::AbstractMatrix
    # normalized alpha features; column `q` is read for query index `q`
    query_features::AbstractMatrix
    # column `i` is handed to `compare` to order query `i` against others;
    # a 0 in row 1 is treated as "unseen" by the sampler
    priorities::AbstractMatrix
    # user => collection of query indexes belonging to that user
    user_to_indexes::Dict
    # item looked up for each query index
    index_to_item::Vector
    # statistics returned by `normalize` for the query features ("μ" and "σ")
    preprocessing_data::Dict
end;

In [3]:
# Partition the users found in `f.user_to_indexes` into a training and a test
# set. A user lands in the training set when its id is strictly smaller than
# `num_users() * p`; every other user goes to the test set.
# Returns the tuple `(training, test)` of `Vector{Int32}`s.
function training_test_split(f::Features; p::Real = 0.9)
    train_ids, test_ids = Int32[], Int32[]
    threshold = num_users() * p
    @showprogress for user in keys(f.user_to_indexes)
        bucket = user < threshold ? train_ids : test_ids
        push!(bucket, user)
    end
    return train_ids, test_ids
end

# Return the preprocessing statistics stored alongside the features.
get_inference_data(f::Features) = f.preprocessing_data;

In [4]:
# Read the ratings of every alpha for one (split, content) pair and stack them
# into a Float16 matrix with one row per alpha and one column per row of the
# raw split.
function get_query_features(alphas::Vector{String}, split::String, content::String)
    @info "getting $split $content alphas"
    num_rows = length(get_raw_split(split, content).user)
    by_alpha = Matrix{Float16}(undef, num_rows, length(alphas))
    # each thread fills a distinct column, so the writes do not overlap
    @tprogress Threads.@threads for k in eachindex(alphas)
        ratings = read_raw_alpha(alphas[k], split, content).rating
        by_alpha[:, k] = convert.(Float16, ratings)
    end
    # materialize the transpose so that the alphas run along the rows
    permutedims(by_alpha)
end;

# Standardize `x` along `dims` to zero mean and unit (population) variance.
#
# The statistics are computed in Float32 to limit rounding error for narrow
# element types such as Float16; the result is converted back to `eltype(x)`.
# Slices with zero variance would otherwise divide by zero and fill the output
# with NaN/Inf, so their σ is replaced by one (they normalize to all zeros).
#
# Returns `(normalized, stats)` where `stats` is a Dict holding the mean under
# "μ" and the standard deviation actually used for scaling under "σ".
function normalize(x::AbstractArray; dims = 1)
    T = eltype(x)
    x32 = convert.(Float32, x)
    μ = mean(x32; dims = dims)
    σ = std(x32; dims = dims, mean = μ, corrected = false)
    # guard against constant slices: dividing by σ == 0 would produce NaN
    σ = map(s -> iszero(s) ? one(s) : s, σ)
    return convert.(T, (x32 .- μ) ./ σ), Dict("μ" => μ, "σ" => σ)
end

# Build the per-user feature matrix by stacking the implicit training ratings
# on top of the explicit ones. Each part is a sparse `num_items() × num_users()`
# Float16 matrix, so the result has `2 * num_items()` rows.
function get_user_features()
    function ratings_matrix(content)
        df = get_split("training", content)
        values = convert.(Float16, df.rating)
        return sparse(df.item, df.user, values, num_items(), num_users())
    end
    implicit = ratings_matrix("implicit")
    explicit = ratings_matrix("explicit")
    return vcat(implicit, explicit)
end

# Assemble the `EnsembleFeatures` for the test split of every content type.
# When `allow_ptw` is false the "ptw" content is left out entirely.
function get_features(alphas::Vector{String}, allow_ptw::Bool)
    contents = allow_ptw ? all_contents : filter(x -> x != "ptw", all_contents)
    # apply `loader` to the test split of each content and merge the pieces
    gather(loader; agg = hcat) = reduce(agg, loader("test", c) for c in contents)

    user_features = get_user_features()
    raw_queries = gather((split, content) -> get_query_features(alphas, split, content))
    # normalize each feature row; keep the statistics for inference
    query_features, preprocessing_data = normalize(raw_queries; dims = 2)
    priorities = gather(get_priorities)
    user_to_indexes = get_user_to_indexes([("test", c) for c in contents])
    index_to_item = gather(
        (split, content) -> get_raw_split(split, content).item;
        agg = vcat,
    )
    return EnsembleFeatures(;
        user_features,
        query_features,
        priorities,
        user_to_indexes,
        index_to_item,
        preprocessing_data,
    )
end

# Collect the three model inputs for user `u` and query index `q`:
# the user's feature column, the query's item (wrapped in a one-element
# vector) and the query's feature column.
function get_embedding(u::Integer, q::Integer, f::Features)
    user_column = f.user_features[:, u]
    item = [f.index_to_item[q]]
    query_column = f.query_features[:, q]
    return (user_column, item, query_column)
end;

In [5]:
# Draw one training example: a list of `list_size` query indexes belonging to
# a randomly chosen user, ordered by `topological_sort`, together with the
# model inputs for every entry of the list.
#
# Users are redrawn (rejection sampling) until a usable list is found, so this
# loops forever if no user in `users` can produce one.
#
# Returns a vector with one entry per model input: integer inputs are
# concatenated into an `Int32` vector (for embedding layers), all other inputs
# are stacked column-wise into a `Float32` matrix (for dense layers).
function get_sample(f::Features, users::Vector, list_size::Integer)
    comparator = (i, j) -> compare(f.priorities[:, i], f.priorities[:, j])
    # Sampling without replacement needs at least `list_size` candidates;
    # the lower bound of 2 keeps the original "more than one item" requirement.
    min_items = max(list_size, 2)
    while true
        u = rand(users)
        idxs = f.user_to_indexes[u]
        if length(idxs) < min_items
            continue
        end
        list = sample(idxs, list_size; replace = false)
        if all(f.priorities[1, i] == 0 for i in list)
            # if all the objects are unseen, then topological_sort will fail.
            # topological_sort takes O(N^2) time, so we use this check to
            # fail fast in O(N) time
            continue
        end
        valid = topological_sort(list, comparator)
        if !valid
            # fail if all objects are non-comparable
            continue
        end

        # gather each component of the per-query inputs into its own bucket
        embs = []
        for q in list
            emb = get_embedding(u, q, f)
            if isempty(embs)
                for _ in emb
                    push!(embs, [])
                end
            end
            push!.(embs, emb)
        end
        for i in eachindex(embs)
            if eltype(embs[i][1]) <: Integer
                # for inputs that will be passed into an embedding layer
                embs[i] = convert.(Int32, reduce(vcat, embs[i]))
            else
                # for inputs that will be passed into a dense layer
                embs[i] = convert.(Float32, reduce(hcat, embs[i]))
            end
        end
        return embs
    end
end;

In [6]:
# Construct the ranking network. The three inputs are mapped to a common
# representation and concatenated: the user features go through a bias-free
# dense layer, the item id through an embedding layer, and the alpha features
# are passed through unchanged. An MLP then reduces them to a single score.
function build_model(hyp::Hyperparams)
    width = 32
    inputs = Join(
        vcat,
        Dense(num_items() * 2 => width, bias = false),
        Embedding(num_items() => width),
        identity,
    )
    return Chain(
        inputs,
        Dense(length(hyp.alphas) + width * 2 => 64, relu),
        Dense(64 => width, relu),
        Dense(width => 1),
    )
end;

## Train model

In [7]:
# Features fed to the ranker: the outputs of the named models followed by
# every raw alpha list.
alphas = vcat(
    "LinearExplicit",
    "LinearImplicit",
    "LinearPtw",
    "Explicit",
    "NonlinearImplicit",
    "NonlinearPtw",
    explicit_raw_alphas,
    implicit_raw_alphas,
    ptw_raw_alphas,
    nondirectional_raw_alphas,
);
# l2penalty and learning_rate start as NaN placeholders and are filled in by
# create_hyperparams
hyp = create_hyperparams(
    Hyperparams(
        allow_ptw = false,
        alphas = alphas,
        batch_size = 1024,
        l2penalty = NaN,
        learning_rate = NaN,
        list_size = 2,
        seed = 20220609,
    ),
    [0.0f0, 0.0f0],
)

Hyperparams
  allow_ptw: Bool false
  alphas: Array{String}((19,))
  batch_size: Int32 1024
  l2penalty: Float32 1.0f-5
  learning_rate: Float32 0.0003f0
  list_size: Int32 2
  seed: UInt64 0x0000000001348ac1


In [8]:
# train_alpha(hyp, "MLE.Neural")

In [9]:
# 0.18465150250650614 using new mle loss formulation ( -> 64 -> 32 -> 1)
# 0.1770052581855019 using input normalization
# going wider by 4x didn't help
# going deeper by 2 layers didn't help
# 0.18148092587706433 using 50% dropout made things worse
# 0.05950891730785771 by scaling the loss function down (should be a no-op)
# 0.056652724017389744 with double embedding

In [10]:
# Settings for the training loop.
# upper bound on the number of checkpoints; given to the early stopper as `max_iters`
max_checkpoints::Integer = 100
# NOTE(review): not referenced in the visible cells — presumably the number of
# epochs trained between two checkpoints; confirm against the training loop
epochs_per_checkpoint::Integer = 1
# given to the early stopper as `patience`
patience::Integer = 0
# when true, progress messages are logged with `@info`
verbose::Bool = true

true

In [11]:
# Log `x` at info level, but only when the `verbose` flag is set.
function loginfo(x)
    verbose || return
    @info x
end

loginfo("Initializing model")
opt = ADAMW(hyp.learning_rate, (0.9, 0.999), hyp.l2penalty)
# seed the RNG before the model is constructed
Random.seed!(hyp.seed)
m = device(build_model(hyp))
# host-side copy of the freshly initialized model
best_model = cpu(m)
ps = Flux.params(m)
stopper = early_stopper(
    max_iters = max_checkpoints,
    patience = patience,
    min_rel_improvement = 1e-3,
)
batchloss(x...) = position_aware_list_mle_loss(m, x)
# number of batches needed to draw roughly one sample per user
epoch_size = round(Int, num_users() / hyp.batch_size)

loginfo("Getting data")
f = get_features(hyp.alphas, false)
training_users, test_users = training_test_split(f)
setup_channel(users) = setup_batch_channel(f, users, hyp, 64)

[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221124 02:30:02 Initializing model
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221124 02:30:04 Getting data
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221124 02:30:51 getting test explicit alphas
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221124 02:30:55 getting test implicit alphas
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221124 02:30:57 getting test negative alphas
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221124 02:32:04 getting test explicit priorities
[32mProgress: 100%|███████████████████████████| Time: 0:00:05 ( 3.19 μs/it)[39mit)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221124 02:32:11 getting test implicit priorities
[32mProgress: 100%|███████████████████████████| Time: 0:00:01 ( 1.49 μs/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221124 02:32:14 getting test negative priorities
[32mProgress: 100%|██████████████

setup_channel (generic function with 1 method)

In [12]:
# training_batches = setup_channel(training_users)
# test_batches = setup_channel(test_users)
# @info "Testing channels"
# @info size.(get_batch(training_batches))
# @info size.(get_batch(test_batches))

In [13]:
# Time 100 consecutive batch constructions on the training users.
# The first iteration is dominated by compilation (see the first line of the
# output below), so only the later timings reflect steady-state cost.
for _ = 1:100
    @time batch = get_batch(f, training_users, hyp.list_size, hyp.batch_size)
end

  1.210589 seconds (6.11 M allocations: 344.920 MiB, 14.69% gc time, 98.52% compilation time)
  0.032599 seconds (204.72 k allocations: 19.338 MiB, 54.93% gc time)
  0.014125 seconds (203.99 k allocations: 19.056 MiB)
  0.031455 seconds (204.10 k allocations: 19.913 MiB, 53.88% gc time)
  0.013228 seconds (203.74 k allocations: 18.514 MiB)
  0.013833 seconds (205.72 k allocations: 19.149 MiB)
  0.028171 seconds (203.17 k allocations: 18.490 MiB, 51.90% gc time)
  0.013380 seconds (202.92 k allocations: 19.042 MiB)
  0.024673 seconds (203.91 k allocations: 19.305 MiB, 46.40% gc time)
  0.013333 seconds (204.86 k allocations: 18.546 MiB)
  0.024385 seconds (204.02 k allocations: 20.226 MiB, 45.86% gc time)
  0.013281 seconds (205.76 k allocations: 18.801 MiB)
  0.013086 seconds (203.28 k allocations: 18.514 MiB)
  0.024162 seconds (204.13 k allocations: 18.932 MiB, 46.32% gc time)
  0.013190 seconds (203.68 k allocations: 18.777 MiB)
  0.023981 seconds (203.47 k allocations: 19.371 MiB, 

### 