# MLE Training

In [1]:
import NBInclude: @nbinclude
@nbinclude("MLE.Base.ipynb");

## Define Subclass

In [2]:
# Feature bundle used while training; `get_features` fills in every field.
@with_kw struct TrainingFeatures <: Features
    # Sparse per-user feature columns. `get_user_features` stacks the implicit rating
    # matrix on top of the explicit one, and column `u` is read for user `u`.
    user_features::SparseMatrixCSC{Float32,Int32}
    # One column of priority values per interaction index (read as `priorities[:, idx]`).
    priorities::Matrix{Float16}

    # Item id of each interaction index.
    index_to_item::Vector{Int32}
    # `true` where the interaction index comes from the "training" split.
    index_to_training::Vector{Bool}
    # (item, user) -> interaction index; `get_priority` treats 0 as "no interaction".
    item_user_index::SparseMatrixCSC{Int32,Int32}

    # user -> interaction indexes, as returned by `get_user_to_indexes` when given the
    # training-split and validation-split predicates respectively.
    user_to_training_indexes::Dict{Int32,Vector{Int32}}
    user_to_validation_indexes::Dict{Int32,Vector{Int32}}
    # user -> set of item ids reached through that user's training indexes.
    user_to_training_items::Dict{Int32,Set{Int32}}
end;

In [3]:
# Return an empty `Dict`; this notebook attaches no extra data for inference.
get_inference_data(f::Features) = Dict();

In [4]:
# Build the sparse item-by-user rating matrix for one content type of the training
# split. The matrix has `num_items() + 1` rows and `num_users()` columns.
function _training_rating_matrix(content::String)
    df = get_split("training", content)
    return sparse(df.item, df.user, df.rating, num_items() + 1, num_users())
end

# Sparse item-by-user matrix of the implicit training ratings.
get_implicit_features() = _training_rating_matrix("implicit")

# Sparse item-by-user matrix of the explicit training ratings.
get_explicit_features() = _training_rating_matrix("explicit")

# Stack the implicit block on top of the explicit block, giving a matrix with
# `2 * (num_items() + 1)` rows and one column per user.
function get_user_features()
    return vcat(get_implicit_features(), get_explicit_features())
end;

In [5]:
# Build the `TrainingFeatures` bundle consumed by the sampling functions.
#
# Arguments
# - `alphas`: must be empty (asserted).
# - `allow_ptw_in_labels`: when `false`, the "ptw" content is dropped from the labels.
#
# Within this function every (split, content) pair is visited in the same order,
# training split first, so the concatenated `priorities`, `index_to_item` and
# `index_to_training` arrays and the values stored in `item_user_index` all share one
# running interaction index.
# NOTE(review): `get_user_to_indexes` is assumed to number interactions in that same
# order -- confirm against its definition.
function get_features(alphas::Vector{String}, allow_ptw_in_labels::Bool)
    @assert length(alphas) == 0
    # Assigned exactly once so the closures below capture a plain binding.
    contents = filter(
        x -> x != "negative" && (allow_ptw_in_labels || x != "ptw"),
        all_contents,
    )
    splits = ["training", "validation"]
    split_contents = [(split, content) for split in splits for content in contents]

    user_to_training_indexes =
        get_user_to_indexes(split_contents, (split, content) -> split == "training")
    user_to_validation_indexes =
        get_user_to_indexes(split_contents, (split, content) -> split == "validation")

    # Apply `f` to every (split, content) pair and fold the results with `agg`.
    hreduce(f; agg = hcat) =
        reduce(agg, f(split, content) for (split, content) in split_contents)
    user_features = get_user_features()
    priorities = hreduce(get_priorities)
    index_to_item =
        hreduce((split, content) -> get_raw_split(split, content).item; agg = vcat)
    index_to_training = hreduce(
        (split, content) ->
            fill(split == "training", length(get_raw_split(split, content).item));
        agg = vcat,
    )

    # Map each (item, user) pair to its running interaction index.
    # NOTE(review): if the same (item, user) pair appears in more than one
    # split/content, the additions below sum the indexes into a meaningless value;
    # presumably the pairs are unique -- confirm.
    item_user_index = sparse(Int32[], Int32[], Int32[], num_items() + 1, num_users())
    idx = 1
    for split in splits
        @showprogress for content in contents
            df = get_raw_split(split, content)
            n = length(df.item)
            item_user_index += sparse(
                df.item,
                df.user,
                collect(idx:(idx+n-1)),
                num_items() + 1,
                num_users(),
            )
            idx += n
        end
    end

    # Every user gets an entry; users without training interactions get an empty set.
    user_to_training_items = Dict{Int32,Set{Int32}}()
    for u = 1:num_users()
        items = Set{Int32}()
        if haskey(user_to_training_indexes, u)
            for i in user_to_training_indexes[u]
                push!(items, index_to_item[i])
            end
        end
        user_to_training_items[u] = items
    end

    return TrainingFeatures(
        user_features = user_features,
        priorities = priorities,
        index_to_item = index_to_item,
        index_to_training = index_to_training,
        item_user_index = item_user_index,
        user_to_training_indexes = user_to_training_indexes,
        user_to_validation_indexes = user_to_validation_indexes,
        user_to_training_items = user_to_training_items,
    )
end

# Return a copy of user `u`'s feature column with the entries of every item in `list`
# set to zero. The column is walked in strides of `num_items() + 1`, so item `a` is
# cleared at the same offset inside each stacked block of the column.
function get_user_embedding(u::Integer, list::Vector{Int32}, f::Features)
    emb = f.user_features[:, u]
    for item in list, pos = item:num_items()+1:length(emb)
        emb[pos] = 0
    end
    return emb
end

# The item input is the item id itself, returned unchanged; `f` is unused.
get_item_embedding(q::Integer, f::Features) = q;

In [6]:
# Look up the priority column for item `i` and user `u`.
#
# Returns the fallback `Float16[0, NaN, NaN, NaN]` when the pair has no interaction
# index, or when `training` is set and the interaction is not from the training split.
function get_priority(f::Features, u::Integer, i::Integer, training::Bool)
    idx = f.item_user_index[i, u]
    # `&&` short-circuits, so `index_to_training` is never indexed with 0.
    usable = idx != 0 && (!training || f.index_to_training[idx])
    return usable ? f.priorities[:, idx] : Float16[0, NaN, NaN, NaN]
end

# Compare items `i` and `j` for user `u` by passing their priorities to `compare`.
function comparator(f::Features, u::Integer, i::Integer, j::Integer, training::Bool)
    return compare(get_priority(f, u, i, training), get_priority(f, u, j, training))
end

# Draw one training or validation example: a random user, a list of `list_size` item
# ids containing at least one item the user interacted with, and the preferences over
# that list.
#
# Returns `(u_embs, a_embs, prefs)`: the user's embedding repeated once per list entry,
# the item ids, and the output of `get_preferences`.
function get_sample(f::Features, training::Bool, list_size::Integer)
    user_to_idxs = training ? f.user_to_training_indexes : f.user_to_validation_indexes
    while true
        # Redraw until the user has at least one interaction in this split.
        u = rand(1:num_users())
        haskey(user_to_idxs, u) || continue

        # One item the user has interacted with.
        seen_item = f.index_to_item[rand(user_to_idxs[u])]
        # Fill the list with distinct random items, forcing `seen_item` in if needed.
        list = sample(Int32(1):Int32(num_items() + 1), list_size; replace = false)
        if !(seen_item in list)
            list[1] = seen_item
        end

        if !training
            # Replace the user's training items with the extra id `num_items() + 1`.
            train_items = f.user_to_training_items[u]
            for k in eachindex(list)
                if list[k] in train_items
                    list[k] = num_items() + 1
                end
            end
        end

        # A random ranking that is consistent with the user's preferences.
        prefs = get_preferences(list, (i, j) -> comparator(f, u, i, j, training))

        # Batch the inputs: one copy of the user column per list entry.
        user_col = get_user_embedding(u, list, f)
        u_embs = hcat(fill(user_col, list_size)...)
        a_embs = Int32[get_item_embedding(q, f) for q in list]
        return u_embs, a_embs, prefs
    end
end;

In [7]:
# Assemble `batch_size` samples into one model input.
#
# Returns `((U, A), P)`: `U` is the user features reshaped to
# (features, list entries, lists), `A` holds the item ids with one column per list, and
# `P` is the batched preferences. When `training` is set, a random mask over
# `num_items() + 1` positions, keeping those whose draw is `>= holdout`, is tiled down
# the feature rows of `U`.
function get_batch(
    f::Features,
    training::Bool,
    list_size::Integer,
    batch_size::Integer,
    holdout::Float32,
)
    user_blocks = SparseMatrixCSC{Float32,Int32}[]
    item_lists = Vector{Int32}[]
    pref_blocks = Matrix{Int32}[]
    for _ = 1:batch_size
        u_emb, a_emb, pref = get_sample(f, training, list_size)
        push!(user_blocks, u_emb)
        push!(item_lists, a_emb)
        push!(pref_blocks, pref)
    end

    # move to GPU
    U = device(hcat(user_blocks...))
    A = device(hcat(item_lists...))
    P = device(Flux.batch(pref_blocks))
    n_features, n_columns = size(U)
    n_lists = size(A, 2)
    if training
        randfn = CUDA.functional() ? CUDA.rand : rand
        keep = randfn(num_items() + 1) .>= holdout
        U .*= repeat(keep, n_features ÷ length(keep))
    end
    return (reshape(U, (n_features, n_columns ÷ n_lists, n_lists)), A), P
end;

In [8]:
# Build the scoring network. A dense projection of the user features and an item
# embedding, both of width `K`, are combined by `Join(vcat, ...)` and passed through
# three dense layers ending in a single output.
function build_model(hyp::Hyperparams)
    K = hyp.embedding_size
    n_ids = num_items() + 1
    user_branch = Dense(n_ids * 2 => K)
    item_branch = Embedding(n_ids => K; init = Flux.glorot_uniform)
    return Chain(
        Join(vcat, user_branch, item_branch),
        Dense(K * 2 => K, relu),
        Dense(K => K ÷ 2, relu),
        Dense(K ÷ 2 => 1),
    )
end;

In [9]:
# Score every (user, item) row of one split/content with the model.
#
# Returns a `Float32` vector with one entry per row. The "training" split is not scored
# and yields all zeros. `raw_splits` selects `get_raw_split` over `get_split`.
function inference_model(
    hyp::Hyperparams,
    f::Features,
    split::String,
    content::String;
    raw_splits = true,
)
    df = raw_splits ? get_raw_split(split, content) : get_split(split, content)
    n_rows = length(df.item)
    split == "training" && return zeros(Float32, n_rows)

    scores = Vector{Float32}(undef, n_rows)
    batches = collect(Iterators.partition(1:n_rows, hyp.batch_size))
    @showprogress for rows in batches
        user_cols = SparseVector{Float32,Int32}[]
        item_ids = Int32[]
        for r in rows
            # An empty item list means nothing is zeroed out of the user column.
            push!(user_cols, get_user_embedding(df.user[r], Int32[], f))
            push!(item_ids, get_item_embedding(df.item[r], f))
        end
        U, A = device(hcat(user_cols...)), device(vcat(item_ids...))
        # NOTE(review): `m` is not an argument of this function; it resolves to a
        # global model binding that must exist when this runs -- confirm where it is set.
        scores[rows] .= cpu(vec(m((U, A))))
    end
    return scores
end;

## Train model

In [10]:
# Base hyperparameters for this run. The `NaN` fields are placeholders: the printed
# result below shows concrete values for them after `create_hyperparams` is applied.
hyp = create_hyperparams(
    Hyperparams(
        allow_ptw_in_labels = true,
        alphas = String[],
        embedding_size = 256,
        batch_size = 1024,
        holdout = NaN,
        l2penalty = NaN,
        learning_rate = NaN,
        list_size = 16,
        seed = 20220609,
    ),
    Float32[0, 0, 0],
)

Hyperparams
  allow_ptw_in_labels: Bool true
  alphas: Array{String}((0,))
  batch_size: Int32 1024
  embedding_size: Int32 256
  holdout: Float32 0.26894143f0
  l2penalty: Float32 1.0f-5
  learning_rate: Float32 0.0003f0
  list_size: Int32 16
  seed: UInt64 0x0000000001348ac1


In [None]:
# Train the model with the hyperparameters defined above.
# NOTE(review): `train_alpha` is presumably defined in the included MLE.Base notebook,
# and "MLE.Training" is presumably the name the run is saved under -- confirm both.
train_alpha(hyp, "MLE.Training")

[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221129 12:57:36 Training model...
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221129 12:57:36 Initializing model
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221129 12:57:37 Getting data
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (64.26 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:00 (51.61 ns/it)[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:05 ( 0.92 μs/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221129 13:01:34 getting training explicit priorities
[32mProgress: 100%|███████████████████████████| Time: 0:00:08 ( 0.69 μs/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221129 13:01:55 getting training implicit priorities
[32mProgress: 100%|███████████████████████████| Time: 0:00:08 ( 1.45 μs/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20221129 13:02:11 getting training ptw priorities
[

In [None]:
# verbose = true
# max_checkpoints = 25
# epochs_per_checkpoint = 1
# patience = 2;

In [None]:
# if verbose
#     @info "Initializing model"
# end
# opt = ADAMW(hyp.learning_rate, (0.9, 0.999), hyp.l2penalty)
# rng = Random.Xoshiro(hyp.seed)
# Random.seed!(rand(rng, UInt64))
# if CUDA.functional()
#     Random.seed!(CUDA.default_rng(), rand(rng, UInt64))
#     Random.seed!(CUDA.CURAND.default_rng(), rand(rng, UInt64))
# end
# m = build_model(hyp) |> device
# best_model = m |> cpu
# ps = Flux.params(m)
# stopper = early_stopper(
#     max_iters = max_checkpoints,
#     patience = patience,
#     min_rel_improvement = 1e-3,
# )
# epoch_size = Int(round(num_users() / hyp.batch_size))
# function loginfo(x)
#     if verbose
#         @info x
#     end
# end

In [None]:
# loginfo("Getting data")
# F = get_features(hyp.alphas, false)

In [None]:
# GC.gc(true)
# CUDA.reclaim()
# train_epoch!(m, opt, F, hyp, epoch_size)

In [None]:
# average_loss(m, F, hyp, epoch_size)