# Ranking
* Learns the preference relation implied by future watches
* Uses a modified form of the position-aware ListMLE loss

In [None]:
# Medium to build ranking data for; it is interpolated into the alpha names and the
# output directory further down.
# NOTE(review): empty here — presumably filled in by whatever runs the notebook; confirm.
medium = ""

In [None]:
import NBInclude: @nbinclude
@nbinclude("../Alpha.ipynb");

In [None]:
import MLUtils
import Random
import StatsBase

In [None]:
import H5Zblosc
import HDF5

# Data

In [None]:
"""
    get_features(alphas::Vector{String}, split::String, medium::String)

Load every alpha named in `alphas` for the given `split` and `medium` and return them as
a `Float16` matrix with one row per alpha and one column per entry of the split.

Each alpha is normalized before it is stored: the rating alpha is divided by 10 and
clamped to `[0, 1]`, the watch and plan-to-watch alphas are used unchanged, and the drop
alpha is replaced by `1 - x`.

Fails with an `AssertionError` when `length(alphas) != get_feature_size()`, when an
alpha name is not one of the four handled here, or when a normalized alpha has values
outside `[0, 1]`.
"""
function get_features(alphas::Vector{String}, split::String, medium::String)
    # Check the cheap invariant first so a bad call fails before any data is loaded.
    @assert length(alphas) == get_feature_size()
    @info "getting $split $medium $alphas"
    N = length(get_raw_split(split, medium, [:userid], nothing).userid)
    T = Float16
    # Allocate directly in the (alpha, item) layout that is returned; this avoids
    # holding a second, transposed copy of the whole matrix.
    A = Matrix{T}(undef, length(alphas), N)
    @showprogress for i in eachindex(alphas)
        name = alphas[i]
        x = get_raw_split(split, medium, Symbol[], name).alpha
        # normalize and make monotonic
        if name == "$medium/Linear/rating"
            x = clamp.(x / 10, 0, 1)
        elseif name in ("$medium/Linear/watch", "$medium/Linear/plantowatch")
            # used as-is; the range assertion below still applies
            nothing
        elseif name == "$medium/Linear/drop"
            x = 1 .- x
        else
            @assert false "unrecognised alpha: $name"
        end
        # A single pass yields both bounds; a NaN makes both comparisons false.
        lo, hi = extrema(x)
        @assert lo >= 0 && hi <= 1 "alpha $name has values outside [0, 1]: ($lo, $hi)"
        A[i, :] = convert.(T, x)
    end
    A
end

"""
    get_features(alphas::Vector{String}, medium::String)

Return the feature matrices of the "test" and "negative" splits joined column-wise,
with the "test" columns first.
"""
function get_features(alphas::Vector{String}, medium::String)
    test = get_features(alphas, "test", medium)
    negative = get_features(alphas, "negative", medium)
    hcat(test, negative)
end

# Number of alphas (feature rows) stored per item; `get_features` asserts that it is
# given exactly this many alpha names.
get_feature_size() = 4;

In [None]:
# Number of rows in a priority column (see `get_priorities`).
get_priority_size() = 3

"""
    get_priorities(split::String, medium::String)

Build the `Float16` priority matrix for one split: `get_priority_size()` rows and one
column per entry of the split.

Columns of the "test" split are `[1, rating, status]`, where a rating of 0 is stored as
`NaN`. Columns of the "negative" split are `[0, NaN, NaN]`. Any other split name fails
an assertion.
"""
function get_priorities(split::String, medium::String)
    @info "getting $split $medium priorities"
    fields = if split == "test"
        [:userid, :itemid, :rating, :status]
    elseif split == "negative"
        [:userid, :itemid]
    else
        @assert false
    end
    df = get_raw_split(split, medium, fields, nothing)
    n = length(df.userid)
    is_test = split == "test"
    out = Matrix{Float16}(undef, get_priority_size(), n)
    @showprogress for col = 1:n
        if is_test
            rating = df.rating[col]
            out[:, col] = Float16[1, rating != 0 ? rating : NaN, df.status[col]]
        else
            # only "negative" reaches this branch; other names failed the check above
            out[:, col] = Float16[0, NaN, NaN]
        end
    end
    out
end

"""
    get_priorities(medium::String)

Return the priority matrices of the "test" and "negative" splits joined column-wise,
with the "test" columns first.
"""
function get_priorities(medium::String)
    test = get_priorities("test", medium)
    negative = get_priorities("negative", medium)
    hcat(test, negative)
end;

In [None]:
"""
    get_user_to_indexes(medium::String, splits::Vector{String})

Map each user id to the positions of that user's entries, counting entries from 1
across the given `splits` in the order they are listed.
"""
function get_user_to_indexes(medium::String, splits::Vector{String})
    positions = Dict{Int32,Vector{Int32}}()
    # running position; it keeps counting from one split into the next
    column::Int32 = 1
    for name in splits
        userids = get_raw_split(name, medium, [:userid], nothing).userid
        @showprogress for user in userids
            push!(get!(() -> Int32[], positions, user), column)
            column += 1
        end
    end
    positions
end;

In [None]:
# Bundle of everything the batching code reads. `load_features` builds both matrices
# with the "test" split's columns first, followed by the "negative" split's columns.
@kwdef struct Features
    # one column per item; `get_feature_size()` rows of normalized alphas
    features::Matrix{Float32}
    # one column per item; rows are [1 for "test" / 0 for "negative", rating or NaN,
    # status or NaN]
    priorities::Matrix{Float16}
    # user id => column indexes of all of that user's items ("test" and "negative")
    user_to_indexes::Dict{Int32,Vector{Int32}}
    # user id => column indexes of that user's "test"-split items only
    user_to_watched_indexes::Dict{Int32,Vector{Int32}}
    # users used for the training epoch (those not sampled into `test_users`)
    training_users::Vector{Int32}
    # randomly sampled held-out users; unrelated to the "test" data split
    test_users::Vector{Int32}
end

"""
    load_features()

Load features, priorities and the per-user index maps for the global `medium`, then
split the users at random: about 10% go to `test_users` and the rest to
`training_users`.
"""
function load_features()
    alphas = ["$medium/Linear/$metric" for metric in ALL_METRICS]
    feature_matrix = get_features(alphas, medium)
    priority_matrix = get_priorities(medium)
    all_indexes = get_user_to_indexes(medium, ["test", "negative"])
    watched_indexes = get_user_to_indexes(medium, ["test"])

    users = collect(keys(all_indexes))
    holdout_count = round(Int, length(users) * 0.1)
    holdout = Set(StatsBase.sample(users, holdout_count; replace = false))
    kept = Set(u for u in users if u ∉ holdout)
    Features(
        features = feature_matrix,
        priorities = priority_matrix,
        user_to_indexes = all_indexes,
        user_to_watched_indexes = watched_indexes,
        training_users = collect(kept),
        test_users = collect(holdout),
    )
end;

# Batching

In [None]:
"""
    subsample(f::Features, u::Int32, list_size::Int32)

Draw up to `list_size` of user `u`'s item indexes without replacement and return a
vector of exactly `list_size` entries, padded with `-1` when the user has fewer items.

If none of the drawn items has a non-zero first priority row, the first entry is
replaced by a random index from the user's watched items.
"""
function subsample(f::Features, u::Int32, list_size::Int32)
    candidates = f.user_to_indexes[u]
    watched = f.user_to_watched_indexes[u]
    picked = StatsBase.sample(
        candidates,
        min(length(candidates), list_size);
        replace = false,
    )
    # ensure at least one item is watched
    if !any(f.priorities[1, i] != 0 for i in picked)
        picked[1] = rand(watched)
    end
    # pad to list_size with the -1 sentinel
    append!(picked, fill(Int32(-1), list_size - length(picked)))
    picked
end

"""
    get_feature(f::Features, i::Int32)

Return a copy of feature column `i`, or an all-zero `Float32` column of the same
height when `i` is the padding sentinel `-1`.
"""
function get_feature(f::Features, i::Int32)
    i == -1 ? zeros(Float32, size(f.features, 1)) : f.features[:, i]
end

"""
    get_priority(f::Features, i::Int32)

Return a copy of priority column `i`, or `Float16[0, NaN, NaN]` when `i` is the padding
sentinel `-1`.
"""
function get_priority(f::Features, i::Int32)
    i == -1 ? Float16[0, NaN, NaN] : f.priorities[:, i]
end

"""
    get_sample(f::Features, user::Int32, list_size::Int32)

Build one ranking list for `user`: subsample `list_size` item indexes and return
`(features, prios)`, two matrices with one column per list entry in the same order.
"""
function get_sample(f::Features, user::Int32, list_size::Int32)
    list = subsample(f, user, list_size)
    # Concatenate with `reduce(hcat, ...)` rather than splatting: `hcat(cols...)` would
    # turn every one of the `list_size` columns into a separate call argument.
    features = reduce(hcat, [get_feature(f, q) for q in list])
    prios = MLUtils.batch(get_priority(f, i) for i in list)
    features, prios
end

"""
    get_epoch(f::Features, training::Bool, list_size::Int32)

Build one list per user, in shuffled order, for either the training users
(`training == true`) or the held-out test users, and return `(Q, P)`: the per-user
feature matrices and priority matrices batched with `MLUtils.batch`.
"""
function get_epoch(f::Features, training::Bool, list_size::Int32)
    users = Random.shuffle(training ? f.training_users : f.test_users)
    n = length(users)
    feats = Vector{Matrix{Float32}}(undef, n)
    prios = Vector{Matrix{Float16}}(undef, n)
    @showprogress for k in eachindex(users)
        feats[k], prios[k] = get_sample(f, users[k], list_size)
    end
    MLUtils.batch(feats), MLUtils.batch(prios)
end;

# Run

In [None]:
"""
    save_epoch(epoch, fn)

Write `epoch` to the HDF5 file `fn`, replacing any existing file: `epoch[1]` goes to
dataset "features" and `epoch[2]`, converted to `Float32`, to dataset "prios". Both are
written with the `blosc = 1` option.
"""
function save_epoch(epoch, fn)
    datasets = Dict{String,Any}(
        "features" => epoch[1],
        "prios" => Float32.(epoch[2]),
    )
    HDF5.h5open(fn, "w") do file
        for (name, values) in datasets
            file[name, blosc = 1] = values
        end
    end
end;

In [None]:
# Load all features, priorities and user index maps once; both epochs below reuse them.
f = load_features();

In [None]:
# Number of entries in every user's list; `subsample` pads shorter lists with -1.
list_size = Int32(10240);

In [None]:
# Create the output directory for this medium (if needed) and keep its path.
outdir = mkpath(get_data_path("alphas/$medium/Ranking/data"))

In [None]:
# Build one epoch from the training users and write it to disk.
save_epoch(get_epoch(f, true, list_size), "$outdir/training.h5")

In [None]:
# Build one epoch from the held-out test users and write it to disk.
save_epoch(get_epoch(f, false, list_size), "$outdir/test.h5")