# TrainingServingSkew
* Monitors whether the alphas generated for inference are identical
    to the alphas generated during training

In [None]:
# Notebook-level parameters, all blank in this file.
# NOTE(review): presumably overwritten by whatever launches the notebook — confirm.
# `username` and `source` are combined into the "source@username" key that
# get_training_split looks up in username_to_uid.csv.
username = ""
source = ""
# `medium` is the argument passed to check_raw_splits at the bottom of the notebook.
medium = ""

In [None]:
import NBInclude: @nbinclude
import Statistics: mean
@nbinclude("../TrainingAlphas/Alpha.ipynb");

In [None]:
"""
    get_fields()

Return the field names of `RatingsDataset`, in declaration order, leaving out
`:alpha`, `:metric` and `:userid`.
"""
function get_fields()
    excluded = (:alpha, :metric, :userid)
    return filter(name -> !(name in excluded), collect(fieldnames(RatingsDataset)))
end;

In [None]:
"""
    get_rec_split(medium)

Load the `"rec_training"` raw split for `medium`, restricted to the fields listed by
`get_fields()`.

# Arguments
- `medium`: forwarded to `get_raw_split` as the medium to load.
"""
function get_rec_split(medium)
    # Forward the caller's medium; it used to be ignored in favour of a hard-coded
    # "manga", so every other medium silently checked the manga split.
    return get_raw_split("rec_training", medium, get_fields(), nothing)
end;

In [None]:
"""
    get_training_split(medium)

Load the `"training"` raw split for `medium` and keep only the rows that belong to
the user identified by the notebook globals `source` and `username`.

The user's uid is looked up in `processed_data/username_to_uid.csv` under the key
`source@username`.

# Arguments
- `medium`: forwarded to `get_raw_split` as the medium to load.

# Throws
- `ArgumentError` if the key is not present in the username-to-uid table.
"""
function get_training_split(medium)
    username_to_uid =
        DataFrame(CSV.File(get_data_path("processed_data/username_to_uid.csv")))
    key = string(source, "@", username)
    row = findfirst(==(key), username_to_uid.userid)
    # Fail with a readable message instead of the "invalid index: nothing" error that
    # indexing with a failed lookup would otherwise produce.
    if isnothing(row)
        throw(ArgumentError("no uid found for \"$key\" in username_to_uid.csv"))
    end
    uid = username_to_uid.uid[row]
    # Forward the caller's medium; it used to be ignored in favour of a hard-coded
    # "manga".
    df = get_raw_split("training", medium, vcat([:userid], get_fields()), nothing)
    return filter(df, df.userid .== uid)
end;

In [None]:
"""
    check_raw_splits(medium)

Verify that the serving-side split (`get_rec_split`) and the user's rows of the
training split (`get_training_split`) agree on every field in `get_fields()`.

# Arguments
- `medium`: forwarded to both split loaders.

# Throws
- `AssertionError` naming the first field whose lengths or values differ.
"""
function check_raw_splits(medium)
    serving = get_rec_split(medium)
    training = get_training_split(medium)
    for field in get_fields()
        served = getfield(serving, field)
        trained = getfield(training, field)
        # Compare lengths first: broadcasting `.==` would silently stretch a
        # length-1 column and raise an unrelated DimensionMismatch otherwise.
        if length(served) != length(trained)
            throw(
                AssertionError(
                    "field $field: serving split has $(length(served)) entries, " *
                    "training split has $(length(trained))",
                ),
            )
        end
        if !all(served .== trained)
            throw(
                AssertionError(
                    "field $field differs between the serving and training splits",
                ),
            )
        end
    end
end;

In [None]:
"""
    read_training_alpha(alpha, split, medium; content = nothing)

Read the stored alpha `alpha` for `split` and `medium` via `read_alpha`, and keep only
the rows whose `user` equals the recommendee's uid.

# Arguments
- `alpha::String`: name of the alpha to read.
- `split::String`: split name forwarded to `read_alpha`.
- `medium::String`: medium forwarded to `read_alpha`.

# Keywords
- `content`: content type forwarded to `read_alpha`. When `nothing`, it is inferred
  from the alpha name: `"implicit"` if the lowercased name contains that word,
  otherwise `"explicit"` if it contains that word.

# Throws
- `AssertionError` (carrying the alpha name) when `content` is `nothing` and cannot be
  inferred from the name.
"""
function read_training_alpha(
    alpha::String,
    split::String,
    medium::String;
    content = nothing,
)
    # NOTE(review): the recommendee username is parsed as an integer and shifted by
    # one — presumably a 0-based to 1-based conversion; confirm.
    uid = 1 + parse(Int, get_recommendee_username())
    if content === nothing
        lowered = lowercase(alpha)
        # "implicit" is tried first, so it wins if both words appear in the name.
        for candidate in ("implicit", "explicit")
            if occursin(candidate, lowered)
                content = candidate
                break
            end
        end
        @assert !isnothing(content) alpha
    end
    # NOTE(review): `task` is neither a parameter nor a local, so it is looked up as a
    # global — confirm it is defined by the included Alpha notebook.
    alpha_df = read_alpha(alpha, split, task, content, medium)
    return filter(alpha_df, alpha_df.user .== uid)
end;

In [None]:
"""
    get_errors(alpha, split, medium, content, fn)

Load the serving-side and training-side values of `alpha` and return
`fn(training, serving)`.

# Arguments
- `alpha`, `split`, `medium`: identify the alpha to compare; the serving side is
  always read with the split `"all"`.
- `content`: forwarded to `read_training_alpha` as its `content` keyword.
- `fn`: two-argument function `(training, serving)` that computes the errors.

# Returns
The value of `fn(training, serving)`, or the scalar `0` when the training side has no
items (a scalar still works with the `mean` and `maximum` calls made on the result).
"""
function get_errors(alpha, split, medium, content, fn)
    serving = read_recommendee_alpha(alpha, "all", medium)
    training = read_training_alpha(alpha, split, medium; content = content)
    if isempty(training.item)
        return 0
    end
    # Return what `fn` computes. Its result used to be discarded and the absolute
    # error returned unconditionally, so every error function behaved the same.
    return fn(training, serving)
end

"""
    average_errors(alpha, split, medium; content = nothing)

Call `get_errors` with an element-wise absolute-difference function: the training
ratings minus the serving ratings indexed by the training items.

Despite the name, no averaging is done in this function.
"""
function average_errors(alpha, split, medium; content = nothing)
    function absolute_gap(training, serving)
        served = serving.rating[training.item]
        return abs.(training.rating .- served)
    end
    return get_errors(alpha, split, medium, content, absolute_gap)
end

"""
    relative_errors(alpha, split, medium; content = nothing)

Call `get_errors` with an element-wise relative-deviation function:
`abs(|training| / (|serving| + eps) - 1)`, where the serving ratings are indexed by
the training items.
"""
function relative_errors(alpha, split, medium; content = nothing)
    function ratio_gap(training, serving)
        observed = abs.(training.rating)
        # eps(Float64) keeps the denominator non-zero when a serving rating is 0.
        expected = abs.(serving.rating[training.item]) .+ eps(Float64)
        return abs.(observed ./ expected .- 1)
    end
    return get_errors(alpha, split, medium, content, ratio_gap)
end;

In [None]:
"""
    maximum_error(alpha, split, medium; content = nothing)

Return the maximum of the values produced by `average_errors` for the same arguments.
"""
function maximum_error(alpha, split, medium; content = nothing)
    errors = average_errors(alpha, split, medium; content = content)
    return maximum(errors)
end

"""
    average_abs_error(alpha, split, medium; content = nothing)

Return the mean of the values produced by `average_errors` for the same arguments.
"""
function average_abs_error(alpha, split, medium; content = nothing)
    errors = average_errors(alpha, split, medium; content = content)
    return mean(errors)
end

"""
    average_rel_error(alpha, split, medium; content = nothing)

Return the mean of the values produced by `relative_errors` for the same arguments.
"""
function average_rel_error(alpha, split, medium; content = nothing)
    errors = relative_errors(alpha, split, medium; content = content)
    return mean(errors)
end;

In [None]:
"""
    display_errors(alphas, split, medium; content = nothing)

Log, via `@info`, a header line followed by one line per alpha containing its average
absolute error, average relative error and maximum error.

# Arguments
- `alphas`: collection of alpha names; nothing is logged when it is empty.
- `split`, `medium`: forwarded to the error functions.

# Keywords
- `content`: forwarded to the error functions.

# Returns
`nothing`.
"""
function display_errors(alphas, split, medium; content = nothing)
    isempty(alphas) && return
    name_width = maximum(length, alphas)
    number_width = 16
    pad_number(value) = rpad(value, number_width)

    titles = [
        rpad("Alpha", name_width),
        pad_number("Avg Abs Error"),
        pad_number("Avg Rel Error"),
        pad_number("Max Error"),
    ]
    @info join(titles, " ")

    # Evaluated in this order for every alpha, matching the column order above.
    metrics = (average_abs_error, average_rel_error, maximum_error)
    for name in alphas
        GC.gc()
        cells = [
            pad_number(metric(name, split, medium; content = content)) for
            metric in metrics
        ]
        @info join(vcat(rpad(name, name_width), cells), " ")
    end
end;

In [None]:
# Run the raw-split consistency check for the configured medium; it raises an error
# if any compared field differs between the serving and training splits.
check_raw_splits(medium)

In [None]:
# training_alphas = vcat(
#     ["$medium/$task/ExplicitUserItemBiases"],
#     ["$medium/$task/Transformer/v1/$content" for content in ["implicit", "explicit"]],
#     ["$medium/$task/BagOfWords/$content/v1" for content in ["implicit", "explicit"]],
# )
# test_alphas = ["$medium/$task/Linear$x" for x in ["Implicit", "Explicit"]];

In [None]:
# display_errors(training_alphas, "test", medium)
# println()
# display_errors(test_alphas, "test", medium)

In [None]:
# display_errors(training_alphas, "test", medium; content = "negative")
# println()
# display_errors(test_alphas, "test", medium; content = "negative")

In [None]:
# display_errors(training_alphas, "training", medium)