# TreeModelBase
* Uses LightGBM to fit a tree model

In [None]:
using LightGBM

import Flux: sigmoid
import SparseArrays: sparse
import Statistics: mean
import NBInclude: @nbinclude
@nbinclude("../Alpha.ipynb")
@nbinclude("EnsembleInputs.ipynb");

## LightGBM interface

In [None]:
"""
    augment_dataset(ds, y, w)

Attach targets and sample weights to a LightGBM dataset handle.

Writes `y` into the `"label"` field and `w` into the `"weight"` field of `ds` through
`LightGBM.LGBM_DatasetSetField` (label first, then weight) and returns the same `ds`.
"""
function augment_dataset(ds, y, w)
    for (field, values) in (("label", y), ("weight", w))
        LightGBM.LGBM_DatasetSetField(ds, field, values)
    end
    return ds
end

"""
    create_train_dataset(X, y, w, estimator)

Build a LightGBM dataset from the feature matrix `X` using the stringified parameters
of `estimator`, then attach labels `y` and weights `w` with `augment_dataset`.

Returns the dataset handle produced by `LightGBM.LGBM_DatasetCreateFromMat`.
"""
function create_train_dataset(X, y, w, estimator)
    params = LightGBM.stringifyparams(estimator)
    # NOTE(review): the trailing `false` is presumably the row-major flag of the
    # LightGBM.jl wrapper -- confirm against the installed LightGBM.jl version.
    ds = LightGBM.LGBM_DatasetCreateFromMat(X, params, false)
    return augment_dataset(ds, y, w)
end

"""
    create_test_dataset(X, y, w, estimator, train_ds)

Build a LightGBM dataset from `X` that is created with `train_ds` passed as an extra
argument to `LightGBM.LGBM_DatasetCreateFromMat`, then attach labels `y` and weights `w`
with `augment_dataset`.

Returns the resulting dataset handle.
"""
function create_test_dataset(X, y, w, estimator, train_ds)
    params = LightGBM.stringifyparams(estimator)
    # NOTE(review): `train_ds` is presumably LightGBM's "reference" dataset and the
    # trailing `false` the row-major flag -- confirm against LightGBM.jl.
    ds = LightGBM.LGBM_DatasetCreateFromMat(X, params, train_ds, false)
    return augment_dataset(ds, y, w)
end;

In [None]:
"""
    create_estimator(λ, implicit, error)

Construct an untrained LightGBM estimator from the hyperparameter vector `λ`.

# Arguments
- `λ`: at least five real numbers. They are mapped to LightGBM settings as
  `learning_rate = 0.1 * exp(λ[1])`, `feature_fraction = sigmoid(λ[2])`,
  `bagging_fraction = sigmoid(λ[3])`, `num_leaves = round(1000 * exp(λ[4]))` and
  `min_data_in_leaf = round(100 * exp(λ[5]))`. Any real element type is accepted; the
  values are converted to `Float64` before use.
- `implicit`: whether the model is for implicit feedback.
- `error`: whether the model is an error model.

# Returns
An `LGBMRegression` when `error || !implicit`; otherwise an `LGBMClassification` with a
binary objective, one class, and the `binary_logloss` and `auc` metrics.
"""
function create_estimator(λ::AbstractVector{<:Real}, implicit::Bool, error::Bool)
    # Normalise the element type so Float32 (or integer) vectors behave exactly like the
    # Float64 vectors this function originally required.
    params = Float64.(λ)

    # Settings shared by both estimator kinds.
    shared = (
        num_iterations = 100,
        early_stopping_round = 10,
        learning_rate = 0.1 * exp(params[1]),
        feature_fraction = sigmoid(params[2]),
        bagging_fraction = sigmoid(params[3]),
        bagging_freq = 1,
        num_leaves = round(Int, 1000 * exp(params[4])),
        min_data_in_leaf = round(Int, 100 * exp(params[5])),
    )

    if error || !implicit
        return LGBMRegression(; shared...)
    end
    return LGBMClassification(;
        objective = "binary",
        num_class = 1,
        metric = ["binary_logloss", "auc"],
        shared...,
    )
end;

## Data

In [None]:
"""
    get_features(alphas, split, task, content)

Assemble the feature matrix for one `(split, task, content)` combination.

Reads the `rating` field of every alpha named in `alphas` via `read_alpha` and
horizontally concatenates them, one column per alpha, in the order given.
"""
function get_features(alphas::Vector{String}, split::String, task::String, content::String)
    @info "getting features for $split, $task, $content"
    columns = map(alphas) do name
        read_alpha(name, split, task, content).rating
    end
    return reduce(hcat, columns)
end

"""
    get_data(split, feature_alphas, target_alphas, task, content, error_model)

Build train/test features, targets and weights for a single `content` type.

# Arguments
- `split`, `task`, `content`: forwarded to `get_split`, `read_alpha` and `get_weights`.
- `feature_alphas`: alphas whose ratings become the feature columns.
- `target_alphas`: alphas whose combined rating is subtracted from the observed rating
  to form a residual target. Must be empty for non-error models on
  `"implicit"`, `"ptw"` or `"negative"` content.
- `error_model`: when `true`, the target is the absolute value of the residual.

# Returns
`(X_train, X_test, y_train, y_test, w_train, w_test)`. Rows whose `user` is at most
`0.9 * num_users()` go to the training part, the remaining rows to the test part.

# Throws
`AssertionError` for an unrecognised `content`, or when `target_alphas` is non-empty in
the non-error implicit/ptw/negative case.
"""
function get_data(
    split::String,
    feature_alphas::Vector{String},
    target_alphas::Vector{String},
    task::String,
    content::String,
    error_model::Bool,
)
    X = get_features(feature_alphas, split, task, content)
    if content ∈ ["implicit", "ptw", "negative"]
        if error_model
            # NOTE(review): the "explicit" branch passes an extra `false` to
            # `read_alpha`; confirm whether this call should as well.
            y =
                get_split(split, task, content; fields = [:rating]).rating -
                read_alpha(target_alphas, split, task, content).rating
        else
            @assert length(target_alphas) == 0
            y = get_split(split, task, content; fields = [:rating]).rating
        end
        # Both sub-branches use constant weights. The error-model branch previously
        # omitted `task`, unlike every other `get_weights` call in this function.
        w = get_weights(split, task, content, "constant")
    elseif content == "explicit"
        y =
            get_split(split, task, content; fields = [:rating]).rating -
            read_alpha(target_alphas, split, task, content, false).rating
        w = get_weights(split, task, content, "inverse")
    else
        @assert false "unsupported content: $content"
    end
    if error_model
        # Error models are fit on the magnitude of the residual.
        y = abs.(y)
    end
    # Hold out by user so that one user's rows never straddle train and test.
    training_mask = get_split(split, task, content).user .<= num_users() * 0.9
    X_train, X_test = X[training_mask, :], X[.!training_mask, :]
    y_train, y_test = y[training_mask], y[.!training_mask]
    w_train, w_test = w[training_mask], w[.!training_mask]
    return X_train, X_test, y_train, y_test, w_train, w_test
end

"""
    get_data(split, feature_alphas, target_alphas, task, contents, error_model, estimator)

Build LightGBM train and test datasets covering several content types.

Calls the single-content `get_data` once per entry of `contents`, vertically
concatenates the per-content train/test features, targets and weights in that order,
and wraps them with `create_train_dataset` / `create_test_dataset`.

Returns `(train_ds, test_ds)`.
"""
function get_data(
    split::String,
    feature_alphas::Vector{String},
    target_alphas::Vector{String},
    task::String,
    contents::Vector{String},
    error_model::Bool,
    estimator,
)
    per_content = map(contents) do content
        get_data(split, feature_alphas, target_alphas, task, content, error_model)
    end
    # Each entry is (X_train, X_test, y_train, y_test, w_train, w_test);
    # `concat(k)` stacks the k-th component across all content types.
    concat(k) = reduce(vcat, part[k] for part in per_content)
    X_train = concat(1)
    X_test = concat(2)
    y_train = concat(3)
    y_test = concat(4)
    w_train = concat(5)
    w_test = concat(6)
    train_ds = create_train_dataset(X_train, y_train, w_train, estimator)
    test_ds = create_test_dataset(X_test, y_test, w_test, estimator, train_ds)
    return train_ds, test_ds
end;

## Predictions

In [None]:
"""
    memory_efficient_predict(estimator, features::Matrix{Float32})

Predict for every row of `features` in batches of 1024 rows.

Each batch is passed to `predict(estimator, batch)`, the result is converted to
`Float32` and written into the matching slice of the output. Progress is reported with
`@showprogress`.

Returns a `Vector{Float32}` with one entry per row of `features`.
"""
function memory_efficient_predict(estimator, features::Matrix{Float32})
    rows_per_batch = 1024
    total_rows = size(features, 1)
    # Pre-filled with NaN so any row that was never written would stand out.
    out = fill(NaN32, total_rows)
    n_batches = cld(total_rows, rows_per_batch)
    @showprogress for b = 1:n_batches
        lo = (b - 1) * rows_per_batch + 1
        hi = min(b * rows_per_batch, total_rows)
        out[lo:hi] .= convert.(Float32, predict(estimator, features[lo:hi, :]))
    end
    return out
end

"""
    memory_efficient_predict(estimator, alphas, split, task, content, splits_to_skip, raw_splits)

Predict for every row of a `(split, task, content)` dataset, reading the feature alphas
in chunks of 50 million rows to bound memory use.

# Arguments
- `estimator`: fitted model, forwarded to the matrix method of
  `memory_efficient_predict`.
- `alphas`: names of the alphas whose `rating` columns form the feature matrix.
- `split`, `task`, `content`: identify the dataset to predict on.
- `splits_to_skip`: splits for which no prediction is made; must contain `"training"`.
- `raw_splits`: when `true`, data is read with `get_raw_split` / `read_raw_alpha`,
  otherwise with `get_split` / `read_alpha`.

# Returns
A `Vector{Float32}` with one prediction per row of the split. It is all zeros when
`split` is listed in `splits_to_skip`.

# Throws
`AssertionError` if `"training"` is not in `splits_to_skip`.
"""
function memory_efficient_predict(
    estimator,
    alphas::Vector{String},
    split::String,
    task::String,
    content::String,
    splits_to_skip::Vector{String},
    raw_splits::Bool,
)
    @assert "training" in splits_to_skip
    @info "predicting $split $task $content"
    split_fn, alpha_fn =
        raw_splits ? (get_raw_split, read_raw_alpha) : (get_split, read_alpha)
    df = split_fn(split, task, content; fields = [:user, :item])

    N = length(df.user)
    preds = zeros(Float32, N)
    if split in splits_to_skip
        return preds
    end

    chunk_size = Int(5e7)
    n_chunks = cld(N, chunk_size)
    for i = 1:n_chunks
        @info "saving chunk $i out of $n_chunks ($N / $chunk_size)"
        # Inclusive upper bound: chunk i covers rows (i-1)*chunk_size+1 .. i*chunk_size.
        # The previous bound of `i * chunk_size - 1` skipped the last row of every full
        # chunk, leaving its prediction at zero.
        rows = (i-1)*chunk_size+1:min(i * chunk_size, N)
        # The alphas are re-read per chunk and sliced immediately, so only one chunk's
        # feature matrix is built at a time.
        features =
            reduce(hcat, [alpha_fn(x, split, task, content).rating[rows] for x in alphas])
        preds[rows] .= memory_efficient_predict(estimator, features)
    end
    return preds
end;

## Training

In [None]:
"""
    train_model(feature_alphas, target_alphas, task, contents, implicit,
                training_split, error_model, outdir; λ = nothing)

Fit a LightGBM tree model on `training_split` and write its parameters and predictions
to `outdir`.

# Arguments
- `feature_alphas`: alphas used as model features.
- `target_alphas`: alphas whose rating is subtracted from the target (see `get_data`).
- `task`, `contents`: identify the datasets to train on.
- `implicit`, `error_model`: forwarded to `create_estimator` to choose the estimator.
- `training_split`: `"test"` or `"validation"`. Predictions are skipped on
  `"training"`, and additionally on `"validation"` when training on `"test"`.
- `outdir`: destination passed to `set_logging_outdir`, `write_params`, `write_alpha`.

# Keywords
- `λ`: optional hyperparameter vector for `create_estimator`. Any real-valued vector
  is accepted and converted to `Vector{Float64}`. When `nothing`, defaults are used
  that `create_estimator` maps to learning rate 0.1, feature fraction 0.8, bagging
  fraction 0.9, 1000 leaves and 100 minimum rows per leaf.

# Throws
`AssertionError` if `training_split` is neither `"test"` nor `"validation"`. This is
raised before any training work is done.
"""
function train_model(
    feature_alphas::Vector{String},
    target_alphas::Vector{String},
    task::String,
    contents::Vector{String},
    implicit::Bool,
    training_split::String,
    error_model::Bool,
    outdir::String;
    λ::Union{AbstractVector{<:Real},Nothing} = nothing,
)
    set_logging_outdir(outdir)

    # Validate the split up front so a bad argument fails before the expensive fit
    # rather than after the model has been trained and saved.
    splits_to_skip = if training_split == "test"
        ["training", "validation"]
    elseif training_split == "validation"
        ["training"]
    else
        @assert false "unsupported training_split: $training_split"
    end

    # create a lightgbm tree model
    inverse_sigmoid(p) = -log(1 / p - 1)
    # `create_estimator` works on Float64 hyperparameters, so convert whatever was
    # supplied. The keyword used to be typed Vector{Float32}, which it rejected.
    hyperparams = if isnothing(λ)
        Float64[0, inverse_sigmoid(0.8), inverse_sigmoid(0.9), 0, 0]
    else
        collect(Float64, λ)
    end
    estimator = create_estimator(hyperparams, implicit, error_model)

    # get training data
    train_ds, test_ds = get_data(
        training_split,
        feature_alphas,
        target_alphas,
        task,
        contents,
        error_model,
        estimator,
    )

    # train model
    fit!(estimator, train_ds, test_ds)

    # save model
    @info "Saving model..."
    write_params(Dict("model" => estimator, "alphas" => feature_alphas), outdir)

    # Prediction callback handed to `write_alpha`; it closes over the fitted estimator.
    function model(split, task, content; raw_splits)
        memory_efficient_predict(
            estimator,
            feature_alphas,
            split,
            task,
            content,
            splits_to_skip,
            raw_splits,
        )
    end
    write_alpha(model, outdir; task = task, by_split = true, log = false)
end;

In [None]:
# we only need to tune once every now and then
# function nlopt_loss(λ)
#     implicit = false
#     linear_alphas = ["LinearExplicit", "LinearImplicit"]
#     all_features = [
#         explicit_raw_alphas
#         implicit_raw_alphas
#         nondirectional_raw_alphas
#         linear_alphas
#     ]
#     estimator = create_estimator(λ)
#     train_ds, test_ds = get_data(
#         ["validation"],
#         all_features,
#         ["LinearExplicit"],
#         implicit,
#         false,
#         estimator,
#     )
#     results = fit!(estimator, train_ds, test_ds, verbosity = -1)
#     loss = results["metrics"]["test_1"]["l2"][end]
#     @info loss, λ
#     loss
# end

# function nlopt_loss_no_lr(λ, grad)
#     # don't tune the learning rate
#     nlopt_loss([[0.0]; λ])
# end

# import NLopt
# opt = NLopt.Opt(:LN_NELDERMEAD, 4)
# opt.initial_step = 1
# opt.maxeval = 25
# opt.min_objective = nlopt_loss_no_lr
# minf, λ, ret = NLopt.optimize(opt, zeros(4))
# numevals = opt.numevals;