# Bayesian Personalized Ranking Trees
* Creates a model for pairwise classification

In [1]:
using LightGBM
import NBInclude: @nbinclude
@nbinclude("BPRBase.ipynb");

┌ Info: lib_lightgbm not found in system dirs, trying fallback
└ @ LightGBM /home/kundan/.julia/packages/LightGBM/A7zVd/src/LightGBM.jl:25


## LightGBM Datasets

In [2]:
# TODO move to a shared LGBM package

"""
    augment_dataset(ds, y, w)

Set the `"label"` field of the LightGBM dataset `ds` to `y` and its `"weight"` field to
`w` (in that order), then return `ds`.
"""
function augment_dataset(ds, y, w)
    for (field, values) in (("label", y), ("weight", w))
        LightGBM.LGBM_DatasetSetField(ds, field, values)
    end
    return ds
end

"""
    create_train_dataset(X, y, w, estimator)

Create a LightGBM dataset from `X` using the stringified parameters of `estimator`,
attach `y` and `w` to it with `augment_dataset`, and return the resulting dataset.
"""
function create_train_dataset(X, y, w, estimator)
    params = LightGBM.stringifyparams(estimator)
    # NOTE(review): the trailing `false` is presumably the row-major flag of the
    # LightGBM.jl wrapper — confirm against its signature.
    dataset = LightGBM.LGBM_DatasetCreateFromMat(X, params, false)
    return augment_dataset(dataset, y, w)
end

"""
    create_test_dataset(X, y, w, estimator, train_ds)

Create a LightGBM dataset from `X` using the stringified parameters of `estimator` and
the existing dataset `train_ds`, attach `y` and `w` to it with `augment_dataset`, and
return the resulting dataset.
"""
function create_test_dataset(X, y, w, estimator, train_ds)
    params = LightGBM.stringifyparams(estimator)
    # NOTE(review): `train_ds` is presumably used by LightGBM as the reference dataset
    # and the trailing `false` as the row-major flag — confirm against the wrapper.
    dataset = LightGBM.LGBM_DatasetCreateFromMat(X, params, train_ds, false)
    return augment_dataset(dataset, y, w)
end;

In [3]:
"""
    get_pairwise_dataset(split, user_features, training; batch_size = 1024, epochs = 10000)

Draw `epochs` batches from `split` with `get_batch` and stack them into one dataset.

For every batch, the first and second entries of `batch[1]` are passed through `cpu`,
adjointed (`'`), and collected; the collected pieces are then concatenated vertically.

# Arguments
- `split`, `user_features`: forwarded unchanged to `get_batch`.
- `training`: forwarded to `get_batch` as its `training` keyword.

# Keywords
- `batch_size`: forwarded to `get_batch` as the requested batch size.
- `epochs`: number of batches to draw.

# Returns
A tuple `(X, y, w)`: `X` is the vertical concatenation of the first entries, `y` is the
vertical concatenation of the second entries flattened with `vec`, and `w` is an array
with the same shape and element type as `y`, filled with ones.

# Throws
- `ArgumentError` if `epochs < 1`, since there would be nothing to concatenate.
"""
function get_pairwise_dataset(
    split,
    user_features,
    training;
    batch_size = 1024,
    epochs = 10000,
)
    epochs >= 1 || throw(ArgumentError("epochs must be at least 1, got $epochs"))
    @info "getting pairwise dataset"
    Xs = []
    ys = []
    @showprogress for _ = 1:epochs
        batch = get_batch(split, user_features, batch_size, training = training)
        push!(Xs, cpu(batch[1][1])')
        push!(ys, cpu(batch[1][2])')
    end
    # Splatting thousands of arrays into `vcat(Xs...)` makes a huge varargs call.
    # `reduce(vcat, v)` concatenates in a single allocation when `v` has an array element
    # type, so narrow the `Vector{Any}` with `map(identity, ...)` first.
    X = reduce(vcat, map(identity, Xs))
    y = vec(reduce(vcat, map(identity, ys)))
    # Uniform sample weights, matching the shape and element type of the labels.
    w = fill!(similar(y), 1)
    return X, y, w
end;

## Train and Save Model

In [4]:
"""
    train_alpha(outdir)

Fit the pairwise LightGBM classifier and write it out.

Calls `set_logging_outdir(outdir)`, loads the training split, test split and user
features for the combined alpha list via `get_data`, builds pairwise datasets from both
splits, fits an `LGBMClassification` estimator on the training dataset with the test
dataset passed alongside it, and finally hands the fitted estimator and the alpha list
to `write_params` together with `outdir`.
"""
function train_alpha(outdir)
    set_logging_outdir(outdir)

    ensemble_alphas =
        ["Explicit", "LinearExplicit", "LinearImplicit", "ErrorExplicit", "ErrorImplicit"]
    all_alphas = vcat(
        ensemble_alphas,
        explicit_raw_alphas,
        implicit_raw_alphas,
        nondirectional_raw_alphas,
    )

    train_split, test_split, user_features = get_data(all_alphas)
    estimator = LGBMClassification(;
        objective = "binary",
        num_iterations = 1000,
        learning_rate = 0.01,
        early_stopping_round = 10,
        feature_fraction = 0.8,
        bagging_fraction = 0.9,
        bagging_freq = 1,
        num_leaves = 1000,
        num_class = 1,
        metric = ["binary_logloss", "auc"],
    )

    # NOTE(review): `training` is passed as `false` for the training split as well as
    # the test split — confirm this is intentional for `get_batch`.
    train_X, train_y, train_w = get_pairwise_dataset(train_split, user_features, false)
    test_X, test_y, test_w = get_pairwise_dataset(test_split, user_features, false)

    train_ds = create_train_dataset(train_X, train_y, train_w, estimator)
    test_ds = create_test_dataset(test_X, test_y, test_w, estimator, train_ds)
    fit!(estimator, train_ds, test_ds)

    saved = Dict("model" => estimator, "alphas" => all_alphas)
    return write_params(saved, outdir)
end;

In [5]:
# Run the full training pipeline with "BPRT" as the output directory.
train_alpha("BPRT")

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:22[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:11[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:59[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220713 20:57:09 getting user features
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:04[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220713 20:57:13 getting explicit_test alphas
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:34[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:03:47[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220713 21:02:37 getting implicit_test alphas
[32mProgress: 100%|████████████████████████████

[LightGBM] [Info] Number of positive: 5119871, number of negative: 5120129
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8670
[LightGBM] [Info] Number of data points in the train set: 10240000, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499987 -> initscore=-0.000050
[LightGBM] [Info] Start training from score -0.000050
Iteration: 1, test_1's binary_logloss: 0.6846590498137767, 
Iteration: 1, test_1's auc: 0.9911974771745864
Iteration: 2, test_1's binary_logloss: 0.6763301485666318, 
Iteration: 2, test_1's auc: 0.9915591863211923
Iteration: 3, test_1's binary_logloss: 0.6683953897776906, 
Iteration: 3, test_1's auc: 0.9914039840201644
Iteration: 4, test_1's binary_logloss: 0.6603865057664732, 
Iteration: 4, test_1's auc: 0.9915514259507832
Iteration: 5, test_1's binary_logloss: 0.6525328942397853, 
Iteration: 5, test_1's auc: 0.9916341115587084
Iteration: 6, test_1's binary_logloss: 0.6448213731081242, 
Iteratio

In [6]:
# Iteration: 1000, test_1's binary_logloss: 0.11233399764707044, 
# Iteration: 1000, test_1's auc: 0.992262893711801