# Bayesian Personalized Ranking Trees
* Trains a LightGBM gradient-boosted tree model for pairwise classification

In [1]:
using LightGBM
import NBInclude: @nbinclude
@nbinclude("BPRBase.ipynb")

┌ Info: lib_lightgbm found in system dirs!
└ @ LightGBM /Users/kundan/.julia/packages/LightGBM/A7zVd/src/LightGBM.jl:28


## LightGBM Datasets

In [6]:
"""
    augment_dataset(ds, y, w)

Attach `y` as the `"label"` field and `w` as the `"weight"` field of the LightGBM
dataset `ds` (in that order), then return `ds` itself.
"""
function augment_dataset(ds, y, w)
    # Fields are set label first, weight second.
    for (field, values) in (("label", y), ("weight", w))
        LightGBM.LGBM_DatasetSetField(ds, field, values)
    end
    return ds
end

"""
    create_train_dataset(X, y, w, estimator)

Build a LightGBM dataset from the matrix `X`, using the parameters of `estimator`
serialised by `LightGBM.stringifyparams`, and attach labels `y` and weights `w` to it
through `augment_dataset`.

Returns the dataset produced by `augment_dataset`.
"""
function create_train_dataset(X, y, w, estimator)
    params = LightGBM.stringifyparams(estimator)
    # NOTE(review): the trailing `false` is presumably the row-major flag of
    # `LGBM_DatasetCreateFromMat` — confirm against the LightGBM.jl wrapper.
    dataset = LightGBM.LGBM_DatasetCreateFromMat(X, params, false)
    return augment_dataset(dataset, y, w)
end

"""
    create_test_dataset(X, y, w, estimator, train_ds)

Build a LightGBM dataset from the matrix `X`, passing `train_ds` to
`LGBM_DatasetCreateFromMat` alongside the parameters of `estimator`, and attach
labels `y` and weights `w` to it through `augment_dataset`.

Returns the dataset produced by `augment_dataset`.
"""
function create_test_dataset(X, y, w, estimator, train_ds)
    params = LightGBM.stringifyparams(estimator)
    # NOTE(review): `train_ds` is presumably the reference dataset whose binning is
    # reused, and `false` the row-major flag — confirm against the LightGBM.jl wrapper.
    dataset = LightGBM.LGBM_DatasetCreateFromMat(X, params, train_ds, false)
    return augment_dataset(dataset, y, w)
end;

In [7]:
"""
    get_pairwise_dataset(split, user_features; batch_size = 1024, epochs = 10000)

Draw `epochs` batches with `get_batch(split, user_features, batch_size)` and stack
them into one dataset.

From every batch the transposes of `batch[1][1]` and `batch[1][2]` are collected;
each collection is then concatenated vertically across batches.

# Keywords
- `batch_size`: forwarded to `get_batch` as its third argument.
- `epochs`: number of batches to draw.

# Returns
A tuple `(X, y, w)` where `X` is the stacked `batch[1][1]'` blocks, `y` is the stacked
`batch[1][2]'` blocks flattened with `vec`, and `w` is an array shaped like `y` with
every entry set to `1`.
"""
function get_pairwise_dataset(split, user_features; batch_size = 1024, epochs = 10000)
    Xs = []
    ys = []
    @showprogress for _ = 1:epochs
        batch = get_batch(split, user_features, batch_size)
        push!(Xs, batch[1][1]')
        push!(ys, batch[1][2]')
    end
    # Concatenate without splatting thousands of arrays into one varargs call.
    # `map(identity, parts)` narrows the `Any` element type so that Base's
    # single-allocation `reduce(vcat, ...)` method can apply; an empty list falls
    # back to `vcat()` so that `epochs = 0` still yields empty arrays.
    stack_rows(parts) = isempty(parts) ? vcat() : reduce(vcat, map(identity, parts))
    X = stack_rows(Xs)
    y = vec(stack_rows(ys))
    # Unit weights with the same shape and element type as `y`.
    w = fill!(similar(y), 1)
    return X, y, w
end;

## Train and Save Model

In [17]:
"""
    train_alpha(outdir)

Fit a LightGBM binary classifier on pairwise data and hand the result to
`write_params`.

Steps, in order: register `outdir` with `set_logging_outdir`, assemble the alpha
list, load the splits with `get_data`, sample a training and a test set with
`get_pairwise_dataset`, wrap them as LightGBM datasets, call `fit!`, and finally pass
a `Dict` holding the fitted estimator (`"model"`) and the alpha list (`"alphas"`) to
`write_params` together with `outdir`.

Returns whatever `write_params` returns.
"""
function train_alpha(outdir)
    set_logging_outdir(outdir)

    # Ensemble alphas come first, followed by the raw alpha lists defined elsewhere.
    ensemble_alphas =
        ["Explicit", "LinearExplicit", "LinearImplicit", "ErrorExplicit", "ErrorImplicit"]
    all_alphas = vcat(
        ensemble_alphas,
        explicit_raw_alphas,
        implicit_raw_alphas,
        nondirectional_raw_alphas,
    )
    train_split, test_split, user_features = get_data(all_alphas)

    hyperparams = (
        objective = "binary",
        num_iterations = 1000,
        learning_rate = 0.01,
        early_stopping_round = 10,
        feature_fraction = 0.8,
        bagging_fraction = 0.9,
        bagging_freq = 1,
        num_leaves = 1000,
        num_class = 1,
        metric = ["binary_logloss"],
    )
    model = LGBMClassification(; hyperparams...)

    # Sample the training set before the test set, as in the original call order.
    X_tr, y_tr, w_tr = get_pairwise_dataset(train_split, user_features)
    X_te, y_te, w_te = get_pairwise_dataset(test_split, user_features)

    # The test dataset is created with the training dataset passed alongside it.
    train_data = create_train_dataset(X_tr, y_tr, w_tr, model)
    test_data = create_test_dataset(X_te, y_te, w_te, model, train_data)

    fit!(model, train_data, test_data)
    return write_params(Dict("model" => model, "alphas" => all_alphas), outdir)
end;

train_alpha (generic function with 1 method)

In [18]:
# Run the full training pipeline, passing "BPRT" as the `outdir` argument.
train_alpha("BPRT")

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:02[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:20[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:13[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:06[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:01[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:04[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:04[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:03:51[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:41[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:03[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:16[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:04:42[39m
[32mProgress: 1

[LightGBM] [Info] Number of positive: 5120569, number of negative: 5119431
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 10240000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500056 -> initscore=0.000222
[LightGBM] [Info] Start training from score 0.000222
Iteration: 1, test_1's binary_logloss: 0.6846598164251062
Iteration: 2, test_1's binary_logloss: 0.6763334423501781
Iteration: 3, test_1's binary_logloss: 0.6681727846337339
Iteration: 4, test_1's binary_logloss: 0.6601767592873866
Iteration: 5, test_1's binary_logloss: 0.6523265689936284
Iteration: 6, test_1's binary_logloss: 0.6446261110761169
Iteration: 7, test_1's binary_logloss: 0.6370710651135377
Iteration: 8, test_1's binary_logloss: 0.6296580011678694
Iteration: 9, test_1's binary_logloss: 0.6223916720400602
Iteration: 10, test